diff --git a/CMakeLists.txt b/CMakeLists.txt index 26882d9f7e25c293fac867a11d01ae13c49e270c..3d4f3cde6a6a63bb1fd0548f1ebd68c7315f1c06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt) endif() if (BUILD_TRAINING_TOOLS) -add_subdirectory(training) +add_subdirectory(src/training) endif() get_target_property(tesseract_NAME libtesseract NAME) diff --git a/configure.ac b/configure.ac index c38ceed86b76ff27bd270847d6d61117c7eb0d6a..b05edf53bf3d28fdc3ba820822548440aa8fcbb7 100644 --- a/configure.ac +++ b/configure.ac @@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile]) AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile]) AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile]) AC_CONFIG_FILES([doc/Makefile]) -AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(training/Makefile)]) +AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(src/training/Makefile)]) AC_OUTPUT # Final message diff --git a/cppan.yml b/cppan.yml index 2ec4f49d2fbf2a7a32da5a388c512ceccddd00e1..98e07938b8d50528e795200ca44017fbdc0c761a 100644 --- a/cppan.yml +++ b/cppan.yml @@ -172,7 +172,7 @@ projects: tessopt: type: lib static_only: true - files: training/tessopt.* + files: src/training/tessopt.* include_directories: training dependencies: libtesseract @@ -180,104 +180,104 @@ projects: type: lib static_only: true files: - - training/commandlineflags.cpp - - training/commandlineflags.h - - training/commontraining.cpp - - training/commontraining.h + - src/training/commandlineflags.cpp + - src/training/commandlineflags.h + - src/training/commontraining.cpp + - src/training/commontraining.h include_directories: training dependencies: - tessopt ambiguous_words: - files: training/ambiguous_words.cpp + files: src/training/ambiguous_words.cpp dependencies: - libtesseract classifier_tester: - files: training/classifier_tester.cpp + files: src/training/classifier_tester.cpp dependencies: 
common_training combine_lang_model: - files: training/combine_lang_model.cpp + files: src/training/combine_lang_model.cpp dependencies: unicharset_training combine_tessdata: - files: training/combine_tessdata.cpp + files: src/training/combine_tessdata.cpp dependencies: libtesseract cntraining: - files: training/cntraining.cpp + files: src/training/cntraining.cpp dependencies: common_training dawg2wordlist: - files: training/dawg2wordlist.cpp + files: src/training/dawg2wordlist.cpp dependencies: libtesseract mftraining: files: - - training/mftraining.cpp - - training/mergenf.* + - src/training/mftraining.cpp + - src/training/mergenf.* dependencies: common_training shapeclustering: - files: training/shapeclustering.cpp + files: src/training/shapeclustering.cpp dependencies: common_training unicharset_extractor: - files: training/unicharset_extractor.cpp + files: src/training/unicharset_extractor.cpp dependencies: unicharset_training wordlist2dawg: - files: training/wordlist2dawg.cpp + files: src/training/wordlist2dawg.cpp dependencies: libtesseract unicharset_training: type: lib static_only: true files: - - training/fileio.* - - training/icuerrorcode.h - - training/lang_model_helpers.* - - training/lstmtester.* - - training/normstrngs.* - - training/unicharset_training_utils.* - - training/validat.* + - src/training/fileio.* + - src/training/icuerrorcode.h + - src/training/lang_model_helpers.* + - src/training/lstmtester.* + - src/training/normstrngs.* + - src/training/unicharset_training_utils.* + - src/training/validat.* include_directories: training dependencies: - common_training - pvt.cppan.demo.unicode.icu.i18n lstmeval: - files: training/lstmeval.cpp + files: src/training/lstmeval.cpp dependencies: unicharset_training lstmtraining: - files: training/lstmtraining.cpp + files: src/training/lstmtraining.cpp dependencies: unicharset_training set_unicharset_properties: - files: training/set_unicharset_properties.cpp + files: 
src/training/set_unicharset_properties.cpp dependencies: unicharset_training text2image: files: - - training/text2image.cpp - - training/boxchar.cpp - - training/boxchar.h - - training/degradeimage.cpp - - training/degradeimage.h - - training/ligature_table.cpp - - training/ligature_table.h - - training/normstrngs.cpp - - training/normstrngs.h - - training/pango_font_info.cpp - - training/pango_font_info.h - - training/stringrenderer.cpp - - training/stringrenderer.h - - training/tlog.cpp - - training/tlog.h - - training/util.h - - training/icuerrorcode.h + - src/training/text2image.cpp + - src/training/boxchar.cpp + - src/training/boxchar.h + - src/training/degradeimage.cpp + - src/training/degradeimage.h + - src/training/ligature_table.cpp + - src/training/ligature_table.h + - src/training/normstrngs.cpp + - src/training/normstrngs.h + - src/training/pango_font_info.cpp + - src/training/pango_font_info.h + - src/training/stringrenderer.cpp + - src/training/stringrenderer.h + - src/training/tlog.cpp + - src/training/tlog.h + - src/training/util.h + - src/training/icuerrorcode.h dependencies: - unicharset_training diff --git a/training/CMakeLists.txt b/src/training/CMakeLists.txt similarity index 100% rename from training/CMakeLists.txt rename to src/training/CMakeLists.txt diff --git a/training/Makefile.am b/src/training/Makefile.am similarity index 100% rename from training/Makefile.am rename to src/training/Makefile.am diff --git a/training/ambiguous_words.cpp b/src/training/ambiguous_words.cpp similarity index 100% rename from training/ambiguous_words.cpp rename to src/training/ambiguous_words.cpp diff --git a/training/boxchar.cpp b/src/training/boxchar.cpp similarity index 100% rename from training/boxchar.cpp rename to src/training/boxchar.cpp diff --git a/training/boxchar.h b/src/training/boxchar.h similarity index 100% rename from training/boxchar.h rename to src/training/boxchar.h diff --git a/training/classifier_tester.cpp 
b/src/training/classifier_tester.cpp similarity index 100% rename from training/classifier_tester.cpp rename to src/training/classifier_tester.cpp diff --git a/training/cntraining.cpp b/src/training/cntraining.cpp similarity index 100% rename from training/cntraining.cpp rename to src/training/cntraining.cpp diff --git a/training/combine_lang_model.cpp b/src/training/combine_lang_model.cpp similarity index 100% rename from training/combine_lang_model.cpp rename to src/training/combine_lang_model.cpp diff --git a/training/combine_tessdata.cpp b/src/training/combine_tessdata.cpp similarity index 100% rename from training/combine_tessdata.cpp rename to src/training/combine_tessdata.cpp diff --git a/training/commandlineflags.cpp b/src/training/commandlineflags.cpp similarity index 100% rename from training/commandlineflags.cpp rename to src/training/commandlineflags.cpp diff --git a/training/commandlineflags.h b/src/training/commandlineflags.h similarity index 100% rename from training/commandlineflags.h rename to src/training/commandlineflags.h diff --git a/training/commontraining.cpp b/src/training/commontraining.cpp similarity index 100% rename from training/commontraining.cpp rename to src/training/commontraining.cpp diff --git a/training/commontraining.h b/src/training/commontraining.h similarity index 100% rename from training/commontraining.h rename to src/training/commontraining.h diff --git a/training/dawg2wordlist.cpp b/src/training/dawg2wordlist.cpp similarity index 100% rename from training/dawg2wordlist.cpp rename to src/training/dawg2wordlist.cpp diff --git a/training/degradeimage.cpp b/src/training/degradeimage.cpp similarity index 97% rename from training/degradeimage.cpp rename to src/training/degradeimage.cpp index 76f6cf092a12ff967c524262f3c2adf1c89c1e47..d0b4b77d019a5f86902bc8773fc979e33a9a27d7 100644 --- a/training/degradeimage.cpp +++ b/src/training/degradeimage.cpp @@ -1,310 +1,310 @@ 
-/********************************************************************** - * File: degradeimage.cpp - * Description: Function to degrade an image (usually of text) as if it - * has been printed and then scanned. - * Authors: Ray Smith - * Created: Tue Nov 19 2013 - * - * (C) Copyright 2013, Google Inc. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - **********************************************************************/ - -#include "degradeimage.h" - -#include -#include "allheaders.h" // from leptonica -#include "genericvector.h" -#include "helpers.h" // For TRand. -#include "rect.h" - -namespace tesseract { - -// A randomized perspective distortion can be applied to synthetic input. -// The perspective distortion comes from leptonica, which uses 2 sets of 4 -// corners to determine the distortion. There are random values for each of -// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead -// defined in terms of a single shear value. This reduces the degrees of -// freedom enough to make the distortion more realistic than it would otherwise -// be if all 8 coordinates could move independently. -// One additional factor is used for the color of the pixels that don't exist -// in the source image. -// Name for each of the randomizing factors. 
-enum FactorNames { - FN_INCOLOR, - FN_Y0, - FN_Y1, - FN_Y2, - FN_Y3, - FN_X0, - FN_X1, - FN_SHEAR, - // x2 = x1 - shear - // x3 = x0 + shear - FN_NUM_FACTORS -}; - -// Rotation is +/- kRotationRange radians. -const float kRotationRange = 0.02f; -// Number of grey levels to shift by for each exposure step. -const int kExposureFactor = 16; -// Salt and pepper noise is +/- kSaltnPepper. -const int kSaltnPepper = 5; -// Min sum of width + height on which to operate the ramp. -const int kMinRampSize = 1000; - -// Degrade the pix as if by a print/copy/scan cycle with exposure > 0 -// corresponding to darkening on the copier and <0 lighter and 0 not copied. -// Exposures in [-2,2] are most useful, with -3 and 3 being extreme. -// If rotation is nullptr, rotation is skipped. If *rotation is non-zero, the -// pix is rotated by *rotation else it is randomly rotated and *rotation is -// modified. -// -// HOW IT WORKS: -// Most of the process is really dictated by the fact that the minimum -// available convolution is 3X3, which is too big really to simulate a -// good quality print/scan process. (2X2 would be better.) -// 1 pixel wide inputs are heavily smeared by the 3X3 convolution, making the -// images generally biased to being too light, so most of the work is to make -// them darker. 3 levels of thickening/darkening are achieved with 2 dilations, -// (using a greyscale erosion) one heavy (by being before convolution) and one -// light (after convolution). -// With no dilation, after covolution, the images are so light that a heavy -// constant offset is required to make the 0 image look reasonable. A simple -// constant offset multiple of exposure to undo this value is enough to achieve -// all the required lightening. This gives the advantage that exposure level 1 -// with a single dilation gives a good impression of the broken-yet-too-dark -// problem that is often seen in scans. 
-// A small random rotation gives some varying greyscale values on the edges, -// and some random salt and pepper noise on top helps to realistically jaggy-up -// the edges. -// Finally a greyscale ramp provides a continuum of effects between exposure -// levels. -Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer, - float* rotation) { - Pix* pix = pixConvertTo8(input, false); - pixDestroy(&input); - input = pix; - int width = pixGetWidth(input); - int height = pixGetHeight(input); - if (exposure >= 2) { - // An erosion simulates the spreading darkening of a dark copy. - // This is backwards to binary morphology, - // see http://www.leptonica.com/grayscale-morphology.html - pix = input; - input = pixErodeGray(pix, 3, 3); - pixDestroy(&pix); - } - // A convolution is essential to any mode as no scanner produces an - // image as sharp as the electronic image. - pix = pixBlockconv(input, 1, 1); - pixDestroy(&input); - // A small random rotation helps to make the edges jaggy in a realistic way. - if (rotation != nullptr) { - float radians_clockwise = 0.0f; - if (*rotation) { - radians_clockwise = *rotation; - } else if (randomizer != nullptr) { - radians_clockwise = randomizer->SignedRand(kRotationRange); - } - - input = pixRotate(pix, radians_clockwise, - L_ROTATE_AREA_MAP, L_BRING_IN_WHITE, - 0, 0); - // Rotate the boxes to match. - *rotation = radians_clockwise; - pixDestroy(&pix); - } else { - input = pix; - } - - if (exposure >= 3 || exposure == 1) { - // Erosion after the convolution is not as heavy as before, so it is - // good for level 1 and in addition as a level 3. - // This is backwards to binary morphology, - // see http://www.leptonica.com/grayscale-morphology.html - pix = input; - input = pixErodeGray(pix, 3, 3); - pixDestroy(&pix); - } - // The convolution really needed to be 2x2 to be realistic enough, but - // we only have 3x3, so we have to bias the image darker or lose thin - // strokes. 
- int erosion_offset = 0; - // For light and 0 exposure, there is no dilation, so compensate for the - // convolution with a big darkening bias which is undone for lighter - // exposures. - if (exposure <= 0) - erosion_offset = -3 * kExposureFactor; - // Add in a general offset of the greyscales for the exposure level so - // a threshold of 128 gives a reasonable binary result. - erosion_offset -= exposure * kExposureFactor; - // Add a gradual fade over the page and a small amount of salt and pepper - // noise to simulate noise in the sensor/paper fibres and varying - // illumination. - l_uint32* data = pixGetData(input); - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - int pixel = GET_DATA_BYTE(data, x); - if (randomizer != nullptr) - pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper; - if (height + width > kMinRampSize) - pixel -= (2*x + y) * 32 / (height + width); - pixel += erosion_offset; - if (pixel < 0) - pixel = 0; - if (pixel > 255) - pixel = 255; - SET_DATA_BYTE(data, x, pixel); - } - data += input->wpl; - } - return input; -} - -// Creates and returns a Pix distorted by various means according to the bool -// flags. If boxes is not nullptr, the boxes are resized/positioned according to -// any spatial distortion and also by the integer reduction factor box_scale -// so they will match what the network will output. -// Returns nullptr on error. The returned Pix must be pixDestroyed. -Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, - bool white_noise, bool smooth_noise, bool blur, - int box_reduction, TRand* randomizer, - GenericVector* boxes) { - Pix* distorted = pixCopy(nullptr, const_cast(pix)); - // Things to do to synthetic training data. - if (invert && randomizer->SignedRand(1.0) < 0) - pixInvert(distorted, distorted); - if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) { - // TODO(rays) Cook noise in a more thread-safe manner than rand(). 
- // Attempt to make the sequences reproducible. - srand(randomizer->IntRand()); - Pix* pixn = pixAddGaussianNoise(distorted, 8.0); - pixDestroy(&distorted); - if (smooth_noise) { - distorted = pixBlockconv(pixn, 1, 1); - pixDestroy(&pixn); - } else { - distorted = pixn; - } - } - if (blur && randomizer->SignedRand(1.0) > 0.0) { - Pix* blurred = pixBlockconv(distorted, 1, 1); - pixDestroy(&distorted); - distorted = blurred; - } - if (perspective) - GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes); - if (boxes != nullptr) { - for (int b = 0; b < boxes->size(); ++b) { - (*boxes)[b].scale(1.0f / box_reduction); - if ((*boxes)[b].width() <= 0) - (*boxes)[b].set_right((*boxes)[b].left() + 1); - } - } - return distorted; -} - -// Distorts anything that has a non-null pointer with the same pseudo-random -// perspective distortion. Width and height only need to be set if there -// is no pix. If there is a pix, then they will be taken from there. -void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, - Pix** pix, GenericVector* boxes) { - if (pix != nullptr && *pix != nullptr) { - width = pixGetWidth(*pix); - height = pixGetHeight(*pix); - } - float* im_coeffs = nullptr; - float* box_coeffs = nullptr; - l_int32 incolor = - ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs); - if (pix != nullptr && *pix != nullptr) { - // Transform the image. - Pix* transformed = pixProjective(*pix, im_coeffs, incolor); - if (transformed == nullptr) { - tprintf("Projective transformation failed!!\n"); - return; - } - pixDestroy(pix); - *pix = transformed; - } - if (boxes != nullptr) { - // Transform the boxes. 
- for (int b = 0; b < boxes->size(); ++b) { - int x1, y1, x2, y2; - const TBOX& box = (*boxes)[b]; - projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1, - &y1); - projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(), - &x2, &y2); - TBOX new_box1(x1, height - y2, x2, height - y1); - projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(), - &x1, &y1); - projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2, - &y2); - TBOX new_box2(x1, height - y1, x2, height - y2); - (*boxes)[b] = new_box1.bounding_union(new_box2); - } - } - free(im_coeffs); - free(box_coeffs); -} - -// Computes the coefficients of a randomized projective transformation. -// The image transform requires backward transformation coefficient, and the -// box transform the forward coefficients. -// Returns the incolor arg to pixProjective. -int ProjectiveCoeffs(int width, int height, TRand* randomizer, - float** im_coeffs, float** box_coeffs) { - // Setup "from" points. - Pta* src_pts = ptaCreate(4); - ptaAddPt(src_pts, 0.0f, 0.0f); - ptaAddPt(src_pts, width, 0.0f); - ptaAddPt(src_pts, width, height); - ptaAddPt(src_pts, 0.0f, height); - // Extract factors from pseudo-random sequence. - float factors[FN_NUM_FACTORS]; - float shear = 0.0f; // Shear is signed. - for (int i = 0; i < FN_NUM_FACTORS; ++i) { - // Everything is squared to make wild values rarer. - if (i == FN_SHEAR) { - // Shear is signed. - shear = randomizer->SignedRand(0.5 / 3.0); - shear = shear >= 0.0 ? shear * shear : -shear * shear; - // Keep the sheared points within the original rectangle. - if (shear < -factors[FN_X0]) shear = -factors[FN_X0]; - if (shear > factors[FN_X1]) shear = factors[FN_X1]; - factors[i] = shear; - } else if (i != FN_INCOLOR) { - factors[i] = fabs(randomizer->SignedRand(1.0)); - if (i <= FN_Y3) - factors[i] *= 5.0 / 8.0; - else - factors[i] *= 0.5; - factors[i] *= factors[i]; - } - } - // Setup "to" points. 
- Pta* dest_pts = ptaCreate(4); - ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height); - ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height); - ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width, - (1 - factors[FN_Y2]) * height); - ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width, - (1 - factors[FN_Y3]) * height); - getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs); - getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs); - ptaDestroy(&src_pts); - ptaDestroy(&dest_pts); - return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK; -} - -} // namespace tesseract +/********************************************************************** + * File: degradeimage.cpp + * Description: Function to degrade an image (usually of text) as if it + * has been printed and then scanned. + * Authors: Ray Smith + * Created: Tue Nov 19 2013 + * + * (C) Copyright 2013, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#include "degradeimage.h" + +#include +#include "allheaders.h" // from leptonica +#include "genericvector.h" +#include "helpers.h" // For TRand. +#include "rect.h" + +namespace tesseract { + +// A randomized perspective distortion can be applied to synthetic input. +// The perspective distortion comes from leptonica, which uses 2 sets of 4 +// corners to determine the distortion. 
There are random values for each of +// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead +// defined in terms of a single shear value. This reduces the degrees of +// freedom enough to make the distortion more realistic than it would otherwise +// be if all 8 coordinates could move independently. +// One additional factor is used for the color of the pixels that don't exist +// in the source image. +// Name for each of the randomizing factors. +enum FactorNames { + FN_INCOLOR, + FN_Y0, + FN_Y1, + FN_Y2, + FN_Y3, + FN_X0, + FN_X1, + FN_SHEAR, + // x2 = x1 - shear + // x3 = x0 + shear + FN_NUM_FACTORS +}; + +// Rotation is +/- kRotationRange radians. +const float kRotationRange = 0.02f; +// Number of grey levels to shift by for each exposure step. +const int kExposureFactor = 16; +// Salt and pepper noise is +/- kSaltnPepper. +const int kSaltnPepper = 5; +// Min sum of width + height on which to operate the ramp. +const int kMinRampSize = 1000; + +// Degrade the pix as if by a print/copy/scan cycle with exposure > 0 +// corresponding to darkening on the copier and <0 lighter and 0 not copied. +// Exposures in [-2,2] are most useful, with -3 and 3 being extreme. +// If rotation is nullptr, rotation is skipped. If *rotation is non-zero, the +// pix is rotated by *rotation else it is randomly rotated and *rotation is +// modified. +// +// HOW IT WORKS: +// Most of the process is really dictated by the fact that the minimum +// available convolution is 3X3, which is too big really to simulate a +// good quality print/scan process. (2X2 would be better.) +// 1 pixel wide inputs are heavily smeared by the 3X3 convolution, making the +// images generally biased to being too light, so most of the work is to make +// them darker. 3 levels of thickening/darkening are achieved with 2 dilations, +// (using a greyscale erosion) one heavy (by being before convolution) and one +// light (after convolution). 
+// With no dilation, after covolution, the images are so light that a heavy +// constant offset is required to make the 0 image look reasonable. A simple +// constant offset multiple of exposure to undo this value is enough to achieve +// all the required lightening. This gives the advantage that exposure level 1 +// with a single dilation gives a good impression of the broken-yet-too-dark +// problem that is often seen in scans. +// A small random rotation gives some varying greyscale values on the edges, +// and some random salt and pepper noise on top helps to realistically jaggy-up +// the edges. +// Finally a greyscale ramp provides a continuum of effects between exposure +// levels. +Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer, + float* rotation) { + Pix* pix = pixConvertTo8(input, false); + pixDestroy(&input); + input = pix; + int width = pixGetWidth(input); + int height = pixGetHeight(input); + if (exposure >= 2) { + // An erosion simulates the spreading darkening of a dark copy. + // This is backwards to binary morphology, + // see http://www.leptonica.com/grayscale-morphology.html + pix = input; + input = pixErodeGray(pix, 3, 3); + pixDestroy(&pix); + } + // A convolution is essential to any mode as no scanner produces an + // image as sharp as the electronic image. + pix = pixBlockconv(input, 1, 1); + pixDestroy(&input); + // A small random rotation helps to make the edges jaggy in a realistic way. + if (rotation != nullptr) { + float radians_clockwise = 0.0f; + if (*rotation) { + radians_clockwise = *rotation; + } else if (randomizer != nullptr) { + radians_clockwise = randomizer->SignedRand(kRotationRange); + } + + input = pixRotate(pix, radians_clockwise, + L_ROTATE_AREA_MAP, L_BRING_IN_WHITE, + 0, 0); + // Rotate the boxes to match. 
+ *rotation = radians_clockwise; + pixDestroy(&pix); + } else { + input = pix; + } + + if (exposure >= 3 || exposure == 1) { + // Erosion after the convolution is not as heavy as before, so it is + // good for level 1 and in addition as a level 3. + // This is backwards to binary morphology, + // see http://www.leptonica.com/grayscale-morphology.html + pix = input; + input = pixErodeGray(pix, 3, 3); + pixDestroy(&pix); + } + // The convolution really needed to be 2x2 to be realistic enough, but + // we only have 3x3, so we have to bias the image darker or lose thin + // strokes. + int erosion_offset = 0; + // For light and 0 exposure, there is no dilation, so compensate for the + // convolution with a big darkening bias which is undone for lighter + // exposures. + if (exposure <= 0) + erosion_offset = -3 * kExposureFactor; + // Add in a general offset of the greyscales for the exposure level so + // a threshold of 128 gives a reasonable binary result. + erosion_offset -= exposure * kExposureFactor; + // Add a gradual fade over the page and a small amount of salt and pepper + // noise to simulate noise in the sensor/paper fibres and varying + // illumination. + l_uint32* data = pixGetData(input); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int pixel = GET_DATA_BYTE(data, x); + if (randomizer != nullptr) + pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper; + if (height + width > kMinRampSize) + pixel -= (2*x + y) * 32 / (height + width); + pixel += erosion_offset; + if (pixel < 0) + pixel = 0; + if (pixel > 255) + pixel = 255; + SET_DATA_BYTE(data, x, pixel); + } + data += input->wpl; + } + return input; +} + +// Creates and returns a Pix distorted by various means according to the bool +// flags. If boxes is not nullptr, the boxes are resized/positioned according to +// any spatial distortion and also by the integer reduction factor box_scale +// so they will match what the network will output. 
+// Returns nullptr on error. The returned Pix must be pixDestroyed. +Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, + bool white_noise, bool smooth_noise, bool blur, + int box_reduction, TRand* randomizer, + GenericVector* boxes) { + Pix* distorted = pixCopy(nullptr, const_cast(pix)); + // Things to do to synthetic training data. + if (invert && randomizer->SignedRand(1.0) < 0) + pixInvert(distorted, distorted); + if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) { + // TODO(rays) Cook noise in a more thread-safe manner than rand(). + // Attempt to make the sequences reproducible. + srand(randomizer->IntRand()); + Pix* pixn = pixAddGaussianNoise(distorted, 8.0); + pixDestroy(&distorted); + if (smooth_noise) { + distorted = pixBlockconv(pixn, 1, 1); + pixDestroy(&pixn); + } else { + distorted = pixn; + } + } + if (blur && randomizer->SignedRand(1.0) > 0.0) { + Pix* blurred = pixBlockconv(distorted, 1, 1); + pixDestroy(&distorted); + distorted = blurred; + } + if (perspective) + GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes); + if (boxes != nullptr) { + for (int b = 0; b < boxes->size(); ++b) { + (*boxes)[b].scale(1.0f / box_reduction); + if ((*boxes)[b].width() <= 0) + (*boxes)[b].set_right((*boxes)[b].left() + 1); + } + } + return distorted; +} + +// Distorts anything that has a non-null pointer with the same pseudo-random +// perspective distortion. Width and height only need to be set if there +// is no pix. If there is a pix, then they will be taken from there. 
+void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, + Pix** pix, GenericVector* boxes) { + if (pix != nullptr && *pix != nullptr) { + width = pixGetWidth(*pix); + height = pixGetHeight(*pix); + } + float* im_coeffs = nullptr; + float* box_coeffs = nullptr; + l_int32 incolor = + ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs); + if (pix != nullptr && *pix != nullptr) { + // Transform the image. + Pix* transformed = pixProjective(*pix, im_coeffs, incolor); + if (transformed == nullptr) { + tprintf("Projective transformation failed!!\n"); + return; + } + pixDestroy(pix); + *pix = transformed; + } + if (boxes != nullptr) { + // Transform the boxes. + for (int b = 0; b < boxes->size(); ++b) { + int x1, y1, x2, y2; + const TBOX& box = (*boxes)[b]; + projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1, + &y1); + projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(), + &x2, &y2); + TBOX new_box1(x1, height - y2, x2, height - y1); + projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(), + &x1, &y1); + projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2, + &y2); + TBOX new_box2(x1, height - y1, x2, height - y2); + (*boxes)[b] = new_box1.bounding_union(new_box2); + } + } + free(im_coeffs); + free(box_coeffs); +} + +// Computes the coefficients of a randomized projective transformation. +// The image transform requires backward transformation coefficient, and the +// box transform the forward coefficients. +// Returns the incolor arg to pixProjective. +int ProjectiveCoeffs(int width, int height, TRand* randomizer, + float** im_coeffs, float** box_coeffs) { + // Setup "from" points. + Pta* src_pts = ptaCreate(4); + ptaAddPt(src_pts, 0.0f, 0.0f); + ptaAddPt(src_pts, width, 0.0f); + ptaAddPt(src_pts, width, height); + ptaAddPt(src_pts, 0.0f, height); + // Extract factors from pseudo-random sequence. 
+ float factors[FN_NUM_FACTORS]; + float shear = 0.0f; // Shear is signed. + for (int i = 0; i < FN_NUM_FACTORS; ++i) { + // Everything is squared to make wild values rarer. + if (i == FN_SHEAR) { + // Shear is signed. + shear = randomizer->SignedRand(0.5 / 3.0); + shear = shear >= 0.0 ? shear * shear : -shear * shear; + // Keep the sheared points within the original rectangle. + if (shear < -factors[FN_X0]) shear = -factors[FN_X0]; + if (shear > factors[FN_X1]) shear = factors[FN_X1]; + factors[i] = shear; + } else if (i != FN_INCOLOR) { + factors[i] = fabs(randomizer->SignedRand(1.0)); + if (i <= FN_Y3) + factors[i] *= 5.0 / 8.0; + else + factors[i] *= 0.5; + factors[i] *= factors[i]; + } + } + // Setup "to" points. + Pta* dest_pts = ptaCreate(4); + ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height); + ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height); + ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width, + (1 - factors[FN_Y2]) * height); + ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width, + (1 - factors[FN_Y3]) * height); + getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs); + getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs); + ptaDestroy(&src_pts); + ptaDestroy(&dest_pts); + return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK; +} + +} // namespace tesseract diff --git a/training/degradeimage.h b/src/training/degradeimage.h similarity index 98% rename from training/degradeimage.h rename to src/training/degradeimage.h index 85e35f0ad28edc628baab4d423b7f0c38476e3e3..98ef764f878a0d84aa8e4afa12f9b84152bb4020 100644 --- a/training/degradeimage.h +++ b/src/training/degradeimage.h @@ -1,61 +1,61 @@ -/********************************************************************** - * File: degradeimage.h - * Description: Function to degrade an image (usually of text) as if it - * has been printed and then scanned. 
- * Authors: Ray Smith - * Created: Tue Nov 19 2013 - * - * (C) Copyright 2013, Google Inc. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - **********************************************************************/ -#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_ -#define TESSERACT_TRAINING_DEGRADEIMAGE_H_ - -#include "allheaders.h" -#include "genericvector.h" -#include "helpers.h" // For TRand. -#include "rect.h" - -namespace tesseract { - -// Degrade the pix as if by a print/copy/scan cycle with exposure > 0 -// corresponding to darkening on the copier and <0 lighter and 0 not copied. -// If rotation is not nullptr, the clockwise rotation in radians is saved there. -// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.) -// The input image is destroyed and a different image returned. -struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer, - float* rotation); - -// Creates and returns a Pix distorted by various means according to the bool -// flags. If boxes is not nullptr, the boxes are resized/positioned according to -// any spatial distortion and also by the integer reduction factor box_scale -// so they will match what the network will output. -// Returns nullptr on error. The returned Pix must be pixDestroyed. 
-Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, - bool white_noise, bool smooth_noise, bool blur, - int box_reduction, TRand* randomizer, - GenericVector* boxes); -// Distorts anything that has a non-null pointer with the same pseudo-random -// perspective distortion. Width and height only need to be set if there -// is no pix. If there is a pix, then they will be taken from there. -void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, - Pix** pix, GenericVector* boxes); -// Computes the coefficients of a randomized projective transformation. -// The image transform requires backward transformation coefficient, and the -// box transform the forward coefficients. -// Returns the incolor arg to pixProjective. -int ProjectiveCoeffs(int width, int height, TRand* randomizer, - float** im_coeffs, float** box_coeffs); - -} // namespace tesseract - -#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_ +/********************************************************************** + * File: degradeimage.h + * Description: Function to degrade an image (usually of text) as if it + * has been printed and then scanned. + * Authors: Ray Smith + * Created: Tue Nov 19 2013 + * + * (C) Copyright 2013, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *
+ **********************************************************************/
+#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
+#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
+
+#include "allheaders.h"
+#include "genericvector.h"
+#include "helpers.h"  // For TRand.
+#include "rect.h"
+
+namespace tesseract {
+
+// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
+// corresponding to darkening on the copier and <0 lighter and 0 not copied.
+// If rotation is not nullptr, the clockwise rotation in radians is saved there.
+// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
+// The input image is destroyed and a different image returned.
+struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
+                         float* rotation);
+
+// Creates and returns a Pix distorted by various means according to the bool
+// flags. If boxes is not nullptr, the boxes are resized/positioned according to
+// any spatial distortion and also by the integer reduction factor
+// box_reduction so they will match what the network will output.
+// Returns nullptr on error. The returned Pix must be pixDestroyed.
+Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
+                         bool white_noise, bool smooth_noise, bool blur,
+                         int box_reduction, TRand* randomizer,
+                         GenericVector<TBOX>* boxes);
+// Distorts anything that has a non-null pointer with the same pseudo-random
+// perspective distortion. Width and height only need to be set if there
+// is no pix. If there is a pix, then they will be taken from there.
+void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
+                                   Pix** pix, GenericVector<TBOX>* boxes);
+// Computes the coefficients of a randomized projective transformation.
+// The image transform requires backward transformation coefficient, and the
+// box transform the forward coefficients.
+// Returns the incolor arg to pixProjective.
+int ProjectiveCoeffs(int width, int height, TRand* randomizer, + float** im_coeffs, float** box_coeffs); + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_ diff --git a/training/fileio.cpp b/src/training/fileio.cpp similarity index 100% rename from training/fileio.cpp rename to src/training/fileio.cpp diff --git a/training/fileio.h b/src/training/fileio.h similarity index 100% rename from training/fileio.h rename to src/training/fileio.h diff --git a/training/icuerrorcode.h b/src/training/icuerrorcode.h similarity index 97% rename from training/icuerrorcode.h rename to src/training/icuerrorcode.h index a606415f04f40f4ede37658413c0f99d4acfdb85..0a1b4ec5dd32157b4aaf34ff80d2b3c026b29c63 100644 --- a/training/icuerrorcode.h +++ b/src/training/icuerrorcode.h @@ -1,66 +1,66 @@ -/********************************************************************** - * File: icuerrorcode.h - * Description: Wrapper class for UErrorCode, with conversion operators for - * direct use in ICU C and C++ APIs. - * Author: Fredrik Roubert - * Created: Thu July 4 2013 - * - * Features: - * - The constructor initializes the internal UErrorCode to U_ZERO_ERROR, - * removing one common source of errors. - * - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking - * UErrorCode& (reference), via conversion operators. - * - Automatic checking for success when it goes out of scope. On failure, - * the destructor will log an error message and exit. - * - * Most of ICU will handle errors gracefully and provide sensible fallbacks. - * Using IcuErrorCode, it is therefore possible to write very compact code - * that does sensible things on failure and provides logging for debugging. - * - * Example: - * IcuErrorCode icuerrorcode; - * return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL; - * - * (C) Copyright 2013, Google Inc. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - **********************************************************************/ -#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_ -#define TESSERACT_CCUTIL_ICUERRORCODE_H_ - -#include "tprintf.h" -#include "unicode/errorcode.h" // From libicu - -namespace tesseract { - -class IcuErrorCode : public icu::ErrorCode { - public: - IcuErrorCode() {} - virtual ~IcuErrorCode() { - if (isFailure()) { - handleFailure(); - } - } - - protected: - virtual void handleFailure() const { - tprintf("ICU ERROR: %s", errorName()); - exit(errorCode); - } - - private: - // Disallow implicit copying of object. - IcuErrorCode(const IcuErrorCode&); - void operator=(const IcuErrorCode&); -}; - -} // namespace tesseract -#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_ +/********************************************************************** + * File: icuerrorcode.h + * Description: Wrapper class for UErrorCode, with conversion operators for + * direct use in ICU C and C++ APIs. + * Author: Fredrik Roubert + * Created: Thu July 4 2013 + * + * Features: + * - The constructor initializes the internal UErrorCode to U_ZERO_ERROR, + * removing one common source of errors. + * - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking + * UErrorCode& (reference), via conversion operators. + * - Automatic checking for success when it goes out of scope. On failure, + * the destructor will log an error message and exit. + * + * Most of ICU will handle errors gracefully and provide sensible fallbacks. 
+ * Using IcuErrorCode, it is therefore possible to write very compact code
+ * that does sensible things on failure and provides logging for debugging.
+ *
+ * Example:
+ * IcuErrorCode icuerrorcode;
+ * return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
+ *
+ * (C) Copyright 2013, Google Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
+#define TESSERACT_CCUTIL_ICUERRORCODE_H_
+
+#include <cstdlib>  // For exit.
+
+#include "tprintf.h"
+#include "unicode/errorcode.h"  // From libicu
+
+namespace tesseract {
+
+// icu::ErrorCode subclass that checks itself when it is destroyed: if the
+// object holds a failure at that point, the failure is logged and the
+// process exits.
+class IcuErrorCode : public icu::ErrorCode {
+ public:
+  IcuErrorCode() {}
+  // Runs the failure handler if an error is still pending.
+  virtual ~IcuErrorCode() {
+    if (isFailure()) {
+      handleFailure();
+    }
+  }
+
+ protected:
+  // Logs the ICU error name and terminates the process, using the error
+  // code as the exit status.
+  virtual void handleFailure() const {
+    // Newline-terminated so the message ends its own line before the exit.
+    tprintf("ICU ERROR: %s\n", errorName());
+    exit(errorCode);
+  }
+
+ private:
+  // Disallow implicit copying of object.
+  IcuErrorCode(const IcuErrorCode&);
+  void operator=(const IcuErrorCode&);
+};
+
+}  // namespace tesseract
+#endif  // TESSERACT_CCUTIL_ICUERRORCODE_H_
diff --git a/training/lang_model_helpers.cpp b/src/training/lang_model_helpers.cpp
similarity index 100%
rename from training/lang_model_helpers.cpp
rename to src/training/lang_model_helpers.cpp
diff --git a/training/lang_model_helpers.h b/src/training/lang_model_helpers.h
similarity index 100%
rename from training/lang_model_helpers.h
rename to src/training/lang_model_helpers.h
diff --git a/training/language-specific.sh b/src/training/language-specific.sh
old mode 100755
new mode 100644
similarity index 100%
rename from training/language-specific.sh
rename to src/training/language-specific.sh
diff --git a/training/ligature_table.cpp b/src/training/ligature_table.cpp
similarity index 100%
rename from training/ligature_table.cpp
rename to src/training/ligature_table.cpp
diff --git a/training/ligature_table.h b/src/training/ligature_table.h
similarity index 100%
rename from training/ligature_table.h
rename to src/training/ligature_table.h
diff --git a/training/lstmeval.cpp b/src/training/lstmeval.cpp
similarity index 100%
rename from training/lstmeval.cpp
rename to src/training/lstmeval.cpp
diff --git a/training/lstmtester.cpp b/src/training/lstmtester.cpp
similarity index 100%
rename from training/lstmtester.cpp
rename to src/training/lstmtester.cpp
diff --git a/training/lstmtester.h b/src/training/lstmtester.h
similarity index 100%
rename from training/lstmtester.h
rename to src/training/lstmtester.h
diff --git a/training/lstmtraining.cpp b/src/training/lstmtraining.cpp
similarity index 100%
rename from training/lstmtraining.cpp
rename to src/training/lstmtraining.cpp
diff --git a/training/merge_unicharsets.cpp b/src/training/merge_unicharsets.cpp
similarity index 100%
rename from training/merge_unicharsets.cpp
rename to src/training/merge_unicharsets.cpp
diff --git a/training/mergenf.cpp b/src/training/mergenf.cpp
similarity index 96% rename from training/mergenf.cpp rename to src/training/mergenf.cpp index 4c8e7037d8787ff2ba8ca26811699d301552dd30..d79a1824347a671a96d1fd57b4923814bb77b7f6 100644 --- a/training/mergenf.cpp +++ b/src/training/mergenf.cpp @@ -1,353 +1,353 @@ -/****************************************************************************** -** Filename: MergeNF.c -** Purpose: Program for merging similar nano-feature protos -** Author: Dan Johnson -** History: Wed Nov 21 09:55:23 1990, DSJ, Created. -** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
-******************************************************************************/ -#include "mergenf.h" -#include "host.h" -#include "efio.h" -#include "clusttool.h" -#include "cluster.h" -#include "oldlist.h" -#include "protos.h" -#include "ndminx.h" -#include "ocrfeatures.h" -#include "const.h" -#include "featdefs.h" -#include "intproto.h" -#include "params.h" - -#include -#include -#include - -/*-------------------once in subfeat---------------------------------*/ -double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ..."); - -double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ..."); - -double_VAR(training_similarity_curl, 2.0, "Similarity Curl ..."); - -/*-----------------------------once in fasttrain----------------------------------*/ -double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ..."); - -double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ..."); - -double_VAR(training_angle_pad, 45.0, "Angle pad ..."); - -/** - * Compare protos p1 and p2 and return an estimate of the - * worst evidence rating that will result for any part of p1 - * that is compared to p2. In other words, if p1 were broken - * into pico-features and each pico-feature was matched to p2, - * what is the worst evidence rating that will be achieved for - * any pico-feature. - * - * @param p1, p2 protos to be compared - * - * Globals: none - * - * @return Worst possible result when matching p1 to p2. - * @note Exceptions: none - * @note History: Mon Nov 26 08:27:53 1990, DSJ, Created. 
- */ -FLOAT32 CompareProtos(PROTO p1, PROTO p2) { - FEATURE Feature; - FLOAT32 WorstEvidence = WORST_EVIDENCE; - FLOAT32 Evidence; - FLOAT32 Angle, Length; - - /* if p1 and p2 are not close in length, don't let them match */ - Length = fabs (p1->Length - p2->Length); - if (Length > MAX_LENGTH_MISMATCH) - return (0.0); - - /* create a dummy pico-feature to be used for comparisons */ - Feature = NewFeature (&PicoFeatDesc); - Feature->Params[PicoFeatDir] = p1->Angle; - - /* convert angle to radians */ - Angle = p1->Angle * 2.0 * PI; - - /* find distance from center of p1 to 1/2 picofeat from end */ - Length = p1->Length / 2.0 - GetPicoFeatureLength () / 2.0; - if (Length < 0) Length = 0; - - /* set the dummy pico-feature at one end of p1 and match it to p2 */ - Feature->Params[PicoFeatX] = p1->X + cos (Angle) * Length; - Feature->Params[PicoFeatY] = p1->Y + sin (Angle) * Length; - if (DummyFastMatch (Feature, p2)) { - Evidence = SubfeatureEvidence (Feature, p2); - if (Evidence < WorstEvidence) - WorstEvidence = Evidence; - } else { - FreeFeature(Feature); - return 0.0; - } - - /* set the dummy pico-feature at the other end of p1 and match it to p2 */ - Feature->Params[PicoFeatX] = p1->X - cos (Angle) * Length; - Feature->Params[PicoFeatY] = p1->Y - sin (Angle) * Length; - if (DummyFastMatch (Feature, p2)) { - Evidence = SubfeatureEvidence (Feature, p2); - if (Evidence < WorstEvidence) - WorstEvidence = Evidence; - } else { - FreeFeature(Feature); - return 0.0; - } - - FreeFeature (Feature); - return (WorstEvidence); - -} /* CompareProtos */ - -/** - * This routine computes a proto which is the weighted - * average of protos p1 and p2. The new proto is returned - * in MergedProto. 
- * - * @param p1, p2 protos to be merged - * @param w1, w2 weight of each proto - * @param MergedProto place to put resulting merged proto - * - * Globals: none - * - * @return none (results are returned in MergedProto) - * @note Exceptions: none - * @note History: Mon Nov 26 08:15:08 1990, DSJ, Created. - */ -void ComputeMergedProto (PROTO p1, - PROTO p2, - FLOAT32 w1, - FLOAT32 w2, - PROTO MergedProto) { - FLOAT32 TotalWeight; - - TotalWeight = w1 + w2; - w1 /= TotalWeight; - w2 /= TotalWeight; - - MergedProto->X = p1->X * w1 + p2->X * w2; - MergedProto->Y = p1->Y * w1 + p2->Y * w2; - MergedProto->Length = p1->Length * w1 + p2->Length * w2; - MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2; - FillABC(MergedProto); -} /* ComputeMergedProto */ - -/** - * This routine searches through all of the prototypes in - * Class and returns the id of the proto which would provide - * the best approximation of Prototype. If no close - * approximation can be found, NO_PROTO is returned. - * - * @param Class class to search for matching old proto in - * @param NumMerged # of protos merged into each proto of Class - * @param Prototype new proto to find match for - * - * Globals: none - * - * @return Id of closest proto in Class or NO_PROTO. - * @note Exceptions: none - * @note History: Sat Nov 24 11:42:58 1990, DSJ, Created. 
- */ -int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], - PROTOTYPE *Prototype) { - PROTO_STRUCT NewProto; - PROTO_STRUCT MergedProto; - int Pid; - PROTO Proto; - int BestProto; - FLOAT32 BestMatch; - FLOAT32 Match, OldMatch, NewMatch; - - MakeNewFromOld (&NewProto, Prototype); - - BestProto = NO_PROTO; - BestMatch = WORST_MATCH_ALLOWED; - for (Pid = 0; Pid < Class->NumProtos; Pid++) { - Proto = ProtoIn(Class, Pid); - ComputeMergedProto(Proto, &NewProto, - (FLOAT32) NumMerged[Pid], 1.0, &MergedProto); - OldMatch = CompareProtos(Proto, &MergedProto); - NewMatch = CompareProtos(&NewProto, &MergedProto); - Match = MIN(OldMatch, NewMatch); - if (Match > BestMatch) { - BestProto = Pid; - BestMatch = Match; - } - } - return BestProto; -} /* FindClosestExistingProto */ - -/** - * This fills in the fields of the New proto based on the - * fields of the Old proto. - * - * @param New new proto to be filled in - * @param Old old proto to be converted - * - * Globals: none - * - * Exceptions: none - * History: Mon Nov 26 09:45:39 1990, DSJ, Created. - */ -void MakeNewFromOld(PROTO New, PROTOTYPE *Old) { - New->X = CenterX(Old->Mean); - New->Y = CenterY(Old->Mean); - New->Length = LengthOf(Old->Mean); - New->Angle = OrientationOf(Old->Mean); - FillABC(New); -} /* MakeNewFromOld */ - -/*-------------------once in subfeat---------------------------------*/ - -/** - * @name SubfeatureEvidence - * - * Compare a feature to a prototype. Print the result. 
- */ -FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto) { - float Distance; - float Dangle; - - Dangle = Proto->Angle - Feature->Params[PicoFeatDir]; - if (Dangle < -0.5) Dangle += 1.0; - if (Dangle > 0.5) Dangle -= 1.0; - Dangle *= training_angle_match_scale; - - Distance = Proto->A * Feature->Params[PicoFeatX] + - Proto->B * Feature->Params[PicoFeatY] + - Proto->C; - - return (EvidenceOf (Distance * Distance + Dangle * Dangle)); -} - -/** - * @name EvidenceOf - * - * Return the new type of evidence number corresponding to this - * distance value. This number is no longer based on the chi squared - * approximation. The equation that represents the transform is: - * 1 / (1 + (sim / midpoint) ^ curl) - */ -double EvidenceOf (double Similarity) { - - Similarity /= training_similarity_midpoint; - - if (training_similarity_curl == 3) - Similarity = Similarity * Similarity * Similarity; - else if (training_similarity_curl == 2) - Similarity = Similarity * Similarity; - else - Similarity = pow (Similarity, training_similarity_curl); - - return (1.0 / (1.0 + Similarity)); -} - -/** - * This routine returns TRUE if Feature would be matched - * by a fast match table built from Proto. - * - * @param Feature feature to be "fast matched" to proto - * @param Proto proto being "fast matched" against - * - * Globals: - * - training_tangent_bbox_pad bounding box pad tangent to proto - * - training_orthogonal_bbox_pad bounding box pad orthogonal to proto - * - * @return TRUE if feature could match Proto. - * @note Exceptions: none - * @note History: Wed Nov 14 17:19:58 1990, DSJ, Created. 
- */ -BOOL8 DummyFastMatch ( - FEATURE Feature, - PROTO Proto) -{ - FRECT BoundingBox; - FLOAT32 MaxAngleError; - FLOAT32 AngleError; - - MaxAngleError = training_angle_pad / 360.0; - AngleError = fabs (Proto->Angle - Feature->Params[PicoFeatDir]); - if (AngleError > 0.5) - AngleError = 1.0 - AngleError; - - if (AngleError > MaxAngleError) - return (FALSE); - - ComputePaddedBoundingBox (Proto, - training_tangent_bbox_pad * GetPicoFeatureLength (), - training_orthogonal_bbox_pad * GetPicoFeatureLength (), - &BoundingBox); - - return PointInside(&BoundingBox, Feature->Params[PicoFeatX], - Feature->Params[PicoFeatY]); -} /* DummyFastMatch */ - -/** - * This routine computes a bounding box that encloses the - * specified proto along with some padding. The - * amount of padding is specified as separate distances - * in the tangential and orthogonal directions. - * - * @param Proto proto to compute bounding box for - * @param TangentPad amount of pad to add in direction of segment - * @param OrthogonalPad amount of pad to add orthogonal to segment - * @param[out] BoundingBox place to put results - * - * Globals: none - * - * @return none (results are returned in BoundingBox) - * @note Exceptions: none - * @note History: Wed Nov 14 14:55:30 1990, DSJ, Created. - */ -void ComputePaddedBoundingBox (PROTO Proto, FLOAT32 TangentPad, - FLOAT32 OrthogonalPad, FRECT *BoundingBox) { - FLOAT32 Pad, Length, Angle; - FLOAT32 CosOfAngle, SinOfAngle; - - Length = Proto->Length / 2.0 + TangentPad; - Angle = Proto->Angle * 2.0 * PI; - CosOfAngle = fabs(cos(Angle)); - SinOfAngle = fabs(sin(Angle)); - - Pad = MAX (CosOfAngle * Length, SinOfAngle * OrthogonalPad); - BoundingBox->MinX = Proto->X - Pad; - BoundingBox->MaxX = Proto->X + Pad; - - Pad = MAX(SinOfAngle * Length, CosOfAngle * OrthogonalPad); - BoundingBox->MinY = Proto->Y - Pad; - BoundingBox->MaxY = Proto->Y + Pad; - -} /* ComputePaddedBoundingBox */ - -/** - * Return TRUE if point (X,Y) is inside of Rectangle. 
- * - * Globals: none - * - * @return TRUE if point (X,Y) is inside of Rectangle. - * @note Exceptions: none - * @note History: Wed Nov 14 17:26:35 1990, DSJ, Created. - */ -BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y) { - if (X < Rectangle->MinX) return (FALSE); - if (X > Rectangle->MaxX) return (FALSE); - if (Y < Rectangle->MinY) return (FALSE); - if (Y > Rectangle->MaxY) return (FALSE); - return (TRUE); - -} /* PointInside */ +/****************************************************************************** +** Filename: MergeNF.c +** Purpose: Program for merging similar nano-feature protos +** Author: Dan Johnson +** History: Wed Nov 21 09:55:23 1990, DSJ, Created. +** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+******************************************************************************/ +#include "mergenf.h" +#include "host.h" +#include "efio.h" +#include "clusttool.h" +#include "cluster.h" +#include "oldlist.h" +#include "protos.h" +#include "ndminx.h" +#include "ocrfeatures.h" +#include "const.h" +#include "featdefs.h" +#include "intproto.h" +#include "params.h" + +#include +#include +#include + +/*-------------------once in subfeat---------------------------------*/ +double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ..."); + +double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ..."); + +double_VAR(training_similarity_curl, 2.0, "Similarity Curl ..."); + +/*-----------------------------once in fasttrain----------------------------------*/ +double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ..."); + +double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ..."); + +double_VAR(training_angle_pad, 45.0, "Angle pad ..."); + +/** + * Compare protos p1 and p2 and return an estimate of the + * worst evidence rating that will result for any part of p1 + * that is compared to p2. In other words, if p1 were broken + * into pico-features and each pico-feature was matched to p2, + * what is the worst evidence rating that will be achieved for + * any pico-feature. + * + * @param p1, p2 protos to be compared + * + * Globals: none + * + * @return Worst possible result when matching p1 to p2. + * @note Exceptions: none + * @note History: Mon Nov 26 08:27:53 1990, DSJ, Created. 
+ */
+FLOAT32 CompareProtos(PROTO p1, PROTO p2) {
+  /* protos whose lengths differ too much are never allowed to match */
+  FLOAT32 Reach = fabs (p1->Length - p2->Length);
+  if (Reach > MAX_LENGTH_MISMATCH)
+    return (0.0);
+
+  /* build a dummy pico-feature that points the same way as p1 */
+  FEATURE Probe = NewFeature (&PicoFeatDesc);
+  Probe->Params[PicoFeatDir] = p1->Angle;
+
+  /* direction of p1 converted to radians */
+  FLOAT32 Radians = p1->Angle * 2.0 * PI;
+
+  /* distance from center of p1 to 1/2 picofeat from end, never negative */
+  Reach = p1->Length / 2.0 - GetPicoFeatureLength () / 2.0;
+  if (Reach < 0) Reach = 0;
+
+  FLOAT32 Lowest = WORST_EVIDENCE;
+  FLOAT32 Rating;
+
+  /* probe one end of p1; a fast-match miss rules the pair out at once */
+  Probe->Params[PicoFeatX] = p1->X + cos (Radians) * Reach;
+  Probe->Params[PicoFeatY] = p1->Y + sin (Radians) * Reach;
+  if (!DummyFastMatch (Probe, p2)) {
+    FreeFeature(Probe);
+    return 0.0;
+  }
+  Rating = SubfeatureEvidence (Probe, p2);
+  if (Rating < Lowest)
+    Lowest = Rating;
+
+  /* probe the other end of p1 in exactly the same way */
+  Probe->Params[PicoFeatX] = p1->X - cos (Radians) * Reach;
+  Probe->Params[PicoFeatY] = p1->Y - sin (Radians) * Reach;
+  if (!DummyFastMatch (Probe, p2)) {
+    FreeFeature(Probe);
+    return 0.0;
+  }
+  Rating = SubfeatureEvidence (Probe, p2);
+  if (Rating < Lowest)
+    Lowest = Rating;
+
+  FreeFeature (Probe);
+  return (Lowest);
+
+} /* CompareProtos */
+
+/**
+ * This routine computes a proto which is the weighted
+ * average of protos p1 and p2. The new proto is returned
+ * in MergedProto.
+ *
+ * @param p1, p2 protos to be merged
+ * @param w1, w2 weight of each proto
+ * @param MergedProto place to put resulting merged proto
+ *
+ * Globals: none
+ *
+ * @return none (results are returned in MergedProto)
+ * @note Exceptions: none
+ * @note History: Mon Nov 26 08:15:08 1990, DSJ, Created.
+ */
+void ComputeMergedProto (PROTO p1,
+                         PROTO p2,
+                         FLOAT32 w1,
+                         FLOAT32 w2,
+                         PROTO MergedProto) {
+  /* rescale the two weights so that together they add up to one */
+  const FLOAT32 WeightSum = w1 + w2;
+  const FLOAT32 Share1 = w1 / WeightSum;
+  const FLOAT32 Share2 = w2 / WeightSum;
+
+  /* each field of the result is the weighted blend of the inputs */
+  MergedProto->Angle = p1->Angle * Share1 + p2->Angle * Share2;
+  MergedProto->Length = p1->Length * Share1 + p2->Length * Share2;
+  MergedProto->Y = p1->Y * Share1 + p2->Y * Share2;
+  MergedProto->X = p1->X * Share1 + p2->X * Share2;
+
+  /* refresh the A, B, C fields from the blended values */
+  FillABC(MergedProto);
+} /* ComputeMergedProto */
+
+/**
+ * This routine searches through all of the prototypes in
+ * Class and returns the id of the proto which would provide
+ * the best approximation of Prototype. If no close
+ * approximation can be found, NO_PROTO is returned.
+ *
+ * @param Class class to search for matching old proto in
+ * @param NumMerged # of protos merged into each proto of Class
+ * @param Prototype new proto to find match for
+ *
+ * Globals: none
+ *
+ * @return Id of closest proto in Class or NO_PROTO.
+ * @note Exceptions: none
+ * @note History: Sat Nov 24 11:42:58 1990, DSJ, Created.
+ */
+int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
+                             PROTOTYPE *Prototype) {
+  /* proto form of the incoming cluster prototype */
+  PROTO_STRUCT Candidate;
+  MakeNewFromOld (&Candidate, Prototype);
+
+  int WinnerId = NO_PROTO;
+  FLOAT32 WinnerScore = WORST_MATCH_ALLOWED;
+
+  for (int Id = 0; Id < Class->NumProtos; Id++) {
+    PROTO Existing = ProtoIn(Class, Id);
+
+    /* what the existing proto would become if the candidate joined it;
+       the existing proto is weighted by how many protos it already holds */
+    PROTO_STRUCT Blend;
+    ComputeMergedProto(Existing, &Candidate,
+                       (FLOAT32) NumMerged[Id], 1.0, &Blend);
+
+    /* the merge is only as good as its worse-fitting member */
+    FLOAT32 ExistingFit = CompareProtos(Existing, &Blend);
+    FLOAT32 CandidateFit = CompareProtos(&Candidate, &Blend);
+    FLOAT32 Score = MIN(ExistingFit, CandidateFit);
+
+    /* strict comparison: on equal scores the earlier proto id is kept */
+    if (Score > WinnerScore) {
+      WinnerId = Id;
+      WinnerScore = Score;
+    }
+  }
+  return WinnerId;
+} /* FindClosestExistingProto */
+
+/**
+ * This fills in the fields of the New proto based on the
+ * fields of the Old proto.
+ *
+ * @param New new proto to be filled in
+ * @param Old old proto to be converted
+ *
+ * Globals: none
+ *
+ * Exceptions: none
+ * History: Mon Nov 26 09:45:39 1990, DSJ, Created.
+ */
+void MakeNewFromOld(PROTO New, PROTOTYPE *Old) {
+  /* orientation and length come from the old mean */
+  New->Angle = OrientationOf(Old->Mean);
+  New->Length = LengthOf(Old->Mean);
+  /* so does the center position */
+  New->Y = CenterY(Old->Mean);
+  New->X = CenterX(Old->Mean);
+  /* refresh the A, B, C fields once everything else is set */
+  FillABC(New);
+} /* MakeNewFromOld */
+
+/*-------------------once in subfeat---------------------------------*/
+
+/**
+ * @name SubfeatureEvidence
+ *
+ * Compare a feature to a prototype and return the evidence rating.
+ */
+FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto) {
+  /* direction difference, shifted by one whole unit when it falls
+     outside [-0.5, 0.5], then scaled by the angle match scale */
+  float Turn = Proto->Angle - Feature->Params[PicoFeatDir];
+  if (Turn < -0.5) Turn += 1.0;
+  if (Turn > 0.5) Turn -= 1.0;
+  Turn *= training_angle_match_scale;
+
+  /* the proto's A, B, C fields evaluated at the feature position */
+  float Offset = Proto->A * Feature->Params[PicoFeatX] +
+                 Proto->B * Feature->Params[PicoFeatY] +
+                 Proto->C;
+
+  /* the summed squares are what gets converted into an evidence value */
+  return (EvidenceOf (Offset * Offset + Turn * Turn));
+}
+
+/**
+ * @name EvidenceOf
+ *
+ * Return the new type of evidence number corresponding to this
+ * distance value. This number is no longer based on the chi squared
+ * approximation. The equation that represents the transform is:
+ * 1 / (1 + (sim / midpoint) ^ curl)
+ */
+double EvidenceOf (double Similarity) {
+  /* express the input relative to the configured midpoint */
+  Similarity /= training_similarity_midpoint;
+
+  /* curl values 2 and 3 are multiplied out directly; any other value
+     goes through pow */
+  if (training_similarity_curl == 3) {
+    Similarity = Similarity * Similarity * Similarity;
+  } else if (training_similarity_curl == 2) {
+    Similarity = Similarity * Similarity;
+  } else {
+    Similarity = pow (Similarity, training_similarity_curl);
+  }
+
+  return (1.0 / (1.0 + Similarity));
+}
+
+/**
+ * This routine returns TRUE if Feature would be matched
+ * by a fast match table built from Proto.
+ *
+ * @param Feature feature to be "fast matched" to proto
+ * @param Proto proto being "fast matched" against
+ *
+ * Globals:
+ * - training_tangent_bbox_pad bounding box pad tangent to proto
+ * - training_orthogonal_bbox_pad bounding box pad orthogonal to proto
+ *
+ * @return TRUE if feature could match Proto.
+ * @note Exceptions: none
+ * @note History: Wed Nov 14 17:19:58 1990, DSJ, Created.
+ */ +BOOL8 DummyFastMatch ( + FEATURE Feature, + PROTO Proto) +{ + FRECT BoundingBox; + FLOAT32 MaxAngleError; + FLOAT32 AngleError; + + MaxAngleError = training_angle_pad / 360.0; + AngleError = fabs (Proto->Angle - Feature->Params[PicoFeatDir]); + if (AngleError > 0.5) + AngleError = 1.0 - AngleError; + + if (AngleError > MaxAngleError) + return (FALSE); + + ComputePaddedBoundingBox (Proto, + training_tangent_bbox_pad * GetPicoFeatureLength (), + training_orthogonal_bbox_pad * GetPicoFeatureLength (), + &BoundingBox); + + return PointInside(&BoundingBox, Feature->Params[PicoFeatX], + Feature->Params[PicoFeatY]); +} /* DummyFastMatch */ + +/** + * This routine computes a bounding box that encloses the + * specified proto along with some padding. The + * amount of padding is specified as separate distances + * in the tangential and orthogonal directions. + * + * @param Proto proto to compute bounding box for + * @param TangentPad amount of pad to add in direction of segment + * @param OrthogonalPad amount of pad to add orthogonal to segment + * @param[out] BoundingBox place to put results + * + * Globals: none + * + * @return none (results are returned in BoundingBox) + * @note Exceptions: none + * @note History: Wed Nov 14 14:55:30 1990, DSJ, Created. + */ +void ComputePaddedBoundingBox (PROTO Proto, FLOAT32 TangentPad, + FLOAT32 OrthogonalPad, FRECT *BoundingBox) { + FLOAT32 Pad, Length, Angle; + FLOAT32 CosOfAngle, SinOfAngle; + + Length = Proto->Length / 2.0 + TangentPad; + Angle = Proto->Angle * 2.0 * PI; + CosOfAngle = fabs(cos(Angle)); + SinOfAngle = fabs(sin(Angle)); + + Pad = MAX (CosOfAngle * Length, SinOfAngle * OrthogonalPad); + BoundingBox->MinX = Proto->X - Pad; + BoundingBox->MaxX = Proto->X + Pad; + + Pad = MAX(SinOfAngle * Length, CosOfAngle * OrthogonalPad); + BoundingBox->MinY = Proto->Y - Pad; + BoundingBox->MaxY = Proto->Y + Pad; + +} /* ComputePaddedBoundingBox */ + +/** + * Return TRUE if point (X,Y) is inside of Rectangle. 
+ * + * Globals: none + * + * @return TRUE if point (X,Y) is inside of Rectangle. + * @note Exceptions: none + * @note History: Wed Nov 14 17:26:35 1990, DSJ, Created. + */ +BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y) { + if (X < Rectangle->MinX) return (FALSE); + if (X > Rectangle->MaxX) return (FALSE); + if (Y < Rectangle->MinY) return (FALSE); + if (Y > Rectangle->MaxY) return (FALSE); + return (TRUE); + +} /* PointInside */ diff --git a/training/mergenf.h b/src/training/mergenf.h similarity index 96% rename from training/mergenf.h rename to src/training/mergenf.h index 384d60caf3581661b1037aac70c89ec9e510993c..e640a0f0ae9b681108b53bcf850bbd30a2d90d26 100644 --- a/training/mergenf.h +++ b/src/training/mergenf.h @@ -1,103 +1,103 @@ -/****************************************************************************** -** Filename: MergeNF.c -** Purpose: Program for merging similar nano-feature protos -** Author: Dan Johnson -** History: Wed Nov 21 09:55:23 1990, DSJ, Created. -** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
-******************************************************************************/ - -#ifndef TESSERACT_TRAINING_MERGENF_H_ -#define TESSERACT_TRAINING_MERGENF_H_ - -/**---------------------------------------------------------------------------- - Include Files and Type Defines -----------------------------------------------------------------------------**/ -#include "protos.h" -#include "cluster.h" -#include "ocrfeatures.h" -#include "callcpp.h" -#include "picofeat.h" - - -#define WORST_MATCH_ALLOWED (0.9) -#define WORST_EVIDENCE (1.0) -#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ()) - - -#define PROTO_SUFFIX ".mf.p" -#define CONFIG_SUFFIX ".cl" -#define NO_PROTO (-1) -#define XPOSITION 0 -#define YPOSITION 1 -#define MFLENGTH 2 -#define ORIENTATION 3 - -typedef struct -{ - FLOAT32 MinX, MaxX, MinY, MaxY; -} FRECT; - -/**---------------------------------------------------------------------------- - Public Macros -----------------------------------------------------------------------------**/ -#define CenterX(M) ( (M)[XPOSITION] ) -#define CenterY(M) ( (M)[YPOSITION] ) -#define LengthOf(M) ( (M)[MFLENGTH] ) -#define OrientationOf(M) ( (M)[ORIENTATION] ) - -/**---------------------------------------------------------------------------- - Public Function Prototypes -----------------------------------------------------------------------------**/ -FLOAT32 CompareProtos ( - PROTO p1, - PROTO p2); - -void ComputeMergedProto ( - PROTO p1, - PROTO p2, - FLOAT32 w1, - FLOAT32 w2, - PROTO MergedProto); - -int FindClosestExistingProto ( - CLASS_TYPE Class, - int NumMerged[], - PROTOTYPE *Prototype); - -void MakeNewFromOld ( - PROTO New, - PROTOTYPE *Old); - -FLOAT32 SubfeatureEvidence ( - FEATURE Feature, - PROTO Proto); - -double EvidenceOf ( - register double Similarity); - -BOOL8 DummyFastMatch ( - FEATURE Feature, - PROTO Proto); - -void ComputePaddedBoundingBox ( - PROTO Proto, - FLOAT32 TangentPad, - FLOAT32 OrthogonalPad, - FRECT *BoundingBox); - -BOOL8 
PointInside ( - FRECT *Rectangle, - FLOAT32 X, - FLOAT32 Y); - -#endif // TESSERACT_TRAINING_MERGENF_H_ +/****************************************************************************** +** Filename: MergeNF.c +** Purpose: Program for merging similar nano-feature protos +** Author: Dan Johnson +** History: Wed Nov 21 09:55:23 1990, DSJ, Created. +** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. +******************************************************************************/ + +#ifndef TESSERACT_TRAINING_MERGENF_H_ +#define TESSERACT_TRAINING_MERGENF_H_ + +/**---------------------------------------------------------------------------- + Include Files and Type Defines +----------------------------------------------------------------------------**/ +#include "protos.h" +#include "cluster.h" +#include "ocrfeatures.h" +#include "callcpp.h" +#include "picofeat.h" + + +#define WORST_MATCH_ALLOWED (0.9) +#define WORST_EVIDENCE (1.0) +#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ()) + + +#define PROTO_SUFFIX ".mf.p" +#define CONFIG_SUFFIX ".cl" +#define NO_PROTO (-1) +#define XPOSITION 0 +#define YPOSITION 1 +#define MFLENGTH 2 +#define ORIENTATION 3 + +typedef struct +{ + FLOAT32 MinX, MaxX, MinY, MaxY; +} FRECT; + +/**---------------------------------------------------------------------------- + Public Macros +----------------------------------------------------------------------------**/ +#define 
CenterX(M) ( (M)[XPOSITION] ) +#define CenterY(M) ( (M)[YPOSITION] ) +#define LengthOf(M) ( (M)[MFLENGTH] ) +#define OrientationOf(M) ( (M)[ORIENTATION] ) + +/**---------------------------------------------------------------------------- + Public Function Prototypes +----------------------------------------------------------------------------**/ +FLOAT32 CompareProtos ( + PROTO p1, + PROTO p2); + +void ComputeMergedProto ( + PROTO p1, + PROTO p2, + FLOAT32 w1, + FLOAT32 w2, + PROTO MergedProto); + +int FindClosestExistingProto ( + CLASS_TYPE Class, + int NumMerged[], + PROTOTYPE *Prototype); + +void MakeNewFromOld ( + PROTO New, + PROTOTYPE *Old); + +FLOAT32 SubfeatureEvidence ( + FEATURE Feature, + PROTO Proto); + +double EvidenceOf ( + register double Similarity); + +BOOL8 DummyFastMatch ( + FEATURE Feature, + PROTO Proto); + +void ComputePaddedBoundingBox ( + PROTO Proto, + FLOAT32 TangentPad, + FLOAT32 OrthogonalPad, + FRECT *BoundingBox); + +BOOL8 PointInside ( + FRECT *Rectangle, + FLOAT32 X, + FLOAT32 Y); + +#endif // TESSERACT_TRAINING_MERGENF_H_ diff --git a/training/mftraining.cpp b/src/training/mftraining.cpp similarity index 100% rename from training/mftraining.cpp rename to src/training/mftraining.cpp diff --git a/training/normstrngs.cpp b/src/training/normstrngs.cpp similarity index 100% rename from training/normstrngs.cpp rename to src/training/normstrngs.cpp diff --git a/training/normstrngs.h b/src/training/normstrngs.h similarity index 100% rename from training/normstrngs.h rename to src/training/normstrngs.h diff --git a/training/pango_font_info.cpp b/src/training/pango_font_info.cpp similarity index 100% rename from training/pango_font_info.cpp rename to src/training/pango_font_info.cpp diff --git a/training/pango_font_info.h b/src/training/pango_font_info.h similarity index 100% rename from training/pango_font_info.h rename to src/training/pango_font_info.h diff --git a/training/set_unicharset_properties.cpp 
b/src/training/set_unicharset_properties.cpp similarity index 100% rename from training/set_unicharset_properties.cpp rename to src/training/set_unicharset_properties.cpp diff --git a/training/shapeclustering.cpp b/src/training/shapeclustering.cpp similarity index 100% rename from training/shapeclustering.cpp rename to src/training/shapeclustering.cpp diff --git a/training/stringrenderer.cpp b/src/training/stringrenderer.cpp similarity index 100% rename from training/stringrenderer.cpp rename to src/training/stringrenderer.cpp diff --git a/training/stringrenderer.h b/src/training/stringrenderer.h similarity index 100% rename from training/stringrenderer.h rename to src/training/stringrenderer.h diff --git a/training/tessopt.cpp b/src/training/tessopt.cpp similarity index 100% rename from training/tessopt.cpp rename to src/training/tessopt.cpp diff --git a/training/tessopt.h b/src/training/tessopt.h similarity index 100% rename from training/tessopt.h rename to src/training/tessopt.h diff --git a/training/tesstrain.sh b/src/training/tesstrain.sh old mode 100755 new mode 100644 similarity index 100% rename from training/tesstrain.sh rename to src/training/tesstrain.sh diff --git a/training/tesstrain_utils.sh b/src/training/tesstrain_utils.sh old mode 100755 new mode 100644 similarity index 100% rename from training/tesstrain_utils.sh rename to src/training/tesstrain_utils.sh diff --git a/training/text2image.cpp b/src/training/text2image.cpp similarity index 100% rename from training/text2image.cpp rename to src/training/text2image.cpp diff --git a/training/tlog.cpp b/src/training/tlog.cpp similarity index 97% rename from training/tlog.cpp rename to src/training/tlog.cpp index e493bd090cc2ec81f6583f7fb0ab541af492e865..7c32c573ab9265f27d24240619abd647016adb77 100644 --- a/training/tlog.cpp +++ b/src/training/tlog.cpp @@ -1,23 +1,23 @@ -/********************************************************************** - * File: tlog.cpp - * Description: Variant of printf with 
logging level controllable by a - * commandline flag. - * Author: Ranjith Unnikrishnan - * Created: Wed Nov 20 2013 - * - * (C) Copyright 2013, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include "tlog.h" - -INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output"); +/********************************************************************** + * File: tlog.cpp + * Description: Variant of printf with logging level controllable by a + * commandline flag. + * Author: Ranjith Unnikrishnan + * Created: Wed Nov 20 2013 + * + * (C) Copyright 2013, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+ * + **********************************************************************/ + +#include "tlog.h" + +INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output"); diff --git a/training/tlog.h b/src/training/tlog.h similarity index 97% rename from training/tlog.h rename to src/training/tlog.h index 6dcf6e6c3b5dd9a7a34a9e8a461f85a9370b3584..cf7de652f4386e2df1931938f7dca6b297d74364 100644 --- a/training/tlog.h +++ b/src/training/tlog.h @@ -1,41 +1,41 @@ -/********************************************************************** - * File: tlog.h - * Description: Variant of printf with logging level controllable by a - * commandline flag. - * Author: Ranjith Unnikrishnan - * Created: Wed Nov 20 2013 - * - * (C) Copyright 2013, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ -#ifndef TESSERACT_TRAINING_TLOG_H_ -#define TESSERACT_TRAINING_TLOG_H_ - -#include "commandlineflags.h" -#include "errcode.h" -#include "tprintf.h" - -DECLARE_INT_PARAM_FLAG(tlog_level); - -// Variant guarded by the numeric logging level parameter FLAGS_tlog_level -// (default 0). Code using ParseCommandLineFlags() can control its value using -// the --tlog_level commandline argument. Otherwise it must be specified in a -// config file like other params. -#define tlog(level, ...) 
{ \ - if (FLAGS_tlog_level >= level) { \ - tprintf_internal(__VA_ARGS__); \ - } \ -} - -#define TLOG_IS_ON(level) (FLAGS_tlog_level >= level) - -#endif // TESSERACT_TRAINING_TLOG_H_ +/********************************************************************** + * File: tlog.h + * Description: Variant of printf with logging level controllable by a + * commandline flag. + * Author: Ranjith Unnikrishnan + * Created: Wed Nov 20 2013 + * + * (C) Copyright 2013, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ +#ifndef TESSERACT_TRAINING_TLOG_H_ +#define TESSERACT_TRAINING_TLOG_H_ + +#include "commandlineflags.h" +#include "errcode.h" +#include "tprintf.h" + +DECLARE_INT_PARAM_FLAG(tlog_level); + +// Variant guarded by the numeric logging level parameter FLAGS_tlog_level +// (default 0). Code using ParseCommandLineFlags() can control its value using +// the --tlog_level commandline argument. Otherwise it must be specified in a +// config file like other params. +#define tlog(level, ...) 
{ \ + if (FLAGS_tlog_level >= level) { \ + tprintf_internal(__VA_ARGS__); \ + } \ +} + +#define TLOG_IS_ON(level) (FLAGS_tlog_level >= level) + +#endif // TESSERACT_TRAINING_TLOG_H_ diff --git a/training/unicharset_extractor.cpp b/src/training/unicharset_extractor.cpp similarity index 100% rename from training/unicharset_extractor.cpp rename to src/training/unicharset_extractor.cpp diff --git a/training/unicharset_training_utils.cpp b/src/training/unicharset_training_utils.cpp similarity index 100% rename from training/unicharset_training_utils.cpp rename to src/training/unicharset_training_utils.cpp diff --git a/training/unicharset_training_utils.h b/src/training/unicharset_training_utils.h similarity index 100% rename from training/unicharset_training_utils.h rename to src/training/unicharset_training_utils.h diff --git a/training/util.h b/src/training/util.h similarity index 100% rename from training/util.h rename to src/training/util.h diff --git a/training/validate_grapheme.cpp b/src/training/validate_grapheme.cpp similarity index 100% rename from training/validate_grapheme.cpp rename to src/training/validate_grapheme.cpp diff --git a/training/validate_grapheme.h b/src/training/validate_grapheme.h similarity index 97% rename from training/validate_grapheme.h rename to src/training/validate_grapheme.h index 138ad57075f79c777bcb6351b5e0dda5360eb432..3f06e481725c71febedd64f68db1e1b2b62dcc33 100644 --- a/training/validate_grapheme.h +++ b/src/training/validate_grapheme.h @@ -1,35 +1,35 @@ -#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ -#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ - -#include "validator.h" - -namespace tesseract { - -// Subclass of Validator that validates and segments generic unicode into -// grapheme clusters, including Latin with diacritics. 
-class ValidateGrapheme : public Validator { - public: - ValidateGrapheme(ViramaScript script, bool report_errors) - : Validator(script, report_errors) {} - ~ValidateGrapheme() {} - - protected: - // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to - // parts_ and output_. Returns true if a valid Grapheme was consumed, - // otherwise does not increment codes_used_. - bool ConsumeGraphemeIfValid() override; - // Returns the CharClass corresponding to the given Unicode ch. - CharClass UnicodeToCharClass(char32 ch) const override; - - private: - // Helper returns true if the sequence prev_ch,ch is invalid. - bool IsBadlyFormed(char32 prev_ch, char32 ch); - // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel. - static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch); - // Helper returns true if the sequence prev_ch,ch is invalid Thai. - static bool IsBadlyFormedThai(char32 prev_ch, char32 ch); -}; - -} // namespace tesseract - -#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ +#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ +#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments generic unicode into +// grapheme clusters, including Latin with diacritics. +class ValidateGrapheme : public Validator { + public: + ValidateGrapheme(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateGrapheme() {} + + protected: + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch. + CharClass UnicodeToCharClass(char32 ch) const override; + + private: + // Helper returns true if the sequence prev_ch,ch is invalid. 
+ bool IsBadlyFormed(char32 prev_ch, char32 ch); + // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel. + static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch); + // Helper returns true if the sequence prev_ch,ch is invalid Thai. + static bool IsBadlyFormedThai(char32 prev_ch, char32 ch); +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ diff --git a/training/validate_indic.cpp b/src/training/validate_indic.cpp similarity index 100% rename from training/validate_indic.cpp rename to src/training/validate_indic.cpp diff --git a/training/validate_indic.h b/src/training/validate_indic.h similarity index 97% rename from training/validate_indic.h rename to src/training/validate_indic.h index 62dbcb23d100811f35f5cbd9b7bdad5c117d41f2..9c1656af9cc927d798b582f5bbd5bb408f42aede 100644 --- a/training/validate_indic.h +++ b/src/training/validate_indic.h @@ -1,44 +1,44 @@ -#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_ -#define TESSERACT_TRAINING_VALIDATE_INDIC_H_ - -#include "validator.h" - -namespace tesseract { - -// Subclass of Validator that validates and segments Indic scripts in the -// unicode range 0x900-0xdff (Devanagari-Sinhala). -class ValidateIndic : public Validator { - public: - ValidateIndic(ViramaScript script, bool report_errors) - : Validator(script, report_errors) {} - ~ValidateIndic() {} - - protected: - // Returns whether codes matches the pattern for an Indic Grapheme. - // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to - // parts_ and output_. Returns true if a valid Grapheme was consumed, - // otherwise does not increment codes_used_. - bool ConsumeGraphemeIfValid() override; - // Returns the CharClass corresponding to the given Unicode ch. - Validator::CharClass UnicodeToCharClass(char32 ch) const override; - - private: - // Helper consumes/copies a virama and any associated post-virama joiners. 
- bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra); - // Helper consumes/copies a series of consonants separated by viramas while - // valid, but not any vowel or other modifiers. - bool ConsumeConsonantHeadIfValid(); - // Helper consumes/copies a tail part of a consonant, comprising optional - // matra/piece, vowel modifier, vedic mark, terminating virama. - bool ConsumeConsonantTailIfValid(); - // Helper consumes/copies a vowel and optional modifiers. - bool ConsumeVowelIfValid(); - - // Some special unicodes used only for Indic processing. - static const char32 kYayana = 0xdba; // Sinhala Ya - static const char32 kRayana = 0xdbb; // Sinhala Ra -}; - -} // namespace tesseract - -#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_ +#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_ +#define TESSERACT_TRAINING_VALIDATE_INDIC_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments Indic scripts in the +// unicode range 0x900-0xdff (Devanagari-Sinhala). +class ValidateIndic : public Validator { + public: + ValidateIndic(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateIndic() {} + + protected: + // Returns whether codes matches the pattern for an Indic Grapheme. + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch. + Validator::CharClass UnicodeToCharClass(char32 ch) const override; + + private: + // Helper consumes/copies a virama and any associated post-virama joiners. + bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra); + // Helper consumes/copies a series of consonants separated by viramas while + // valid, but not any vowel or other modifiers. 
+ bool ConsumeConsonantHeadIfValid(); + // Helper consumes/copies a tail part of a consonant, comprising optional + // matra/piece, vowel modifier, vedic mark, terminating virama. + bool ConsumeConsonantTailIfValid(); + // Helper consumes/copies a vowel and optional modifiers. + bool ConsumeVowelIfValid(); + + // Some special unicodes used only for Indic processing. + static const char32 kYayana = 0xdba; // Sinhala Ya + static const char32 kRayana = 0xdbb; // Sinhala Ra +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_ diff --git a/training/validate_khmer.cpp b/src/training/validate_khmer.cpp similarity index 97% rename from training/validate_khmer.cpp rename to src/training/validate_khmer.cpp index 45c8f061de2b98396c020259d715b2f5f52c05bb..709b908d05811290ca17c05b36c76168ea65d74f 100644 --- a/training/validate_khmer.cpp +++ b/src/training/validate_khmer.cpp @@ -1,106 +1,106 @@ -#include "validate_khmer.h" -#include "errcode.h" -#include "tprintf.h" - -namespace tesseract { - -// Returns whether codes matches the pattern for a Khmer Grapheme. -// Taken from unicode standard: -// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf. -// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation -// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf. -// Translated to the codes used by the CharClass enum: -// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC} -// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter. -// Also the Consonant class here includes independent vowels, as they are -// treated the same anyway. -// In the split grapheme mode, the only characters that get grouped are the -// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in -// the BNF syntax, so who knows what they do. 
-bool ValidateKhmer::ConsumeGraphemeIfValid() { - int num_codes = codes_.size(); - if (codes_used_ == num_codes) return false; - if (codes_[codes_used_].first == CharClass::kOther) { - UseMultiCode(1); - return true; - } - if (codes_[codes_used_].first != CharClass::kConsonant) { - if (report_errors_) { - tprintf("Invalid start of Khmer syllable:0x%x\n", - codes_[codes_used_].second); - } - return false; - } - if (UseMultiCode(1)) return true; - if (codes_[codes_used_].first == CharClass::kRobat || - codes_[codes_used_].first == CharClass::kNukta) { - if (UseMultiCode(1)) return true; - } - while (codes_used_ + 1 < num_codes && - codes_[codes_used_].first == CharClass::kVirama && - codes_[codes_used_ + 1].first == CharClass::kConsonant) { - ASSERT_HOST(!CodeOnlyToOutput()); - if (UseMultiCode(2)) return true; - if (codes_[codes_used_].first == CharClass::kRobat) { - if (UseMultiCode(1)) return true; - } - } - int num_matra_parts = 0; - if (codes_[codes_used_].second == kZeroWidthJoiner || - codes_[codes_used_].second == kZeroWidthNonJoiner) { - if (CodeOnlyToOutput()) { - if (report_errors_) { - tprintf("Unterminated joiner: 0x%x\n", output_.back()); - } - return false; - } - ++num_matra_parts; - } - // Not quite as shown by the BNF, the matra piece is allowed as a matra on its - // own or as an addition to other matras. 
- if (codes_[codes_used_].first == CharClass::kMatra || - codes_[codes_used_].first == CharClass::kMatraPiece) { - ++num_matra_parts; - if (UseMultiCode(num_matra_parts)) return true; - } else if (num_matra_parts) { - if (report_errors_) { - tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", - output_.back(), codes_[codes_used_].second); - } - return false; - } - if (codes_[codes_used_].first == CharClass::kMatraPiece && - codes_[codes_used_ - 1].first != CharClass::kMatraPiece) { - if (UseMultiCode(1)) return true; - } - if (codes_[codes_used_].first == CharClass::kVowelModifier) { - if (UseMultiCode(1)) return true; - } - if (codes_used_ + 1 < num_codes && - codes_[codes_used_].first == CharClass::kVirama && - codes_[codes_used_ + 1].first == CharClass::kConsonant) { - ASSERT_HOST(!CodeOnlyToOutput()); - if (UseMultiCode(2)) return true; - } - return true; -} - -Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const { - if (IsVedicAccent(ch)) return CharClass::kVedicMark; - if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; - if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; - // Offset from the start of the relevant unicode code block aka code page. - int off = ch - static_cast(script_); - // Anything in another code block is other. - if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; - if (off <= 0x33) return CharClass::kConsonant; - if (off <= 0x45) return CharClass::kMatra; - if (off == 0x46) return CharClass::kMatraPiece; - if (off == 0x4c) return CharClass::kRobat; - if (off == 0x49 || off == 0x4a) return CharClass::kNukta; - if (off <= 0x51) return CharClass::kVowelModifier; - if (off == 0x52) return CharClass::kVirama; - return CharClass::kOther; -} - -} // namespace tesseract +#include "validate_khmer.h" +#include "errcode.h" +#include "tprintf.h" + +namespace tesseract { + +// Returns whether codes matches the pattern for a Khmer Grapheme. 
+// Taken from unicode standard: +// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf. +// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation +// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf. +// Translated to the codes used by the CharClass enum: +// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC} +// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter. +// Also the Consonant class here includes independent vowels, as they are +// treated the same anyway. +// In the split grapheme mode, the only characters that get grouped are the +// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in +// the BNF syntax, so who knows what they do. +bool ValidateKhmer::ConsumeGraphemeIfValid() { + int num_codes = codes_.size(); + if (codes_used_ == num_codes) return false; + if (codes_[codes_used_].first == CharClass::kOther) { + UseMultiCode(1); + return true; + } + if (codes_[codes_used_].first != CharClass::kConsonant) { + if (report_errors_) { + tprintf("Invalid start of Khmer syllable:0x%x\n", + codes_[codes_used_].second); + } + return false; + } + if (UseMultiCode(1)) return true; + if (codes_[codes_used_].first == CharClass::kRobat || + codes_[codes_used_].first == CharClass::kNukta) { + if (UseMultiCode(1)) return true; + } + while (codes_used_ + 1 < num_codes && + codes_[codes_used_].first == CharClass::kVirama && + codes_[codes_used_ + 1].first == CharClass::kConsonant) { + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(2)) return true; + if (codes_[codes_used_].first == CharClass::kRobat) { + if (UseMultiCode(1)) return true; + } + } + int num_matra_parts = 0; + if (codes_[codes_used_].second == kZeroWidthJoiner || + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (CodeOnlyToOutput()) { + if (report_errors_) { + tprintf("Unterminated joiner: 0x%x\n", output_.back()); + } + return false; + } + ++num_matra_parts; + } + // Not quite as shown by the BNF, the matra 
piece is allowed as a matra on its + // own or as an addition to other matras. + if (codes_[codes_used_].first == CharClass::kMatra || + codes_[codes_used_].first == CharClass::kMatraPiece) { + ++num_matra_parts; + if (UseMultiCode(num_matra_parts)) return true; + } else if (num_matra_parts) { + if (report_errors_) { + tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", + output_.back(), codes_[codes_used_].second); + } + return false; + } + if (codes_[codes_used_].first == CharClass::kMatraPiece && + codes_[codes_used_ - 1].first != CharClass::kMatraPiece) { + if (UseMultiCode(1)) return true; + } + if (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + } + if (codes_used_ + 1 < num_codes && + codes_[codes_used_].first == CharClass::kVirama && + codes_[codes_used_ + 1].first == CharClass::kConsonant) { + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(2)) return true; + } + return true; +} + +Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const { + if (IsVedicAccent(ch)) return CharClass::kVedicMark; + if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; + if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; + // Offset from the start of the relevant unicode code block aka code page. + int off = ch - static_cast(script_); + // Anything in another code block is other. 
+ if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; + if (off <= 0x33) return CharClass::kConsonant; + if (off <= 0x45) return CharClass::kMatra; + if (off == 0x46) return CharClass::kMatraPiece; + if (off == 0x4c) return CharClass::kRobat; + if (off == 0x49 || off == 0x4a) return CharClass::kNukta; + if (off <= 0x51) return CharClass::kVowelModifier; + if (off == 0x52) return CharClass::kVirama; + return CharClass::kOther; +} + +} // namespace tesseract diff --git a/training/validate_khmer.h b/src/training/validate_khmer.h similarity index 97% rename from training/validate_khmer.h rename to src/training/validate_khmer.h index a2fe75c962994ed25e6f372430dd8f9fd9f79caf..e9897010cac42cfed1c87fcca79ecb29a580102e 100644 --- a/training/validate_khmer.h +++ b/src/training/validate_khmer.h @@ -1,27 +1,27 @@ -#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_ -#define TESSERACT_TRAINING_VALIDATE_KHMER_H_ - -#include "validator.h" - -namespace tesseract { - -// Subclass of Validator that validates and segments Khmer. -class ValidateKhmer : public Validator { - public: - ValidateKhmer(ViramaScript script, bool report_errors) - : Validator(script, report_errors) {} - ~ValidateKhmer() {} - - protected: - // Returns whether codes matches the pattern for an Khmer Grapheme. - // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to - // parts_ and output_. Returns true if a valid Grapheme was consumed, - // otherwise does not increment codes_used_. - bool ConsumeGraphemeIfValid() override; - // Returns the CharClass corresponding to the given Unicode ch. - CharClass UnicodeToCharClass(char32 ch) const override; -}; - -} // namespace tesseract - -#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_ +#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_ +#define TESSERACT_TRAINING_VALIDATE_KHMER_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments Khmer. 
+class ValidateKhmer : public Validator { + public: + ValidateKhmer(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateKhmer() {} + + protected: + // Returns whether codes matches the pattern for an Khmer Grapheme. + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch. + CharClass UnicodeToCharClass(char32 ch) const override; +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_ diff --git a/training/validate_myanmar.cpp b/src/training/validate_myanmar.cpp similarity index 97% rename from training/validate_myanmar.cpp rename to src/training/validate_myanmar.cpp index 44934690232fad05fb2e1f5556069d4354d0c81f..3e822e6612d65a309e380c6c0ce4aec32367fdf8 100644 --- a/training/validate_myanmar.cpp +++ b/src/training/validate_myanmar.cpp @@ -1,160 +1,160 @@ -#include "validate_myanmar.h" -#include "errcode.h" -#include "icuerrorcode.h" -#include "tprintf.h" -#include "unicode/uchar.h" // From libicu -#include "unicode/uscript.h" // From libicu - -namespace tesseract { - -// Returns whether codes matches the pattern for a Myanmar Grapheme. -// Taken directly from the unicode table 16-3. -// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf -bool ValidateMyanmar::ConsumeGraphemeIfValid() { - int num_codes = codes_.size(); - if (codes_used_ == num_codes) return true; - // Other. - if (IsMyanmarOther(codes_[codes_used_].second)) { - UseMultiCode(1); - return true; - } - // Kinzi. 
- if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 && - codes_[codes_used_ + 1].second == kMyanmarAsat && - codes_[codes_used_ + 2].second == kMyanmarVirama) { - ASSERT_HOST(!CodeOnlyToOutput()); - ASSERT_HOST(!CodeOnlyToOutput()); - if (UseMultiCode(3)) return true; - } - // Base consonant/vowel. NOTE that since everything in Myanmar appears to be - // optional, except the base, this is the only place where invalid input can - // be detected and false returned. - if (IsMyanmarLetter(codes_[codes_used_].second)) { - if (UseMultiCode(1)) return true; - } else { - if (report_errors_) { - tprintf("Invalid start of Myanmar syllable:0x%x\n", - codes_[codes_used_].second); - } - return false; // One of these is required. - } - if (ConsumeSubscriptIfPresent()) return true; - ConsumeOptionalSignsIfPresent(); - // What we have consumed so far is a valid syllable. - return true; -} - -// TODO(rays) Doesn't use intermediate coding like the other scripts, as there -// is little correspondence between the content of table 16-3 and the char -// classes of the Indic languages. (Experts may disagree and improve!) -// In unicode table 16-3 there is basically a long list of optional characters, -// which can be coded quite easily. -// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!! -// The table also allows sequences that still result in dotted circles!! -// So with a lot of guesswork the rest have been added in a reasonable place. -Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const { - if (IsMyanmarLetter(ch)) return CharClass::kConsonant; - return CharClass::kOther; -} - -// Helper consumes/copies a virama and any subscript consonant. -// Returns true if the end of input is reached. -bool ValidateMyanmar::ConsumeSubscriptIfPresent() { - // Subscript consonant. It appears there can be only one. 
- int num_codes = codes_.size(); - if (codes_used_ + 1 < num_codes && - codes_[codes_used_].second == kMyanmarVirama) { - if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) { - ASSERT_HOST(!CodeOnlyToOutput()); - if (UseMultiCode(2)) return true; - } - } - return false; -} - -// Helper consumes/copies a series of optional signs. -// Returns true if the end of input is reached. -bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() { - // The following characters are allowed, all optional, and in sequence. - // An exception is kMyanmarMedialYa, which can include kMyanmarAsat. - const std::vector kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c, - 0x103d, 0x103e, 0x105e, 0x105f, 0x1060, - 0x1081, 0x1031}); - for (char32 ch : kMedials) { - if (codes_[codes_used_].second == ch) { - if (UseMultiCode(1)) return true; - if (ch == kMyanmarMedialYa && - codes_[codes_used_].second == kMyanmarAsat) { - if (UseMultiCode(1)) return true; - } - } - } - // Vowel sign i, ii, ai. - char32 ch = codes_[codes_used_].second; - if (ch == 0x102d || ch == 0x102e || ch == 0x1032) { - if (UseMultiCode(1)) return true; - } - // Vowel sign u, uu, and extensions. - ch = codes_[codes_used_].second; - if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) || - ch == 0x1062 || ch == 0x1067 || ch == 0x1068 || - (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) || - ch == 0x109c || ch == 0x109d) { - if (UseMultiCode(1)) return true; - } - // Tall aa, aa with optional asat. - if (codes_[codes_used_].second == 0x102b || - codes_[codes_used_].second == 0x102c) { - if (UseMultiCode(1)) return true; - if (codes_[codes_used_].second == kMyanmarAsat) { - if (UseMultiCode(1)) return true; - } - } - // The following characters are allowed, all optional, and in sequence. - const std::vector kSigns({0x1036, 0x1037}); - for (char32 ch : kSigns) { - if (codes_[codes_used_].second == ch) { - if (UseMultiCode(1)) return true; - } - } - // Tone mark extensions. 
- ch = codes_[codes_used_].second; - if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 || - (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) || - ch == 0x108f || ch == 0x109a || ch == 0x109b || - (0xaa7b <= ch && ch <= 0xaa7d)) { - if (UseMultiCode(1)) return true; - } - return false; -} - -// Returns true if the unicode is a Myanmar "letter" including consonants -// and independent vowels. Although table 16-3 distinguishes between some -// base consonants and vowels, the extensions make no such distinction, so we -// put them all into a single bucket. -/* static */ -bool ValidateMyanmar::IsMyanmarLetter(char32 ch) { - return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f || - (0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) || - ch == 0x1061 || ch == 0x1065 || ch == 0x1066 || - (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) || - ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) || - (0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) || - ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f; -} - -// Returns true if ch is a Myanmar digit or other symbol that does not take -// part in being a syllable. -/* static */ -bool ValidateMyanmar::IsMyanmarOther(char32 ch) { - IcuErrorCode err; - UScriptCode script_code = uscript_getScript(ch, err); - if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner && - ch != Validator::kZeroWidthNonJoiner) - return true; - return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) || - (0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) || - (0xaa74 <= ch && ch <= 0xaa79); -} - -} // namespace tesseract +#include "validate_myanmar.h" +#include "errcode.h" +#include "icuerrorcode.h" +#include "tprintf.h" +#include "unicode/uchar.h" // From libicu +#include "unicode/uscript.h" // From libicu + +namespace tesseract { + +// Returns whether codes matches the pattern for a Myanmar Grapheme. 
+// Taken directly from the unicode table 16-3. +// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf +bool ValidateMyanmar::ConsumeGraphemeIfValid() { + int num_codes = codes_.size(); + if (codes_used_ == num_codes) return true; + // Other. + if (IsMyanmarOther(codes_[codes_used_].second)) { + UseMultiCode(1); + return true; + } + // Kinzi. + if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 && + codes_[codes_used_ + 1].second == kMyanmarAsat && + codes_[codes_used_ + 2].second == kMyanmarVirama) { + ASSERT_HOST(!CodeOnlyToOutput()); + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(3)) return true; + } + // Base consonant/vowel. NOTE that since everything in Myanmar appears to be + // optional, except the base, this is the only place where invalid input can + // be detected and false returned. + if (IsMyanmarLetter(codes_[codes_used_].second)) { + if (UseMultiCode(1)) return true; + } else { + if (report_errors_) { + tprintf("Invalid start of Myanmar syllable:0x%x\n", + codes_[codes_used_].second); + } + return false; // One of these is required. + } + if (ConsumeSubscriptIfPresent()) return true; + ConsumeOptionalSignsIfPresent(); + // What we have consumed so far is a valid syllable. + return true; +} + +// TODO(rays) Doesn't use intermediate coding like the other scripts, as there +// is little correspondence between the content of table 16-3 and the char +// classes of the Indic languages. (Experts may disagree and improve!) +// In unicode table 16-3 there is basically a long list of optional characters, +// which can be coded quite easily. +// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!! +// The table also allows sequences that still result in dotted circles!! +// So with a lot of guesswork the rest have been added in a reasonable place. 
+Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const { + if (IsMyanmarLetter(ch)) return CharClass::kConsonant; + return CharClass::kOther; +} + +// Helper consumes/copies a virama and any subscript consonant. +// Returns true if the end of input is reached. +bool ValidateMyanmar::ConsumeSubscriptIfPresent() { + // Subscript consonant. It appears there can be only one. + int num_codes = codes_.size(); + if (codes_used_ + 1 < num_codes && + codes_[codes_used_].second == kMyanmarVirama) { + if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) { + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(2)) return true; + } + } + return false; +} + +// Helper consumes/copies a series of optional signs. +// Returns true if the end of input is reached. +bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() { + // The following characters are allowed, all optional, and in sequence. + // An exception is kMyanmarMedialYa, which can include kMyanmarAsat. + const std::vector kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c, + 0x103d, 0x103e, 0x105e, 0x105f, 0x1060, + 0x1081, 0x1031}); + for (char32 ch : kMedials) { + if (codes_[codes_used_].second == ch) { + if (UseMultiCode(1)) return true; + if (ch == kMyanmarMedialYa && + codes_[codes_used_].second == kMyanmarAsat) { + if (UseMultiCode(1)) return true; + } + } + } + // Vowel sign i, ii, ai. + char32 ch = codes_[codes_used_].second; + if (ch == 0x102d || ch == 0x102e || ch == 0x1032) { + if (UseMultiCode(1)) return true; + } + // Vowel sign u, uu, and extensions. + ch = codes_[codes_used_].second; + if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) || + ch == 0x1062 || ch == 0x1067 || ch == 0x1068 || + (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) || + ch == 0x109c || ch == 0x109d) { + if (UseMultiCode(1)) return true; + } + // Tall aa, aa with optional asat. 
+ if (codes_[codes_used_].second == 0x102b || + codes_[codes_used_].second == 0x102c) { + if (UseMultiCode(1)) return true; + if (codes_[codes_used_].second == kMyanmarAsat) { + if (UseMultiCode(1)) return true; + } + } + // The following characters are allowed, all optional, and in sequence. + const std::vector kSigns({0x1036, 0x1037}); + for (char32 ch : kSigns) { + if (codes_[codes_used_].second == ch) { + if (UseMultiCode(1)) return true; + } + } + // Tone mark extensions. + ch = codes_[codes_used_].second; + if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 || + (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) || + ch == 0x108f || ch == 0x109a || ch == 0x109b || + (0xaa7b <= ch && ch <= 0xaa7d)) { + if (UseMultiCode(1)) return true; + } + return false; +} + +// Returns true if the unicode is a Myanmar "letter" including consonants +// and independent vowels. Although table 16-3 distinguishes between some +// base consonants and vowels, the extensions make no such distinction, so we +// put them all into a single bucket. +/* static */ +bool ValidateMyanmar::IsMyanmarLetter(char32 ch) { + return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f || + (0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) || + ch == 0x1061 || ch == 0x1065 || ch == 0x1066 || + (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) || + ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) || + (0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) || + ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f; +} + +// Returns true if ch is a Myanmar digit or other symbol that does not take +// part in being a syllable. 
+/* static */ +bool ValidateMyanmar::IsMyanmarOther(char32 ch) { + IcuErrorCode err; + UScriptCode script_code = uscript_getScript(ch, err); + if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner && + ch != Validator::kZeroWidthNonJoiner) + return true; + return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) || + (0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) || + (0xaa74 <= ch && ch <= 0xaa79); +} + +} // namespace tesseract diff --git a/training/validate_myanmar.h b/src/training/validate_myanmar.h similarity index 97% rename from training/validate_myanmar.h rename to src/training/validate_myanmar.h index d2ada745059774a618842ff609be34df00fbd0d4..97cb4800075c78cd1507856769faf326fde9d809 100644 --- a/training/validate_myanmar.h +++ b/src/training/validate_myanmar.h @@ -1,47 +1,47 @@ -#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ -#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ - -#include "validator.h" - -namespace tesseract { - -// Subclass of Validator that validates and segments Myanmar. -class ValidateMyanmar : public Validator { - public: - ValidateMyanmar(ViramaScript script, bool report_errors) - : Validator(script, report_errors) {} - ~ValidateMyanmar() {} - - protected: - // Returns whether codes matches the pattern for a Myanmar Grapheme. - // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to - // parts_ and output_. Returns true if a valid Grapheme was consumed, - // otherwise does not increment codes_used_. - bool ConsumeGraphemeIfValid() override; - // Returns the CharClass corresponding to the given Unicode ch. - Validator::CharClass UnicodeToCharClass(char32 ch) const override; - - private: - // Helper consumes/copies a virama and any subscript consonant. - // Returns true if the end of input is reached. - bool ConsumeSubscriptIfPresent(); - // Helper consumes/copies a series of optional signs. - // Returns true if the end of input is reached. 
- bool ConsumeOptionalSignsIfPresent(); - // Returns true if the unicode is a Myanmar "letter" including consonants - // and independent vowels. Although table 16-3 distinguishes between some - // base consonants and vowels, the extensions make no such distinction, so we - // put them all into a single bucket. - static bool IsMyanmarLetter(char32 ch); - // Returns true if ch is a Myanmar digit or other symbol that does not take - // part in being a syllable. - static bool IsMyanmarOther(char32 ch); - - // Some special unicodes used only for Myanmar processing. - static const char32 kMyanmarAsat = 0x103a; - static const char32 kMyanmarMedialYa = 0x103b; -}; - -} // namespace tesseract - -#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ +#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ +#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments Myanmar. +class ValidateMyanmar : public Validator { + public: + ValidateMyanmar(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateMyanmar() {} + + protected: + // Returns whether codes matches the pattern for a Myanmar Grapheme. + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch. + Validator::CharClass UnicodeToCharClass(char32 ch) const override; + + private: + // Helper consumes/copies a virama and any subscript consonant. + // Returns true if the end of input is reached. + bool ConsumeSubscriptIfPresent(); + // Helper consumes/copies a series of optional signs. + // Returns true if the end of input is reached. 
+ bool ConsumeOptionalSignsIfPresent(); + // Returns true if the unicode is a Myanmar "letter" including consonants + // and independent vowels. Although table 16-3 distinguishes between some + // base consonants and vowels, the extensions make no such distinction, so we + // put them all into a single bucket. + static bool IsMyanmarLetter(char32 ch); + // Returns true if ch is a Myanmar digit or other symbol that does not take + // part in being a syllable. + static bool IsMyanmarOther(char32 ch); + + // Some special unicodes used only for Myanmar processing. + static const char32 kMyanmarAsat = 0x103a; + static const char32 kMyanmarMedialYa = 0x103b; +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ diff --git a/training/validator.cpp b/src/training/validator.cpp similarity index 100% rename from training/validator.cpp rename to src/training/validator.cpp diff --git a/training/validator.h b/src/training/validator.h similarity index 97% rename from training/validator.h rename to src/training/validator.h index 6b21daa911f47174cf05557b659a07d6c6a14342..0289544fbeb93b99a66936d82c1e4ac6ebab9e03 100644 --- a/training/validator.h +++ b/src/training/validator.h @@ -1,243 +1,243 @@ -/********************************************************************** - * File: validator.h - * Description: Base class for various text validators. Intended mainly for - * scripts that use a virama character. - * Author: Ray Smith - * Created: Tue May 23 2017 - * - * (C) Copyright 2017, Google Inc. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - * - **********************************************************************/ - -#ifndef TESSERACT_TRAINING_VALIDATOR_H_ -#define TESSERACT_TRAINING_VALIDATOR_H_ - -#include -#include -#include "unichar.h" - -namespace tesseract { - -// Different kinds of grapheme normalization - not just for Indic! -// A grapheme is a syllable unit in Indic and can be several unicodes. -// In other scripts, a grapheme is a base character and accent/diacritic -// combination, as not all accented characters have a single composed form. -enum class GraphemeNormMode { - // Validation result is a single string, even if input is multi-word. - kSingleString, - // Standard unicode graphemes are validated and output as grapheme units. - kCombined, - // Graphemes are validated and sub-divided. For virama-using scripts, units - // that correspond to repeatable glyphs are generated. (Mostly single unicodes - // but viramas and joiners are paired with the most sensible neighbor.) - // For non-virama scripts, this means that base/accent pairs are separated, - // ie the output is individual unicodes. - kGlyphSplit, - // The output is always single unicodes, regardless of the script. - kIndividualUnicodes, -}; - -// An enum representing the scripts that use a virama character. It is -// guaranteed that the value of any element, (except kNonVirama) can be cast -// to a unicode (char32) value that represents the start of the unicode range -// of the corresponding script. 
-enum class ViramaScript : char32 { - kNonVirama = 0, - kDevanagari = 0x900, - kBengali = 0x980, - kGurmukhi = 0xa00, - kGujarati = 0xa80, - kOriya = 0xb00, - kTamil = 0xb80, - kTelugu = 0xc00, - kKannada = 0xc80, - kMalayalam = 0xd00, - kSinhala = 0xd80, - kMyanmar = 0x1000, - kKhmer = 0x1780, -}; - -// Base class offers a validation API and protected methods to allow subclasses -// to easily build the validated/segmented output. -class Validator { - public: - // Validates and cleans the src vector of unicodes to the *dest, according to - // g_mode. In the case of kSingleString, a single vector containing the whole - // result is added to *dest. With kCombined, multiple vectors are added to - // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are - // added to *dest with a smaller unit representing a glyph in each. - // In case of validation error, returns false and as much as possible of the - // input, without discarding invalid text. - static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, - bool report_errors, - const std::vector& src, - std::vector>* dest); - - // Returns true if the unicode ch is a non-printing zero-width mark of no - // significance to OCR training or evaluation. - static bool IsZeroWidthMark(char32 ch) { - return ch == kZeroWidthSpace || ch == kLeftToRightMark || - ch == kRightToLeftMark || ch == kInvalid; - } - virtual ~Validator() {} - - // Some specific but universally useful unicodes. - static const char32 kZeroWidthSpace; - static const char32 kZeroWidthNonJoiner; - static const char32 kZeroWidthJoiner; - static const char32 kLeftToRightMark; - static const char32 kRightToLeftMark; - static const char32 kInvalid; - - protected: - // These are more or less the character class identifiers in the ISCII - // standard, section 8. They have been augmented with the Unicode meta - // characters Zero Width Joiner and Zero Width Non Joiner, and the - // Unicode Vedic Marks. 
- // The best sources of information on Unicode and Indic scripts are: - // http://varamozhi.sourceforge.net/iscii91.pdf - // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf - // http://unicode.org/faq/indic.html - // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx - enum class CharClass { - // NOTE: The values of the enum members are meaningless and arbitrary, ie - // they are not used for sorting, or any other risky application. - // The reason they are what they are is they are a single character - // abbreviation that can be used in a regexp/BNF definition of a grammar, - // IN A COMMENT, and still not relied upon in the code. - kConsonant = 'C', - kVowel = 'V', - kVirama = 'H', // (aka Halant) - kMatra = 'M', // (aka Dependent Vowel) - kMatraPiece = 'P', // unicode provides pieces of Matras. - kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks) - kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C - kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D - kVedicMark = 'v', // Modifiers can come modify any indic syllable. - kNukta = 'N', // Occurs only immediately after consonants. - kRobat = 'R', // Khmer only. - kOther = 'O', // (digits, measures, non-Indic, etc) - // Additional classes used only by ValidateGrapheme. - kWhitespace = ' ', - kCombiner = 'c', // Combiners other than virama. - }; - typedef std::pair IndicPair; - - Validator(ViramaScript script, bool report_errors) - : script_(script), - codes_used_(0), - output_used_(0), - report_errors_(report_errors) {} - - // Factory method that understands how to map script to the right subclass. - static std::unique_ptr ScriptValidator(ViramaScript script, - bool report_errors); - - // Internal version of the public static ValidateCleanAndSegment. - // Validates and cleans the src vector of unicodes to the *dest, according to - // its type and the given g_mode. 
- // In case of validation error, returns false and returns as much as possible - // of the input, without discarding invalid text. - bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, - const std::vector& src, - std::vector>* dest); - // Moves the results from parts_ or output_ to dest according to g_mode. - void MoveResultsToDest(GraphemeNormMode g_mode, - std::vector>* dest); - - // Computes and returns the ViramaScript corresponding to the most frequent - // virama-using script in the input, or kNonVirama if none are present. - static ViramaScript MostFrequentViramaScript( - const std::vector& utf32); - // Returns true if the given UTF-32 unicode is a "virama" character. - static bool IsVirama(char32 unicode); - // Returns true if the given UTF-32 unicode is a vedic accent. - static bool IsVedicAccent(char32 unicode); - // Returns true if the script is one that uses subscripts for conjuncts. - bool IsSubscriptScript() const; - - // Helper function appends the next element of codes_ only to output_, - // without touching parts_ - // Returns true at the end of codes_. - bool CodeOnlyToOutput() { - output_.push_back(codes_[codes_used_].second); - return ++codes_used_ == codes_.size(); - } - - // Helper function adds a length-element vector to parts_ from the last length - // elements of output_. If there are more than length unused elements in - // output_, adds unicodes as single-element vectors to parts_ to catch - // output_used_ up to output->size() - length before adding the length-element - // vector. 
- void MultiCodePart(int length) { - while (output_used_ + length < output_.size()) { - parts_.emplace_back( - std::initializer_list{output_[output_used_++]}); - } - parts_.emplace_back(std::initializer_list{output_[output_used_]}); - while (++output_used_ < output_.size()) { - parts_.back().push_back(output_[output_used_]); - } - } - - // Helper function appends the next element of codes_ to output_, and then - // calls MultiCodePart to add the appropriate components to parts_. - // Returns true at the end of codes_. - bool UseMultiCode(int length) { - output_.push_back(codes_[codes_used_].second); - MultiCodePart(length); - return ++codes_used_ == codes_.size(); - } - - // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to - // parts_ and output_. Returns true if a valid Grapheme was consumed, - // otherwise does not increment codes_used_. - virtual bool ConsumeGraphemeIfValid() = 0; - // Sets codes_ to the class codes for the given unicode text. - void ComputeClassCodes(const std::vector& text); - // Returns the CharClass corresponding to the given Unicode ch. - virtual CharClass UnicodeToCharClass(char32 ch) const = 0; - // Resets to the initial state. - void Clear(); - - // Number of unicodes in each Indic codepage. - static const int kIndicCodePageSize = 128; - // Lowest unicode value of any Indic script. (Devanagari). - static const char32 kMinIndicUnicode = 0x900; - // Highest unicode value of any consistent (ISCII-based) Indic script. - static const char32 kMaxSinhalaUnicode = 0xdff; - // Highest unicode value of any virama-using script. (Khmer). - static const char32 kMaxViramaScriptUnicode = 0x17ff; - // Some special unicodes. - static const char32 kSinhalaVirama = 0xdca; - static const char32 kMyanmarVirama = 0x1039; - static const char32 kKhmerVirama = 0x17d2; - - // Script we are operating on. - ViramaScript script_; - // Input unicodes with assigned CharClass is the data to be validated. 
- std::vector codes_; - // Glyph-like components of the input. - std::vector> parts_; - // Copied validated unicodes from codes_ that are OK to output. - std::vector output_; - // The number of elements of codes_ that have been processed so far. - int codes_used_; - // The number of elements of output_ that have already been added to parts_. - int output_used_; - // Log error messages for reasons why text is invalid. - bool report_errors_; -}; - -} // namespace tesseract - -#endif // TESSERACT_TRAINING_VALIDATOR_H_ +/********************************************************************** + * File: validator.h + * Description: Base class for various text validators. Intended mainly for + * scripts that use a virama character. + * Author: Ray Smith + * Created: Tue May 23 2017 + * + * (C) Copyright 2017, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#ifndef TESSERACT_TRAINING_VALIDATOR_H_ +#define TESSERACT_TRAINING_VALIDATOR_H_ + +#include +#include +#include "unichar.h" + +namespace tesseract { + +// Different kinds of grapheme normalization - not just for Indic! +// A grapheme is a syllable unit in Indic and can be several unicodes. +// In other scripts, a grapheme is a base character and accent/diacritic +// combination, as not all accented characters have a single composed form. 
+enum class GraphemeNormMode { + // Validation result is a single string, even if input is multi-word. + kSingleString, + // Standard unicode graphemes are validated and output as grapheme units. + kCombined, + // Graphemes are validated and sub-divided. For virama-using scripts, units + // that correspond to repeatable glyphs are generated. (Mostly single unicodes + // but viramas and joiners are paired with the most sensible neighbor.) + // For non-virama scripts, this means that base/accent pairs are separated, + // ie the output is individual unicodes. + kGlyphSplit, + // The output is always single unicodes, regardless of the script. + kIndividualUnicodes, +}; + +// An enum representing the scripts that use a virama character. It is +// guaranteed that the value of any element, (except kNonVirama) can be cast +// to a unicode (char32) value that represents the start of the unicode range +// of the corresponding script. +enum class ViramaScript : char32 { + kNonVirama = 0, + kDevanagari = 0x900, + kBengali = 0x980, + kGurmukhi = 0xa00, + kGujarati = 0xa80, + kOriya = 0xb00, + kTamil = 0xb80, + kTelugu = 0xc00, + kKannada = 0xc80, + kMalayalam = 0xd00, + kSinhala = 0xd80, + kMyanmar = 0x1000, + kKhmer = 0x1780, +}; + +// Base class offers a validation API and protected methods to allow subclasses +// to easily build the validated/segmented output. +class Validator { + public: + // Validates and cleans the src vector of unicodes to the *dest, according to + // g_mode. In the case of kSingleString, a single vector containing the whole + // result is added to *dest. With kCombined, multiple vectors are added to + // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are + // added to *dest with a smaller unit representing a glyph in each. + // In case of validation error, returns false and as much as possible of the + // input, without discarding invalid text. 
+ static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, + bool report_errors, + const std::vector& src, + std::vector>* dest); + + // Returns true if the unicode ch is a non-printing zero-width mark of no + // significance to OCR training or evaluation. + static bool IsZeroWidthMark(char32 ch) { + return ch == kZeroWidthSpace || ch == kLeftToRightMark || + ch == kRightToLeftMark || ch == kInvalid; + } + virtual ~Validator() {} + + // Some specific but universally useful unicodes. + static const char32 kZeroWidthSpace; + static const char32 kZeroWidthNonJoiner; + static const char32 kZeroWidthJoiner; + static const char32 kLeftToRightMark; + static const char32 kRightToLeftMark; + static const char32 kInvalid; + + protected: + // These are more or less the character class identifiers in the ISCII + // standard, section 8. They have been augmented with the Unicode meta + // characters Zero Width Joiner and Zero Width Non Joiner, and the + // Unicode Vedic Marks. + // The best sources of information on Unicode and Indic scripts are: + // http://varamozhi.sourceforge.net/iscii91.pdf + // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf + // http://unicode.org/faq/indic.html + // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx + enum class CharClass { + // NOTE: The values of the enum members are meaningless and arbitrary, ie + // they are not used for sorting, or any other risky application. + // The reason they are what they are is they are a single character + // abbreviation that can be used in a regexp/BNF definition of a grammar, + // IN A COMMENT, and still not relied upon in the code. + kConsonant = 'C', + kVowel = 'V', + kVirama = 'H', // (aka Halant) + kMatra = 'M', // (aka Dependent Vowel) + kMatraPiece = 'P', // unicode provides pieces of Matras. 
+ kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks) + kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C + kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D + kVedicMark = 'v', // Modifiers can come modify any indic syllable. + kNukta = 'N', // Occurs only immediately after consonants. + kRobat = 'R', // Khmer only. + kOther = 'O', // (digits, measures, non-Indic, etc) + // Additional classes used only by ValidateGrapheme. + kWhitespace = ' ', + kCombiner = 'c', // Combiners other than virama. + }; + typedef std::pair IndicPair; + + Validator(ViramaScript script, bool report_errors) + : script_(script), + codes_used_(0), + output_used_(0), + report_errors_(report_errors) {} + + // Factory method that understands how to map script to the right subclass. + static std::unique_ptr ScriptValidator(ViramaScript script, + bool report_errors); + + // Internal version of the public static ValidateCleanAndSegment. + // Validates and cleans the src vector of unicodes to the *dest, according to + // its type and the given g_mode. + // In case of validation error, returns false and returns as much as possible + // of the input, without discarding invalid text. + bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, + const std::vector& src, + std::vector>* dest); + // Moves the results from parts_ or output_ to dest according to g_mode. + void MoveResultsToDest(GraphemeNormMode g_mode, + std::vector>* dest); + + // Computes and returns the ViramaScript corresponding to the most frequent + // virama-using script in the input, or kNonVirama if none are present. + static ViramaScript MostFrequentViramaScript( + const std::vector& utf32); + // Returns true if the given UTF-32 unicode is a "virama" character. + static bool IsVirama(char32 unicode); + // Returns true if the given UTF-32 unicode is a vedic accent. 
+ static bool IsVedicAccent(char32 unicode); + // Returns true if the script is one that uses subscripts for conjuncts. + bool IsSubscriptScript() const; + + // Helper function appends the next element of codes_ only to output_, + // without touching parts_ + // Returns true at the end of codes_. + bool CodeOnlyToOutput() { + output_.push_back(codes_[codes_used_].second); + return ++codes_used_ == codes_.size(); + } + + // Helper function adds a length-element vector to parts_ from the last length + // elements of output_. If there are more than length unused elements in + // output_, adds unicodes as single-element vectors to parts_ to catch + // output_used_ up to output->size() - length before adding the length-element + // vector. + void MultiCodePart(int length) { + while (output_used_ + length < output_.size()) { + parts_.emplace_back( + std::initializer_list{output_[output_used_++]}); + } + parts_.emplace_back(std::initializer_list{output_[output_used_]}); + while (++output_used_ < output_.size()) { + parts_.back().push_back(output_[output_used_]); + } + } + + // Helper function appends the next element of codes_ to output_, and then + // calls MultiCodePart to add the appropriate components to parts_. + // Returns true at the end of codes_. + bool UseMultiCode(int length) { + output_.push_back(codes_[codes_used_].second); + MultiCodePart(length); + return ++codes_used_ == codes_.size(); + } + + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + virtual bool ConsumeGraphemeIfValid() = 0; + // Sets codes_ to the class codes for the given unicode text. + void ComputeClassCodes(const std::vector& text); + // Returns the CharClass corresponding to the given Unicode ch. + virtual CharClass UnicodeToCharClass(char32 ch) const = 0; + // Resets to the initial state. 
+ void Clear(); + + // Number of unicodes in each Indic codepage. + static const int kIndicCodePageSize = 128; + // Lowest unicode value of any Indic script. (Devanagari). + static const char32 kMinIndicUnicode = 0x900; + // Highest unicode value of any consistent (ISCII-based) Indic script. + static const char32 kMaxSinhalaUnicode = 0xdff; + // Highest unicode value of any virama-using script. (Khmer). + static const char32 kMaxViramaScriptUnicode = 0x17ff; + // Some special unicodes. + static const char32 kSinhalaVirama = 0xdca; + static const char32 kMyanmarVirama = 0x1039; + static const char32 kKhmerVirama = 0x17d2; + + // Script we are operating on. + ViramaScript script_; + // Input unicodes with assigned CharClass is the data to be validated. + std::vector codes_; + // Glyph-like components of the input. + std::vector> parts_; + // Copied validated unicodes from codes_ that are OK to output. + std::vector output_; + // The number of elements of codes_ that have been processed so far. + int codes_used_; + // The number of elements of output_ that have already been added to parts_. + int output_used_; + // Log error messages for reasons why text is invalid. + bool report_errors_; +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATOR_H_ diff --git a/training/wordlist2dawg.cpp b/src/training/wordlist2dawg.cpp similarity index 100% rename from training/wordlist2dawg.cpp rename to src/training/wordlist2dawg.cpp