diff --git a/api/baseapi.cpp b/api/baseapi.cpp index f92f00fc4e47d940ad87dd2d64c6e2c12fe83487..5eefb1361fc1e2e07f7665f82e0de994404c4e7c 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -514,9 +514,9 @@ PageIterator* TessBaseAPI::AnalyseLayout() { if (block_list_->empty()) return NULL; // The page was empty. page_res_ = new PAGE_RES(block_list_, NULL); - // TODO(rays) Support transmission of image scaling and resolution through - // ImageThresholder, so it can be used here in place of literal 1, 300. - return new PageIterator(page_res_, tesseract_, 1, 300, + return new PageIterator(page_res_, tesseract_, + thresholder_->GetScaleFactor(), + thresholder_->GetScaledYResolution(), rect_left_, rect_top_, rect_width_, rect_height_); } return NULL; @@ -798,9 +798,9 @@ bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, ResultIterator* TessBaseAPI::GetIterator() { if (tesseract_ == NULL || page_res_ == NULL) return NULL; - // TODO(rays) Support transmission of image scaling and resolution through - // ImageThresholder, so it can be used here in place of literal 1, 300. - return new ResultIterator(page_res_, tesseract_, 1, 300, + return new ResultIterator(page_res_, tesseract_, + thresholder_->GetScaleFactor(), + thresholder_->GetScaledYResolution(), rect_left_, rect_top_, rect_width_, rect_height_); } @@ -952,17 +952,15 @@ char* TessBaseAPI::GetHOCRText(int page_number) { hocr_str += ""; if (word->italic > 0) hocr_str += ""; - int i; + int i; // escape special characters - for (i = 0; - choice->unichar_string()[i] != '\0'; - i++) { - if (choice->unichar_string()[i] == '<') { hocr_str += "<"; } - else if (choice->unichar_string()[i] == '>') { hocr_str += ">"; } - else if (choice->unichar_string()[i] == '&') { hocr_str += "&"; } - else if (choice->unichar_string()[i] == '"') { hocr_str += """; } - else if (choice->unichar_string()[i] == '\'') { hocr_str += "'"; } - else { hocr_str += choice->unichar_string()[i]; } + for (i = 0; choice->unichar_string()[i] != '\0'; i++) { + if (choice->unichar_string()[i] == '<') hocr_str += "<"; + else if (choice->unichar_string()[i] == '>') hocr_str += ">"; + else if (choice->unichar_string()[i] == '&') hocr_str += "&"; + else if (choice->unichar_string()[i] == '"') hocr_str += """; + else if (choice->unichar_string()[i] == '\'') hocr_str += "'"; + else hocr_str += choice->unichar_string()[i]; } if (word->italic > 0) hocr_str += ""; @@ -973,7 +971,7 @@ char* TessBaseAPI::GetHOCRText(int page_number) { hocr_str += " "; } } - if (block != NULL) + if (block != NULL) hocr_str += "\n

\n\n"; hocr_str += "\n"; @@ -1206,6 +1204,7 @@ bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) { bool success = true; PageSegMode current_psm = GetPageSegMode(); SetPageSegMode(mode); + SetVariable("classify_enable_learning", "0"); char* text = GetUTF8Text(); if (text != NULL) { PAGE_RES_IT it(page_res_); diff --git a/api/resultiterator.cpp b/api/resultiterator.cpp index 6c9c1b6ed118792e860d8b2fd2815630c80100ea..83665167a9c4bb619d6d11b384488286fa1b34c5 100644 --- a/api/resultiterator.cpp +++ b/api/resultiterator.cpp @@ -153,6 +153,7 @@ const char* ResultIterator::WordFontAttributes(bool* is_bold, bool* is_underlined, bool* is_monospace, bool* is_serif, + bool* is_smallcaps, int* pointsize, int* font_id) const { if (it_->word() == NULL) return NULL; // Already at the end! @@ -165,6 +166,7 @@ const char* ResultIterator::WordFontAttributes(bool* is_bold, *is_underlined = false; // TODO(rays) fix this! *is_monospace = font_info.is_fixed_pitch(); *is_serif = font_info.is_serif(); + *is_smallcaps = it_->word()->small_caps; // The font size is calculated from a multiple of the x-height // that came from the block. float row_height = it_->row()->row->x_height() * @@ -192,6 +194,33 @@ bool ResultIterator::WordIsNumeric() const { return permuter == NUMBER_PERM; } +// Returns true if the current symbol is a superscript. +// If iterating at a higher level object than symbols, eg words, then +// this will return the attributes of the first symbol in that word. +bool ResultIterator::SymbolIsSuperscript() const { + if (cblob_it_ == NULL && it_->word() != NULL) + return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT; + return false; +} + +// Returns true if the current symbol is a subscript. +// If iterating at a higher level object than symbols, eg words, then +// this will return the attributes of the first symbol in that word. +bool ResultIterator::SymbolIsSubscript() const { + if (cblob_it_ == NULL && it_->word() != NULL) + return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT; + return false; +} + +// Returns true if the current symbol is a dropcap. +// If iterating at a higher level object than symbols, eg words, then +// this will return the attributes of the first symbol in that word. +bool ResultIterator::SymbolIsDropcap() const { + if (cblob_it_ == NULL && it_->word() != NULL) + return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP; + return false; +} + ChoiceIterator::ChoiceIterator(const ResultIterator& result_it) { ASSERT_HOST(result_it.it_->word() != NULL); tesseract_ = result_it.tesseract_; diff --git a/api/resultiterator.h b/api/resultiterator.h index ba8957be377eff23e910260482fa899f7254c3a4..73ea57ca3804fc98f697d261076abe3de2373c76 100644 --- a/api/resultiterator.h +++ b/api/resultiterator.h @@ -97,6 +97,7 @@ class ResultIterator : public PageIterator { bool* is_underlined, bool* is_monospace, bool* is_serif, + bool* is_smallcaps, int* pointsize, int* font_id) const; @@ -105,6 +106,21 @@ class ResultIterator : public PageIterator { // Returns true if the current word is numeric. bool WordIsNumeric() const; + + // ============= Functions that refer to symbols only ============. + + // Returns true if the current symbol is a superscript. + // If iterating at a higher level object than symbols, eg words, then + // this will return the attributes of the first symbol in that word. + bool SymbolIsSuperscript() const; + // Returns true if the current symbol is a subscript. + // If iterating at a higher level object than symbols, eg words, then + // this will return the attributes of the first symbol in that word. + bool SymbolIsSubscript() const; + // Returns true if the current symbol is a dropcap. + // If iterating at a higher level object than symbols, eg words, then + // this will return the attributes of the first symbol in that word. + bool SymbolIsDropcap() const; }; // Class to iterate over the classifier choices for a single RIL_SYMBOL. diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index 1c15184697d95e5bde087fa553cb98c097f1294d..fc7834c20bfe68973836150d6cd6caa72a3e11b7 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -18,7 +18,7 @@ **********************************************************************/ #include "mfcpch.h" -//#define USE_VLD //Uncomment for Visual Leak Detector. +// #define USE_VLD //Uncomment for Visual Leak Detector. #if (defined _MSC_VER && defined USE_VLD) #include #endif @@ -178,9 +178,8 @@ int WINAPI WinMain( //main for windows //command line argsin[1] = strdup (lpszCmdLine); /*allocate memory for the args. There can never be more than half*/ /*the total number of characters in the arguments.*/ - argv = - (char **) malloc (((strlen (argsin[0]) + strlen (argsin[1])) / 2 + 1) * - sizeof (char *)); + argv = (char **)malloc(((strlen(argsin[0]) + strlen(argsin[1])) / 2 + 1) * + sizeof(char *)); /*now construct argv as it should be for C.*/ argc = parse_args (2, argsin, argv); diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp index 8aa7a3c9154b21cb50dc4623fe5791027622fa72..dce06cfed8b6be683498624a7084ae75d18c9c54 100644 --- a/ccmain/applybox.cpp +++ b/ccmain/applybox.cpp @@ -519,8 +519,26 @@ bool Tesseract::FindSegmentation(const GenericVector& target_text, for (int i = 0; i < word_length; ++i) choices[i].delete_data_pointers(); delete [] choices; - if (word_res->best_state.empty()) - return false; + if (word_res->best_state.empty()) { + // Build the original segmentation and if it is the same length as the + // truth, assume it will do. + int blob_count = 1; + for (int s = 0; s < array_count(word_res->seam_array); ++s) { + SEAM* seam = + reinterpret_cast(array_value(word_res->seam_array, s)); + if (seam->split1 == NULL) { + word_res->best_state.push_back(blob_count); + blob_count = 1; + } else { + ++blob_count; + } + } + word_res->best_state.push_back(blob_count); + if (word_res->best_state.size() != target_text.size()) { + word_res->best_state.clear(); // No good. Original segmentation bad size. + return false; + } + } word_res->correct_text.clear(); for (int i = 0; i < target_text.size(); ++i) { word_res->correct_text.push_back( diff --git a/ccmain/control.cpp b/ccmain/control.cpp index 9508620dbe1c2f0231dfad355f40333ace1daaf6..d895fbebefd1825f138f21e4988e04b7c0411dd9 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -569,7 +569,7 @@ static void SwitchWordOrDiscard(bool accept_new_word, WERD_RES* word, word->raw_choice = new_word->raw_choice; new_word->raw_choice = NULL; word->reject_map = new_word->reject_map; - word->done = new_word->done; + word->CopySimpleFields(*new_word); } else { // The new_word is no better, so destroy it and cleanup. new_word->ClearResults(); @@ -664,6 +664,26 @@ void Tesseract::classify_word_pass2(WERD_RES *word, BLOCK* block, ROW *row) { } if (accept_new_xht) done_this_pass = true; + // Test for small caps. Word capheight must be close to block xheight, + // and word must contain no lower case letters, and at least one upper case. + double small_cap_xheight = block->x_height() * kXHeightCapRatio; + double small_cap_delta = (block->x_height() - small_cap_xheight) / 2.0; + if (unicharset.script_has_xheight() && + small_cap_xheight - small_cap_delta <= word->x_height && + word->x_height <= small_cap_xheight + small_cap_delta) { + // Scan for upper/lower. + int num_upper = 0; + int num_lower = 0; + for (int i = 0; i < word->best_choice->length(); ++i) { + if (unicharset.get_isupper(word->best_choice->unichar_id(i))) + ++num_upper; + else if (unicharset.get_islower(word->best_choice->unichar_id(i))) + ++num_lower; + } + if (num_upper > 0 && num_lower == 0) + word->small_caps = true; + } + word->SetScriptPositions(unicharset); set_global_subloc_code(SUBLOC_NORM); } diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp index 814afb33e9bfd5477e531d4a4bb70d07539c3af6..6d9a2b3efb29da28d20bd0e423a4734d5bd6c856 100755 --- a/ccmain/pgedit.cpp +++ b/ccmain/pgedit.cpp @@ -76,7 +76,29 @@ enum CMD_EVENTS REFRESH_CMD_EVENT, QUIT_CMD_EVENT, RECOG_WERDS, - RECOG_PSEUDO + RECOG_PSEUDO, + SHOW_SUBSCRIPT_CMD_EVENT, + SHOW_SUPERSCRIPT_CMD_EVENT, + SHOW_ITALIC_CMD_EVENT, + SHOW_BOLD_CMD_EVENT, + SHOW_UNDERLINE_CMD_EVENT, + SHOW_FIXEDPITCH_CMD_EVENT, + SHOW_SERIF_CMD_EVENT, + SHOW_SMALLCAPS_CMD_EVENT, + SHOW_DROPCAPS_CMD_EVENT, +}; + +enum ColorationMode { + CM_RAINBOW, + CM_SUBSCRIPT, + CM_SUPERSCRIPT, + CM_ITALIC, + CM_BOLD, + CM_UNDERLINE, + CM_FIXEDPITCH, + CM_SERIF, + CM_SMALLCAPS, + CM_DROPCAPS }; /* @@ -99,6 +121,7 @@ CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT; // selected words op // These variables should remain global, since they are only used for the // debug mode (in which only a single Tesseract thread/instance will be exist). BITS16 word_display_mode; +static ColorationMode color_mode = CM_RAINBOW; BOOL8 display_image = FALSE; BOOL8 display_blocks = FALSE; BOOL8 display_baselines = FALSE; @@ -253,6 +276,16 @@ SVMenuNode *Tesseract::build_menu_new() { parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE); parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE); parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE); + parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT); + parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT); + parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT); + parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT); + parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT); + parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT); + parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT); + parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT); + parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT); + parent_menu = root_menu_item->AddChild("OTHER"); @@ -368,7 +401,8 @@ BOOL8 Tesseract::process_cmd_win_event( // UI command semantics char msg[160]; BOOL8 exit = FALSE; - switch(cmd_event) { + color_mode = CM_RAINBOW; + switch (cmd_event) { case NULL_CMD_EVENT: break; @@ -434,6 +468,42 @@ BOOL8 Tesseract::process_cmd_win_event( // UI command semantics display_baselines =(new_value[0] == 'T'); do_re_display(&tesseract::Tesseract::word_display); break; + case SHOW_SUBSCRIPT_CMD_EVENT: + color_mode = CM_SUBSCRIPT; + do_re_display(&tesseract::Tesseract::word_display); + break; + case SHOW_SUPERSCRIPT_CMD_EVENT: + color_mode = CM_SUPERSCRIPT; + do_re_display(&tesseract::Tesseract::word_display); + break; + case SHOW_ITALIC_CMD_EVENT: + color_mode = CM_ITALIC; + do_re_display(&tesseract::Tesseract::word_display); + break; + case SHOW_BOLD_CMD_EVENT: + color_mode = CM_BOLD; + do_re_display(&tesseract::Tesseract::word_display); + break; + case SHOW_UNDERLINE_CMD_EVENT: + color_mode = CM_UNDERLINE; + do_re_display(&tesseract::Tesseract::word_display); + break; + case SHOW_FIXEDPITCH_CMD_EVENT: + color_mode = CM_FIXEDPITCH; + do_re_display(&tesseract::Tesseract::word_display); + break; + case SHOW_SERIF_CMD_EVENT: + color_mode = CM_SERIF; + do_re_display(&tesseract::Tesseract::word_display); + break; + case SHOW_SMALLCAPS_CMD_EVENT: + color_mode = CM_SMALLCAPS; + do_re_display(&tesseract::Tesseract::word_display); + break; + case SHOW_DROPCAPS_CMD_EVENT: + color_mode = CM_DROPCAPS; + do_re_display(&tesseract::Tesseract::word_display); + break; case REFRESH_CMD_EVENT: do_re_display(&tesseract::Tesseract::word_display); break; @@ -649,11 +719,63 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) { float shift; // from bot left C_BLOB_IT c_it; // cblob iterator + if (color_mode != CM_RAINBOW && word_res->box_word != NULL) { + BoxWord* box_word = word_res->box_word; + int length = box_word->length(); + int font_id = word_res->font1; + if (font_id < 0) font_id = 0; + const UnicityTable &font_table = get_fontinfo_table(); + FontInfo font_info = font_table.get(font_id); + for (int i = 0; i < length; ++i) { + ScrollView::Color color = ScrollView::GREEN; + switch (color_mode) { + case CM_SUBSCRIPT: + if (box_word->BlobPosition(i) == SP_SUBSCRIPT) + color = ScrollView::RED; + break; + case CM_SUPERSCRIPT: + if (box_word->BlobPosition(i) == SP_SUPERSCRIPT) + color = ScrollView::RED; + break; + case CM_ITALIC: + if (font_info.is_italic()) + color = ScrollView::RED; + break; + case CM_BOLD: + if (font_info.is_bold()) + color = ScrollView::RED; + break; + case CM_FIXEDPITCH: + if (font_info.is_fixed_pitch()) + color = ScrollView::RED; + break; + case CM_SERIF: + if (font_info.is_serif()) + color = ScrollView::RED; + break; + case CM_SMALLCAPS: + if (word_res->small_caps) + color = ScrollView::RED; + break; + case CM_DROPCAPS: + if (box_word->BlobPosition(i) == SP_DROPCAP) + color = ScrollView::RED; + break; + // TODO(rays) underline is currently completely unsupported. + case CM_UNDERLINE: + default: + break; + } + image_win->Pen(color); + TBOX box = box_word->BlobBox(i); + image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top()); + } + return true; + } /* Note the double coercions of(COLOUR)((inT32)editor_image_word_bb_color) etc. are to keep the compiler happy. */ - // display bounding box if (word->display_flag(DF_BOX)) { word->bounding_box().plot(image_win, diff --git a/ccmain/thresholder.cpp b/ccmain/thresholder.cpp index e8d9807f3c85fd7fb99ccd5d576025bab60d3f77..f8ae740d1b8df1437c4175b6164f8346db130e9b 100644 --- a/ccmain/thresholder.cpp +++ b/ccmain/thresholder.cpp @@ -17,15 +17,7 @@ // /////////////////////////////////////////////////////////////////////// -// Include automatically generated configuration file if running autoconf. -#ifdef HAVE_CONFIG_H -#include "config_auto.h" -#endif - -#ifdef HAVE_LIBLEPT -// Include leptonica library only if autoconf (or makefile etc) tell us to. #include "allheaders.h" -#endif #include "thresholder.h" @@ -37,13 +29,11 @@ namespace tesseract { ImageThresholder::ImageThresholder() - : -#ifdef HAVE_LIBLEPT - pix_(NULL), -#endif + : pix_(NULL), image_data_(NULL), image_width_(0), image_height_(0), - image_bytespp_(0), image_bytespl_(0) { + image_bytespp_(0), image_bytespl_(0), + scale_(1), yres_(300) { SetRectangle(0, 0, 0, 0); } @@ -53,21 +43,17 @@ ImageThresholder::~ImageThresholder() { // Destroy the Pix if there is one, freeing memory. void ImageThresholder::Clear() { -#ifdef HAVE_LIBLEPT if (pix_ != NULL) { pixDestroy(&pix_); pix_ = NULL; } -#endif image_data_ = NULL; } // Return true if no image has been set. bool ImageThresholder::IsEmpty() const { -#ifdef HAVE_LIBLEPT if (pix_ != NULL) return false; -#endif return image_data_ == NULL; } @@ -84,16 +70,16 @@ bool ImageThresholder::IsEmpty() const { void ImageThresholder::SetImage(const unsigned char* imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line) { -#ifdef HAVE_LIBLEPT if (pix_ != NULL) pixDestroy(&pix_); pix_ = NULL; -#endif image_data_ = imagedata; image_width_ = width; image_height_ = height; image_bytespp_ = bytes_per_pixel; image_bytespl_ = bytes_per_line; + scale_ = 1; + yres_ = 300; Init(); } @@ -121,55 +107,6 @@ void ImageThresholder::GetImageSizes(int* left, int* top, *imageheight = image_height_; } -// Return true if HAVE_LIBLEPT and this thresholder implements the Pix -// interface. -bool ImageThresholder::HasThresholdToPix() const { -#ifdef HAVE_LIBLEPT - return true; -#else - return false; -#endif -} - -// Threshold the source image as efficiently as possible to the output -// tesseract IMAGE class. -void ImageThresholder::ThresholdToIMAGE(IMAGE* image) { -#ifdef HAVE_LIBLEPT - if (pix_ != NULL) { - if (image_bytespp_ == 0) { - // We have a binary image, so it just has to be converted. - CopyBinaryRectPixToIMAGE(image); - } else { - if (image_bytespp_ == 4) { - // Color data can just be passed direct. - const uinT32* data = pixGetData(pix_); - OtsuThresholdRectToIMAGE(reinterpret_cast(data), - image_bytespp_, image_bytespl_, image); - } else { - // Convert 8-bit to IMAGE and then pass its - // buffer to the raw interface to complete the conversion. - IMAGE temp_image; - temp_image.FromPix(pix_); - OtsuThresholdRectToIMAGE(temp_image.get_buffer(), - image_bytespp_, - COMPUTE_IMAGE_XDIM(temp_image.get_xsize(), - temp_image.get_bpp()), - image); - } - } - return; - } -#endif - if (image_bytespp_ > 0) { - // Threshold grey or color. - OtsuThresholdRectToIMAGE(image_data_, image_bytespp_, image_bytespl_, - image); - } else { - CopyBinaryRectRawToIMAGE(image); - } -} - -#ifdef HAVE_LIBLEPT // NOTE: Opposite to SetImage for raw images, SetImage for Pix clones its // input, so the source pix may be pixDestroyed immediately after. void ImageThresholder::SetImage(const Pix* pix) { @@ -191,6 +128,8 @@ void ImageThresholder::SetImage(const Pix* pix) { depth = pixGetDepth(pix_); image_bytespp_ = depth / 8; image_bytespl_ = pixGetWpl(pix_) * sizeof(l_uint32); + scale_ = 1; + yres_ = pixGetYRes(src); Init(); } @@ -275,74 +214,7 @@ Pix* ImageThresholder::GetPixRectGrey() { } return pix; } -#endif -// Otsu threshold the rectangle, taking everything except the image buffer -// pointer from the class, to the output IMAGE. -void ImageThresholder::OtsuThresholdRectToIMAGE(const unsigned char* imagedata, - int bytes_per_pixel, - int bytes_per_line, - IMAGE* image) const { - int* thresholds; - int* hi_values; - OtsuThreshold(imagedata, bytes_per_pixel, bytes_per_line, - rect_left_, rect_top_, rect_width_, rect_height_, - &thresholds, &hi_values); - - // Threshold the image to the given IMAGE. - ThresholdRectToIMAGE(imagedata, bytes_per_pixel, bytes_per_line, - thresholds, hi_values, image); - delete [] thresholds; - delete [] hi_values; -} - -// Threshold the given grey or color image into the tesseract global -// image ready for recognition. Requires thresholds and hi_value -// produced by OtsuThreshold in otsuthr.cpp. -void ImageThresholder::ThresholdRectToIMAGE(const unsigned char* imagedata, - int bytes_per_pixel, - int bytes_per_line, - const int* thresholds, - const int* hi_values, - IMAGE* image) const { - IMAGELINE line; - image->create(rect_width_, rect_height_, 1); - line.init(rect_width_); - // For each line in the image, fill the IMAGELINE class and put it into the - // output IMAGE. Note that Tesseract stores images with the - // bottom at y=0 and 0 is black, so we need 2 kinds of inversion. - const unsigned char* data = imagedata + rect_top_* bytes_per_line + - rect_left_ * bytes_per_pixel; - for (int y = rect_height_ - 1 ; y >= 0; --y) { - const unsigned char* pix = data; - for (int x = 0; x < rect_width_; ++x, pix += bytes_per_pixel) { - line.pixels[x] = 1; - for (int ch = 0; ch < bytes_per_pixel; ++ch) { - if (hi_values[ch] >= 0 && - (pix[ch] > thresholds[ch]) == (hi_values[ch] == 0)) { - line.pixels[x] = 0; - break; - } - } - } - image->put_line(0, y, rect_width_, &line, 0); - data += bytes_per_line; - } -} - -// Cut out the requested rectangle of the binary image to the output IMAGE. -void ImageThresholder::CopyBinaryRectRawToIMAGE(IMAGE* image) const { - IMAGE rect_image; - rect_image.capture(const_cast(image_data_), - image_width_, rect_top_ + rect_height_, 1); - image->create(rect_width_, rect_height_, 1); - // copy_sub_image uses coords starting at the bottom, so the y coord of the - // copy is the bottom of the rect_image. - copy_sub_image(&rect_image, rect_left_, 0, rect_width_, rect_height_, - image, 0, 0, false); -} - -#ifdef HAVE_LIBLEPT // Otsu threshold the rectangle, taking everything except the image buffer // pointer from the class, to the output Pix. void ImageThresholder::OtsuThresholdRectToPix(const unsigned char* imagedata, @@ -438,21 +310,5 @@ void ImageThresholder::RawRectToPix(Pix** pix) const { } } -// Cut out the requested rectangle of the binary image to the output IMAGE. -void ImageThresholder::CopyBinaryRectPixToIMAGE(IMAGE* image) const { - if (IsFullImage()) { - // Just poke it directly into the tess image. - image->FromPix(pix_); - } else { - // Crop to the given rectangle. - Box* box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_); - Pix* cropped = pixClipRectangle(pix_, box, NULL); - image->FromPix(cropped); - pixDestroy(&cropped); - boxDestroy(&box); - } -} -#endif - } // namespace tesseract. diff --git a/ccmain/thresholder.h b/ccmain/thresholder.h index 7d11f64528e6c450868d6d4188f9a6e9afab3752..7e21e259abc81498e51fd9c3f9c3d08aa03395d4 100644 --- a/ccmain/thresholder.h +++ b/ccmain/thresholder.h @@ -27,7 +27,7 @@ namespace tesseract { /// Base class for all tesseract image thresholding classes. /// Specific classes can add new thresholding methods by -/// overriding ThresholdToIMAGE and/or ThresholdToPix. +/// overriding ThresholdToPix. /// Each instance deals with a single image, but the design is intended to /// be useful for multiple calls to SetRectangle and ThresholdTo* if /// desired. @@ -66,10 +66,6 @@ class ImageThresholder { virtual void GetImageSizes(int* left, int* top, int* width, int* height, int* imagewidth, int* imageheight); - /// Return true if this thresholder implements the Pix - /// interface. - virtual bool HasThresholdToPix() const; - /// Return true if the source image is color. bool IsColor() const { return image_bytespp_ >= 3; @@ -80,9 +76,15 @@ class ImageThresholder { return image_bytespp_ == 0; } - /// Threshold the source image as efficiently as possible to the output - /// tesseract IMAGE class. - virtual void ThresholdToIMAGE(IMAGE* image); + int GetScaleFactor() const { + return scale_; + } + int GetSourceYResolution() const { + return yres_; + } + int GetScaledYResolution() const { + return scale_ * yres_; + } /// Pix vs raw, which to use? /// Implementations should provide the ability to source and target Pix @@ -126,23 +128,6 @@ class ImageThresholder { rect_width_ == image_width_ && rect_height_ == image_height_; } - /// Otsu threshold the rectangle, taking everything except the image buffer - /// pointer from the class, to the output IMAGE. - void OtsuThresholdRectToIMAGE(const unsigned char* imagedata, - int bytes_per_pixel, int bytes_per_line, - IMAGE* image) const; - - /// Threshold the rectangle, taking everything except the image buffer pointer - /// from the class, using thresholds/hi_values to the output IMAGE. - void ThresholdRectToIMAGE(const unsigned char* imagedata, - int bytes_per_pixel, int bytes_per_line, - const int* thresholds, const int* hi_values, - IMAGE* image) const; - - /// Cut out the requested rectangle of the source raw binary image to the - /// output IMAGE. - void CopyBinaryRectRawToIMAGE(IMAGE* image) const; - /// Otsu threshold the rectangle, taking everything except the image buffer /// pointer from the class, to the output Pix. void OtsuThresholdRectToPix(const unsigned char* imagedata, @@ -159,9 +144,6 @@ class ImageThresholder { /// Copy the raw image rectangle, taking all data from the class, to the Pix. void RawRectToPix(Pix** pix) const; - /// Cut out the requested rectangle of the binary image to the output IMAGE. - void CopyBinaryRectPixToIMAGE(IMAGE* image) const; - protected: /// Clone or other copy of the source Pix. /// The pix will always be PixDestroy()ed on destruction of the class. @@ -174,6 +156,8 @@ class ImageThresholder { int image_bytespp_; //< Bytes per pixel of source image/pix. int image_bytespl_; //< Bytes per line of source image/pix. // Limits of image rectangle to be processed. + int scale_; //< Scale factor from original image. + int yres_; //< y pixels/inch in source image int rect_left_; int rect_top_; int rect_width_; diff --git a/ccstruct/blobbox.cpp b/ccstruct/blobbox.cpp index f8dff991ab4bc8c2840fdaf577bd05daa189f49a..0fdb8ff4d80b092de9b54720dd4a8c1e82278bec 100644 --- a/ccstruct/blobbox.cpp +++ b/ccstruct/blobbox.cpp @@ -1081,4 +1081,4 @@ void plot_blob_list(ScrollView* win, // window to draw in it.data()->plot(win, body_colour, child_colour); } } -#endif //GRAPHICS_DISABLED +#endif // GRAPHICS_DISABLED diff --git a/ccstruct/blobbox.h b/ccstruct/blobbox.h index de5253634dab0e168667d20cf706530a34eb6b67..b6a7783f53fa636c38f25e145e9d9d5320c1246a 100644 --- a/ccstruct/blobbox.h +++ b/ccstruct/blobbox.h @@ -760,5 +760,5 @@ void plot_blob_list(ScrollView* win, // window to draw in BLOBNBOX_LIST *list, // blob list ScrollView::Color body_colour, // colour to draw ScrollView::Color child_colour); // colour of child -#endif //GRAPHICS_DISABLED +#endif // GRAPHICS_DISABLED #endif diff --git a/ccstruct/boxword.cpp b/ccstruct/boxword.cpp index a04efec756d4b101a8cf5f0bb7a7b49356d5bda8..a7c252944544a8fefa1df68316a0e9e60ef1dacf 100644 --- a/ccstruct/boxword.cpp +++ b/ccstruct/boxword.cpp @@ -29,6 +29,12 @@ namespace tesseract { // tolerance. Otherwise, the blob may be chopped and we have to just use // the word bounding box. const int kBoxClipTolerance = 2; +// Min offset in baseline-normalized coords to make a character a subscript. +const int kMinSubscriptOffset = 20; +// Min offset in baseline-normalized coords to make a character a superscript. +const int kMinSuperscriptOffset = 20; +// Max y of bottom of a drop-cap blob. +const int kMaxDropCapBottom = -128; BoxWord::BoxWord() : length_(0) { } @@ -95,20 +101,35 @@ BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm, return boxword; } -BoxWord* BoxWord::CopyFromPBLOBs(PBLOB_LIST* blobs) { - BoxWord* boxword = new BoxWord(); - // Count the blobs. - boxword->length_ = blobs->length(); +// Sets up the script_pos_ member using the tessword to get the bln +// bounding boxes, the best_choice to get the unichars, and the unicharset +// to get the target positions. If small_caps is true, sub/super are not +// considered, but dropcaps are. +void BoxWord::SetScriptPositions(const UNICHARSET& unicharset, bool small_caps, + TWERD* tessword, WERD_CHOICE* best_choice) { // Allocate memory. - boxword->boxes_.reserve(boxword->length_); - // Copy the boxes. - PBLOB_IT pb_it(blobs); - int i = 0; - for (pb_it.mark_cycle_pt(); !pb_it.cycled_list(); pb_it.forward(), ++i) { - boxword->boxes_.push_back(pb_it.data()->bounding_box()); + script_pos_.init_to_size(length_, SP_NORMAL); + + int blob_index = 0; + for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next, + ++blob_index) { + int class_id = best_choice->unichar_id(blob_index); + TBOX blob_box = tblob->bounding_box(); + int top = blob_box.top(); + int bottom = blob_box.bottom(); + int min_bottom, max_bottom, min_top, max_top; + unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, + &min_top, &max_top); + if (bottom <= kMaxDropCapBottom) { + script_pos_[blob_index] = SP_DROPCAP; + } else if (!small_caps) { + if (top + kMinSubscriptOffset < min_top) { + script_pos_[blob_index] = SP_SUBSCRIPT; + } else if (bottom - kMinSuperscriptOffset > max_bottom) { + script_pos_[blob_index] = SP_SUPERSCRIPT; + } + } } - boxword->ComputeBoundingBox(); - return boxword; } // Clean up the bounding boxes from the polygonal approximation by diff --git a/ccstruct/boxword.h b/ccstruct/boxword.h index b8fb18ef9eacc806d86bc78038d19c896aa96a99..f481b416d9375c42f6aef6cf4bdf489d41a616ec 100644 --- a/ccstruct/boxword.h +++ b/ccstruct/boxword.h @@ -27,11 +27,21 @@ class BLOCK; class DENORM; class PBLOB_LIST; struct TWERD; +class UNICHARSET; class WERD; +class WERD_CHOICE; class WERD_RES; namespace tesseract { +// ScriptPos tells whether a character is subscript, superscript or normal. +enum ScriptPos { + SP_NORMAL, + SP_SUBSCRIPT, + SP_SUPERSCRIPT, + SP_DROPCAP +}; + // Class to hold an array of bounding boxes for an output word and // the bounding box of the whole word. class BoxWord { @@ -50,7 +60,13 @@ class BoxWord { // back to the original image coordinates. static BoxWord* CopyFromNormalized(const DENORM* denorm, TWERD* tessword); - static BoxWord* CopyFromPBLOBs(PBLOB_LIST* blobs); + + // Sets up the script_pos_ member using the tessword to get the bln + // bounding boxes, the best_choice to get the unichars, and the unicharset + // to get the target positions. If small_caps is true, sub/super are not + // considered, but dropcaps are. + void SetScriptPositions(const UNICHARSET& unicharset, bool small_caps, + TWERD* tessword, WERD_CHOICE* best_choice); // Clean up the bounding boxes from the polygonal approximation by // expanding slightly, then clipping to the blobs from the original_word @@ -83,6 +99,11 @@ class BoxWord { const TBOX& BlobBox(int index) const { return boxes_[index]; } + ScriptPos BlobPosition(int index) const { + if (index < 0 || index >= script_pos_.size()) + return SP_NORMAL; + return script_pos_[index]; + } private: void ComputeBoundingBox(); @@ -90,6 +111,7 @@ class BoxWord { TBOX bbox_; int length_; GenericVector boxes_; + GenericVector script_pos_; }; } // namespace tesseract. diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 3c11f9618cd3a49c5cf90c1b2f5dbe57de4080e3..b4db78f86b0abc5dcf0149b2705190aa9e7b521f 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -208,6 +208,7 @@ void WERD_RES::CopySimpleFields(const WERD_RES& source) { tess_would_adapt = source.tess_would_adapt; done = source.done; unlv_crunch_mode = source.unlv_crunch_mode; + small_caps = source.small_caps; italic = source.italic; bold = source.bold; font1 = source.font1; @@ -301,6 +302,13 @@ void WERD_RES::SetupBoxWord() { box_word->ClipToOriginalWord(denorm.block(), word); } +// Sets up the script positions in the output boxword using the best_choice +// to get the unichars, and the unicharset to get the target positions. +void WERD_RES::SetScriptPositions(const UNICHARSET& unicharset) { + box_word->SetScriptPositions(unicharset, small_caps, rebuild_word, + best_choice); +} + // Classifies the word with some already-calculated BLOB_CHOICEs. // The choices are an array of blob_count pointers to BLOB_CHOICE, // providing a single classifier result for each blob. diff --git a/ccstruct/pageres.h b/ccstruct/pageres.h index bc82a0afb59303b7454c188cb4b5b1a414873b6f..1e694d71da908898cc368061c2c8c0e06d9faeb3 100644 --- a/ccstruct/pageres.h +++ b/ccstruct/pageres.h @@ -194,6 +194,7 @@ class WERD_RES : public ELIST_LINK { BOOL8 tess_accepted; //Tess thinks its ok? BOOL8 tess_would_adapt; //Tess would adapt? BOOL8 done; //ready for output? + bool small_caps; // Word appears to be small caps. inT8 italic; inT8 bold; inT8 font1; //primary font @@ -239,6 +240,7 @@ class WERD_RES : public ELIST_LINK { tess_would_adapt = FALSE; done = FALSE; unlv_crunch_mode = CR_NONE; + small_caps = false; italic = FALSE; bold = FALSE; font1 = -1; @@ -283,6 +285,10 @@ class WERD_RES : public ELIST_LINK { // Sets/replaces the box_word with one made from the rebuild_word. void SetupBoxWord(); + // Sets up the script positions in the output boxword using the best_choice + // to get the unichars, and the unicharset to get the target positions. + void SetScriptPositions(const UNICHARSET& unicharset); + // Classifies the word with some already-calculated BLOB_CHOICEs. // The choices are an array of blob_count pointers to BLOB_CHOICE, // providing a single classifier result for each blob. diff --git a/ccstruct/rect.cpp b/ccstruct/rect.cpp index edd1522dc3e1bd6b51d0f3a1108ed9d25ded1947..b6919016ca315ed91c9995b547134d59a1059d4b 100644 --- a/ccstruct/rect.cpp +++ b/ccstruct/rect.cpp @@ -17,7 +17,7 @@ * **********************************************************************/ -#include "mfcpch.h" //precompiled headers +#include "mfcpch.h" // precompiled headers #include "rect.h" // Include automatically generated configuration file if running autoconf. diff --git a/textord/oldbasel.cpp b/textord/oldbasel.cpp index e6cbab204bd0935061bc617f9d680cc5ee564be8..936ccd10f4a6b48a5c5969e36a302cf9e4d670f2 100644 --- a/textord/oldbasel.cpp +++ b/textord/oldbasel.cpp @@ -107,6 +107,7 @@ void Textord::make_old_baselines(TO_BLOCK *block, // block to do } } correlate_lines(block, gradient); + block->block->set_xheight(block->xheight); } diff --git a/textord/tordmain.cpp b/textord/tordmain.cpp index 2cbfde4d4f0e52f4321c43e8c5bdfb8d561e007b..37a6e5b9efa641bc99230a44dd97ec623936e793 100644 --- a/textord/tordmain.cpp +++ b/textord/tordmain.cpp @@ -42,9 +42,6 @@ // Some of the code in this file is dependent upon leptonica. If you don't // have it, you don't get this functionality. -#ifdef HAVE_CONFIG_H -#include "config_auto.h" -#endif #ifdef HAVE_LIBLEPT #include "allheaders.h" #endif