diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 11af90f6924e2600547dd9ba0d802c3783409890..67b7c1fc7eb948b874b49f77a0e9b9023deb337f 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -13,240 +13,242 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "baseapi.h" #include +#include "baseapi.h" #include "renderer.h" namespace tesseract { - /// - /// Add coordinates to specified TextBlock, TextLine, or String bounding box - /// Add word confidence if adding to a String bounding box - /// - static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level, - STRING *alto_str) { - int left, top, right, bottom; - it->BoundingBox(level, &left, &top, &right, &bottom); - - int hpos = left; - int vpos = top; - int height = bottom - top; - int width = right - left; - - *alto_str += " HPOS=\""; - alto_str->add_str_int("", hpos); - *alto_str += "\""; - *alto_str += " VPOS=\""; - alto_str->add_str_int("", vpos); - *alto_str += "\""; - *alto_str += " WIDTH=\""; - alto_str->add_str_int("", width); - *alto_str += "\""; - *alto_str += " HEIGHT=\""; - alto_str->add_str_int("", height); - *alto_str += "\""; - - if (level == RIL_WORD) { - int wc = it->Confidence(RIL_WORD); - *alto_str += " WC=\"0."; - alto_str->add_str_int("", wc); - *alto_str += "\""; - } - if (level != RIL_WORD) { - - *alto_str += ">"; - } +/// +/// Add coordinates to specified TextBlock, TextLine, or String bounding box +/// Add word confidence if adding to a String bounding box +/// +static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level, + STRING* alto_str) { + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + + int hpos = left; + int vpos = top; + int height = bottom - top; + int width = right - left; + + *alto_str += " HPOS=\""; + alto_str->add_str_int("", hpos); + *alto_str += "\""; + *alto_str += " VPOS=\""; + alto_str->add_str_int("", vpos); + *alto_str += "\""; + *alto_str += " WIDTH=\""; + alto_str->add_str_int("", width); + *alto_str += "\""; + *alto_str += " HEIGHT=\""; + alto_str->add_str_int("", height); + *alto_str += "\""; + + if (level == RIL_WORD) { + int wc = it->Confidence(RIL_WORD); + *alto_str += " WC=\"0."; + alto_str->add_str_int("", wc); + *alto_str += "\""; + } + if (level != RIL_WORD) { + *alto_str += ">"; + } +} + +/// +/// Add a unique ID to an ALTO element +/// +static void AddIdToAlto(STRING* alto_str, const std::string base, int num1) { + const size_t BUFSIZE = 64; + char id_buffer[BUFSIZE]; + snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1); + id_buffer[BUFSIZE - 1] = '\0'; + *alto_str += " ID=\""; + *alto_str += id_buffer; + *alto_str += "\""; +} + +/// +/// Append the ALTO XML for the beginning of the document +/// +bool TessAltoRenderer::BeginDocumentHandler() { + AppendString( + "\n" + "\n" + "\t\n" + "\t\tpixel\n" + "\t\t\n" + "\t\t\t"); + + AppendString(title()); + + AppendString( + "\t\t\t\n" + "\t\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\ttesseract "); + AppendString(TessBaseAPI::Version()); + AppendString( + "\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\n" + "\t\n"); + + return true; +} + +/// +/// Append the ALTO XML for the layout of the image +/// +bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) { + const std::unique_ptr hocr(api->GetAltoText(imagenum())); + if (hocr == nullptr) return false; + + AppendString(hocr.get()); + + return true; +} + +/// +/// Append the ALTO XML for the end of the document +/// +bool TessAltoRenderer::EndDocumentHandler() { + AppendString("\t\n\n"); + + return true; +} + +TessAltoRenderer::TessAltoRenderer(const char* outputbase) + : TessResultRenderer(outputbase, "xml") {} + +/// +/// Make an XML-formatted string with ALTO markup from the internal +/// data structures. +/// +char* TessBaseAPI::GetAltoText(int page_number) { + return GetAltoText(nullptr, page_number); +} + +/// +/// Make an XML-formatted string with ALTO markup from the internal +/// data structures. +/// +char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { + if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) + return nullptr; + + int lcnt = 0, bcnt = 0, wcnt = 0; + int page_id = page_number; + + STRING alto_str(""); + + if (input_file_ == nullptr) SetInputName(nullptr); + +#ifdef _WIN32 + // convert input name from ANSI encoding to utf-8 + int str16_len = + MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0); + wchar_t* uni16_str = new WCHAR[str16_len]; + str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, + uni16_str, str16_len); + int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, + 0, nullptr, nullptr); + char* utf8_str = new char[utf8_len]; + WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, + nullptr, nullptr); + *input_file_ = utf8_str; + delete[] uni16_str; + delete[] utf8_str; +#endif + + alto_str += "\t\t\n"; + + ResultIterator* res_it = GetIterator(); + while (!res_it->Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; } - /// - /// Add a unique ID to an ALTO element - /// - static void AddIdToAlto(STRING *alto_str, const std::string base, int num1) { - const size_t BUFSIZE = 64; - char id_buffer[BUFSIZE]; - snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1); - id_buffer[BUFSIZE - 1] = '\0'; - *alto_str += " ID=\""; - *alto_str += id_buffer; - *alto_str += "\""; + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + alto_str += "\t\t\t\t\n" - "\n" - "\t\n" - "\t\tpixel\n" - "\t\t\n" - "\t\t\t"); - - AppendString(title()); - - AppendString("\t\t\t\n" - "\t\t\n" - "\t\t\n" - "\t\t\t\n" - "\t\t\t\t\n" - "\t\t\t\t\ttesseract "); - AppendString(TessBaseAPI::Version()); - AppendString("\n" - "\t\t\t\t\n" - "\t\t\t\n" - "\t\t\n" - "\t\n" - "\t\n"); - - return true; + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + alto_str += "\t\t\t\t\t hocr(api->GetAltoText(imagenum())); - if (hocr == nullptr) return false; + alto_str += "\t\t\t\t\t\tIsAtFinalElement(RIL_TEXTLINE, RIL_WORD); + bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - return true; - } + do { + const std::unique_ptr grapheme( + res_it->GetUTF8Text(RIL_SYMBOL)); + if (grapheme && grapheme[0] != 0) { + alto_str += HOcrEscape(grapheme.get()); + } + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - /// - /// Append the ALTO XML for the end of the document - /// - bool TessAltoRenderer::EndDocumentHandler() { - AppendString("\t\n\n"); + alto_str += "\"/>\n"; - return true; - } + wcnt++; - TessAltoRenderer::TessAltoRenderer(const char *outputbase) - : TessResultRenderer(outputbase, "xml") { + if (last_word_in_line) { + alto_str += "\t\t\t\t\t\n"; + lcnt++; } - /// - /// Make an XML-formatted string with ALTO markup from the internal - /// data structures. - /// - char *TessBaseAPI::GetAltoText(int page_number) { - return GetAltoText(nullptr, page_number); + if (last_word_in_block) { + alto_str += "\t\t\t\t\n"; + bcnt++; } + } - /// - /// Make an XML-formatted string with ALTO markup from the internal - /// data structures. - /// - char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { - if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) - return nullptr; - - int lcnt = 0, bcnt = 0, wcnt = 0; - int page_id = page_number; - - STRING alto_str(""); - - if (input_file_ == nullptr) - SetInputName(nullptr); - - #ifdef _WIN32 - // convert input name from ANSI encoding to utf-8 - int str16_len = - MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0); - wchar_t *uni16_str = new WCHAR[str16_len]; - str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, - uni16_str, str16_len); - int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, - nullptr, nullptr); - char *utf8_str = new char[utf8_len]; - WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, - utf8_len, nullptr, nullptr); - *input_file_ = utf8_str; - delete[] uni16_str; - delete[] utf8_str; - #endif - - alto_str += "\t\t\n"; - - ResultIterator *res_it = GetIterator(); - while (!res_it->Empty(RIL_BLOCK)) { - if (res_it->Empty(RIL_WORD)) { - res_it->Next(RIL_WORD); - continue; - } - - if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - alto_str += "\t\t\t\tIsAtBeginningOf(RIL_TEXTLINE)) { - - alto_str += "\t\t\t\t\tIsAtFinalElement(RIL_TEXTLINE, RIL_WORD); - bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - - do { - const std::unique_ptr grapheme( - res_it->GetUTF8Text(RIL_SYMBOL)); - if (grapheme && grapheme[0] != 0) { - alto_str += HOcrEscape(grapheme.get()); - } - res_it->Next(RIL_SYMBOL); - } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - - alto_str += "\"/>\n"; - - wcnt++; - - if (last_word_in_line) { - alto_str += "\t\t\t\t\t\n"; - lcnt++; - } - - if (last_word_in_block) { - alto_str += "\t\t\t\t\n"; - bcnt++; - } - } - - alto_str += "\t\t\t\n"; - alto_str += "\t\t\n"; - - char *ret = new char[alto_str.length() + 1]; - strcpy(ret, alto_str.string()); - delete res_it; - return ret; - } + alto_str += "\t\t\t\n"; + alto_str += "\t\t\n"; + char* ret = new char[alto_str.length() + 1]; + strcpy(ret, alto_str.string()); + delete res_it; + return ret; } + +} // namespace tesseract