From 16fd1439faa4f2c04275c17ca84272328abb843f Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sat, 7 Aug 2021 22:13:16 +0200 Subject: [PATCH] Write image filename in ALTO output Signed-off-by: Stefan Weil --- include/tesseract/renderer.h | 3 +++ src/api/altorenderer.cpp | 32 +++++++++++++++++++------------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/include/tesseract/renderer.h b/include/tesseract/renderer.h index 2cfa06c5..ffc27d41 100644 --- a/include/tesseract/renderer.h +++ b/include/tesseract/renderer.h @@ -189,6 +189,9 @@ protected: bool BeginDocumentHandler() override; bool AddImageHandler(TessBaseAPI *api) override; bool EndDocumentHandler() override; + +private: + bool begin_document; }; /** diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index c0afcb07..089189ef 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -55,7 +55,17 @@ static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level, /// Append the ALTO XML for the beginning of the document /// bool TessAltoRenderer::BeginDocumentHandler() { - AppendString( + // Delay the XML output because we need the name of the image file. + begin_document = true; + return true; +} + +/// +/// Append the ALTO XML for the layout of the image +/// +bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) { + if (begin_document) { + AppendString( "\n" "\n" "\t\t\t"); - AppendString(title()); + AppendString(api->GetInputName()); - AppendString( + AppendString( "\n" "\t\t\n" "\t\t\n" "\t\t\t\n" "\t\t\t\t\n" "\t\t\t\t\ttesseract "); - AppendString(TessBaseAPI::Version()); - AppendString( + AppendString(TessBaseAPI::Version()); + AppendString( "\n" "\t\t\t\t\n" "\t\t\t\n" "\t\t\n" "\t\n" "\t\n"); + begin_document = false; + } - return true; -} - -/// -/// Append the ALTO XML for the layout of the image -/// -bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) { const std::unique_ptr text(api->GetAltoText(imagenum())); if (text == nullptr) { return false; @@ -112,7 +117,8 @@ bool TessAltoRenderer::EndDocumentHandler() { } TessAltoRenderer::TessAltoRenderer(const char *outputbase) - : TessResultRenderer(outputbase, "xml") {} + : TessResultRenderer(outputbase, "xml"), + begin_document(false) {} /// /// Make an XML-formatted string with ALTO markup from the internal -- GitLab