// File:        altorenderer.cpp
// Description: ALTO rendering interface
// Author:      Jake Sebright

// (C) Copyright 2018
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

16
#ifdef _WIN32
17
#  include "host.h" // windows.h for MultiByteToWideChar, ...
18
#endif
19 20

#include <tesseract/baseapi.h>
E
Egor Pugin 已提交
21
#include <tesseract/renderer.h>
J
Jake Sebright 已提交
22

23
#include <memory>
24
#include <sstream> // for std::stringstream
25

J
Jake Sebright 已提交
26 27
namespace tesseract {

28 29
/// Add coordinates to specified TextBlock, TextLine or String bounding box.
/// Add word confidence if adding to a String bounding box.
30
///
31 32
static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
                         std::stringstream &alto_str) {
33 34 35 36 37 38 39 40
  int left, top, right, bottom;
  it->BoundingBox(level, &left, &top, &right, &bottom);

  int hpos = left;
  int vpos = top;
  int height = bottom - top;
  int width = right - left;

41 42 43 44
  alto_str << " HPOS=\"" << hpos << "\"";
  alto_str << " VPOS=\"" << vpos << "\"";
  alto_str << " WIDTH=\"" << width << "\"";
  alto_str << " HEIGHT=\"" << height << "\"";
45 46 47

  if (level == RIL_WORD) {
    int wc = it->Confidence(RIL_WORD);
48 49 50
    alto_str << " WC=\"0." << wc << "\"";
  } else {
    alto_str << ">";
51 52 53 54 55 56 57
  }
}

///
/// Append the ALTO XML for the beginning of the document
///
bool TessAltoRenderer::BeginDocumentHandler() {
58 59 60 61 62 63 64 65 66 67 68
  // Delay the XML output because we need the name of the image file.
  begin_document = true;
  return true;
}

///
/// Append the ALTO XML for the layout of the image
///
/// When begin_document is set, the deferred document header (XML declaration,
/// <alto> root, <Description> with the source file name and the software
/// version, and the opening <Layout> tag) is written first and the flag is
/// cleared. Afterwards the ALTO text of the current page is appended.
///
/// @param api  API object that supplies the input name and the page text
/// @return false if the API returned no ALTO text for the page, true otherwise
///
bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
  if (begin_document) {
    AppendString(
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
        "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
        "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
        "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
        "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
        "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
        "\t<Description>\n"
        "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
        "\t\t<sourceImageInformation>\n"
        "\t\t\t<fileName>");

    // The file name is arbitrary text: escape XML metacharacters so that a
    // name containing '&', '<' or quotes cannot break the document. The null
    // check is needed because the escaping helper is given the raw pointer.
    const char *input_name = api->GetInputName();
    if (input_name != nullptr) {
      AppendString(HOcrEscape(input_name).c_str());
    }

    AppendString(
        "</fileName>\n"
        "\t\t</sourceImageInformation>\n"
        "\t\t<OCRProcessing ID=\"OCR_0\">\n"
        "\t\t\t<ocrProcessingStep>\n"
        "\t\t\t\t<processingSoftware>\n"
        "\t\t\t\t\t<softwareName>tesseract ");
    AppendString(TessBaseAPI::Version());
    AppendString(
        "</softwareName>\n"
        "\t\t\t\t</processingSoftware>\n"
        "\t\t\t</ocrProcessingStep>\n"
        "\t\t</OCRProcessing>\n"
        "\t</Description>\n"
        "\t<Layout>\n");
    begin_document = false;
  }

  // The unique_ptr releases the array returned by GetAltoText with delete[].
  const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
  if (text == nullptr) {
    return false;
  }

  AppendString(text.get());

  return true;
}

///
/// Finish the ALTO document.
///
/// Appends the closing </Layout> and </alto> tags.
///
/// @return always true
///
bool TessAltoRenderer::EndDocumentHandler() {
  AppendString(
      "\t</Layout>\n"
      "</alto>\n");
  return true;
}

119
///
/// Construct an ALTO renderer.
///
/// Forwards `outputbase` and the "xml" extension to the TessResultRenderer
/// base class and starts with the begin_document flag cleared.
///
TessAltoRenderer::TessAltoRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "xml"), begin_document(false) {}
122 123 124 125 126

///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
127
char *TessBaseAPI::GetAltoText(int page_number) {
128 129 130 131 132 133 134
  return GetAltoText(nullptr, page_number);
}

///
/// Make an XML-formatted string with ALTO markup from the internal
/// data structures.
///
135
char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
136
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
137
    return nullptr;
138
  }
139

140
  int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
141

142 143 144
  if (input_file_.empty()) {
    SetInputName(nullptr);
  }
145 146 147

#ifdef _WIN32
  // convert input name from ANSI encoding to utf-8
148 149 150 151 152 153 154
  int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
  wchar_t *uni16_str = new WCHAR[str16_len];
  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
  int utf8_len =
      WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
  char *utf8_str = new char[utf8_len];
  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
155
  input_file_ = utf8_str;
156 157 158 159
  delete[] uni16_str;
  delete[] utf8_str;
#endif

160
  std::stringstream alto_str;
S
Stefan Weil 已提交
161 162
  // Use "C" locale (needed for int values larger than 999).
  alto_str.imbue(std::locale::classic());
163 164 165 166 167 168 169 170
  alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
           << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
           << " ID=\"page_" << page_number << "\">\n"
           << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
           << " WIDTH=\"" << rect_width_ << "\""
           << " HEIGHT=\"" << rect_height_ << "\">\n";

  ResultIterator *res_it = GetIterator();
171 172 173 174
  while (!res_it->Empty(RIL_BLOCK)) {
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
J
Jake Sebright 已提交
175 176
    }

177
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
178
      alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
179 180
      AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
      alto_str << "\n";
J
Jake Sebright 已提交
181 182
    }

183 184 185 186 187 188
    if (res_it->IsAtBeginningOf(RIL_PARA)) {
      alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
      AddBoxToAlto(res_it, RIL_PARA, alto_str);
      alto_str << "\n";
    }

189
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
190
      alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
191 192
      AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
      alto_str << "\n";
J
Jake Sebright 已提交
193 194
    }

195
    alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
196 197
    AddBoxToAlto(res_it, RIL_WORD, alto_str);
    alto_str << " CONTENT=\"";
J
Jake Sebright 已提交
198

199
    bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
200 201 202
    bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
    bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);

203 204 205
    int left, top, right, bottom;
    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);

206
    do {
207
      const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
208
      if (grapheme && grapheme[0] != 0) {
209
        alto_str << HOcrEscape(grapheme.get()).c_str();
210 211 212
      }
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
J
Jake Sebright 已提交
213

214
    alto_str << "\"/>";
J
Jake Sebright 已提交
215

216
    wcnt++;
J
Jake Sebright 已提交
217

218
    if (last_word_in_line) {
219
      alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
220
      lcnt++;
221 222 223 224 225
    } else {
      int hpos = right;
      int vpos = top;
      res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
      int width = left - hpos;
226 227
      alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
               << "\"/>\n";
J
Jake Sebright 已提交
228 229
    }

230 231 232 233 234 235 236
    if (last_word_in_tblock) {
      alto_str << "\t\t\t\t\t</TextBlock>\n";
      tcnt++;
    }

    if (last_word_in_cblock) {
      alto_str << "\t\t\t\t</ComposedBlock>\n";
237
      bcnt++;
J
Jake Sebright 已提交
238
    }
239
  }
J
Jake Sebright 已提交
240

241 242
  alto_str << "\t\t\t</PrintSpace>\n"
           << "\t\t</Page>\n";
243
  const std::string &text = alto_str.str();
J
Jake Sebright 已提交
244

245
  char *result = new char[text.length() + 1];
246
  strcpy(result, text.c_str());
247
  delete res_it;
248
  return result;
J
Jake Sebright 已提交
249
}
250

251
} // namespace tesseract