stringrenderer.cpp 32.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
/**********************************************************************
 * File:        stringrenderer.cpp
 * Description: Class for rendering UTF-8 text to an image, and retrieving
 *              bounding boxes around each grapheme cluster.
 * Author:      Ranjith Unnikrishnan
 * Created:     Mon Nov 18 2013
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/

#include "stringrenderer.h"

23
#include <assert.h>
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <utility>
#include <vector>

#include "allheaders.h"     // from leptonica
#include "boxchar.h"
#include "ligature_table.h"
#include "normstrngs.h"
#include "pango/pango-font.h"
#include "pango/pango-glyph-item.h"
#include "tlog.h"
#include "unichar.h"
#include "unicode/uchar.h"  // from libicu
#include "util.h"

namespace tesseract {

// Resolution every StringRenderer starts with: the constructor passes this
// value to set_resolution().
static const int kDefaultOutputResolution = 300;

// Word joiner (U+2060) inserted after letters in ngram mode, as per
// recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
// hyphens and other non-alpha characters.
R
Ray Smith 已提交
49
static const char* kWordJoinerUTF8 = "\xE2\x81\xA0";  // u8"\u2060";
50 51 52 53 54 55 56 57 58
static const char32 kWordJoiner = 0x2060;

// Returns true if ICU classifies the code point 'ch' as a non-spacing mark,
// an enclosing mark or a combining spacing mark.
static bool IsCombiner(int ch) {
  switch (u_charType(ch)) {
    case U_NON_SPACING_MARK:
    case U_ENCLOSING_MARK:
    case U_COMBINING_SPACING_MARK:
      return true;
    default:
      return false;
  }
}

59
static std::string EncodeAsUTF8(const char32 ch32) {
60
  UNICHAR uni_ch(ch32);
61
  return std::string(uni_ch.utf8(), uni_ch.utf8_len());
62 63
}

64 65 66 67 68 69
// Draws a boolean that is true with probability 'prob'.
// The exact probabilities 0.0 and 1.0 are answered without consulting 'rand';
// any other value is compared against one draw of rand->UnsignedRand(1.0).
static bool RandBool(const double prob, TRand* rand) {
  if (prob == 0.0) return false;
  if (prob == 1.0) return true;
  return prob > rand->UnsignedRand(1.0);
}
70 71 72 73 74 75

// Copies the pixels of a CAIRO_FORMAT_ARGB32 image surface into a newly
// created 32 bpp Pix and returns it.
// Returns nullptr if the surface has a different format, if the surface
// exposes no pixel data, or if the Pix cannot be allocated.
/* static */
Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) {
  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
    printf("Unexpected surface format %d\n",
           cairo_image_surface_get_format(surface));
    return nullptr;
  }
  // Cairo requires a flush before the pixel buffer of an image surface is
  // read directly, so that all pending drawing has reached the buffer.
  cairo_surface_flush(surface);
  const unsigned char* src = cairo_image_surface_get_data(surface);
  if (src == nullptr) return nullptr;

  const int width = cairo_image_surface_get_width(surface);
  const int height = cairo_image_surface_get_height(surface);
  const int byte_stride = cairo_image_surface_get_stride(surface);
  Pix* pix = pixCreate(width, height, 32);
  if (pix == nullptr) return nullptr;

  for (int i = 0; i < height; ++i) {
    // Every source row is written one byte past the start of the matching
    // destination row (presumably to shift the channel order from Cairo's
    // layout to Leptonica's -- TODO confirm). The last row copies one byte
    // less so that the shifted write does not run past the final row.
    memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
           src + i * byte_stride,
           byte_stride - ((i == height - 1) ? 1 : 0));
  }
  return pix;
}

91
StringRenderer::StringRenderer(const std::string& font_desc, int page_width,
92
                               int page_height)
93 94
    : font_(font_desc),
      page_width_(page_width),
95 96 97
      page_height_(page_height),
      h_margin_(50),
      v_margin_(50),
98
      pen_color_{0.0, 0.0, 0.0},
99 100 101 102 103 104 105 106
      char_spacing_(0),
      leading_(0),
      vertical_text_(false),
      gravity_hint_strong_(false),
      render_fullwidth_latin_(false),
      underline_start_prob_(0),
      underline_continuation_prob_(0),
      underline_style_(PANGO_UNDERLINE_SINGLE),
S
Stefan Weil 已提交
107
      features_(nullptr),
108 109 110 111
      drop_uncovered_chars_(true),
      strip_unrenderable_words_(false),
      add_ligatures_(false),
      output_word_boxes_(false),
S
Stefan Weil 已提交
112 113 114
      surface_(nullptr),
      cr_(nullptr),
      layout_(nullptr),
115 116 117
      start_box_(0),
      page_(0),
      box_padding_(0),
118
      page_boxes_(nullptr),
119 120 121
      total_chars_(0),
      font_index_(0),
      last_offset_(0) {
122 123 124
  set_resolution(kDefaultOutputResolution);
}

125
// Re-parses the font from the description name 'desc' and re-applies the
// current resolution to it. Returns the result of the parse.
bool StringRenderer::set_font(const std::string& desc) {
  const bool parsed = font_.ParseFontDescriptionName(desc);
  // The resolution is re-applied whether or not the parse succeeded.
  font_.set_resolution(resolution_);
  return parsed;
}

// Stores 'resolution' in the renderer and forwards it to the font.
void StringRenderer::set_resolution(const int resolution) {
  font_.set_resolution(resolution);
  resolution_ = resolution;
}

136
void StringRenderer::set_underline_start_prob(const double frac) {
137
  underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0);
138 139 140
}

void StringRenderer::set_underline_continuation_prob(const double frac) {
141
  underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0);
142 143
}

144
// Releases the stored boxes, the Pango/Cairo objects and the font-features
// string.
StringRenderer::~StringRenderer() {
  ClearBoxes();
  FreePangoCairo();
  free(features_);
}

void StringRenderer::InitPangoCairo() {
  FreePangoCairo();
  surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_,
                                        page_height_);
  cr_ = cairo_create(surface_);
155 156 157 158
  {
    DISABLE_HEAP_LEAK_CHECK;
    layout_ = pango_cairo_create_layout(cr_);
  }
159 160 161 162 163 164 165 166 167 168 169 170 171 172

  if (vertical_text_) {
    PangoContext* context = pango_layout_get_context(layout_);
    pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
    if (gravity_hint_strong_) {
      pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
    }
    pango_layout_context_changed(layout_);
  }

  SetLayoutProperties();
}

void StringRenderer::SetLayoutProperties() {
173
  std::string font_desc = font_.DescriptionName();
174 175 176 177 178 179 180 181 182 183 184 185 186
  // Specify the font via a description name
  PangoFontDescription *desc =
      pango_font_description_from_string(font_desc.c_str());
  // Assign the font description to the layout
  pango_layout_set_font_description(layout_, desc);
  pango_font_description_free(desc);  // free the description
  pango_cairo_context_set_resolution(pango_layout_get_context(layout_),
                                     resolution_);

  int max_width = page_width_ - 2 * h_margin_;
  int max_height = page_height_ - 2 * v_margin_;
  tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
  if (vertical_text_) {
187
    using std::swap;
188 189 190
    swap(max_width, max_height);
  }
  pango_layout_set_width(layout_, max_width * PANGO_SCALE);
191 192
  // Ultra-wide Thai strings need to wrap at char level.
  pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR);
193 194 195 196

  // Adjust character spacing
  PangoAttrList* attr_list = pango_attr_list_new();
  if (char_spacing_) {
197 198
    PangoAttribute* spacing_attr =
        pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE);
199 200 201 202
    spacing_attr->start_index = 0;
    spacing_attr->end_index = static_cast<guint>(-1);
    pango_attr_list_change(attr_list, spacing_attr);
  }
203
#if (PANGO_VERSION_MAJOR == 1 && PANGO_VERSION_MINOR >= 38)
204 205
  if (add_ligatures_) {
    set_features("liga, clig, dlig, hlig");
R
Ray Smith 已提交
206
    PangoAttribute* feature_attr = pango_attr_font_features_new(features_);
207 208
    pango_attr_list_change(attr_list, feature_attr);
  }
209
#endif
210 211 212 213 214 215 216 217 218 219 220
  pango_layout_set_attributes(layout_, attr_list);
  pango_attr_list_unref(attr_list);
  // Adjust line spacing
  if (leading_) {
    pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
  }
}

// Releases the layout, the Cairo context and the Cairo surface (in that
// order) if they exist, and resets each pointer to nullptr so that the call
// can be repeated safely.
void StringRenderer::FreePangoCairo() {
  if (layout_ != nullptr) {
    g_object_unref(layout_);
    layout_ = nullptr;
  }
  if (cr_ != nullptr) {
    cairo_destroy(cr_);
    cr_ = nullptr;
  }
  if (surface_ != nullptr) {
    cairo_surface_destroy(surface_);
    surface_ = nullptr;
  }
}

233
void StringRenderer::SetWordUnderlineAttributes(const std::string& page_text) {
234 235 236 237
  if (underline_start_prob_ == 0) return;
  PangoAttrList* attr_list = pango_layout_get_attributes(layout_);

  const char* text = page_text.c_str();
238
  size_t offset = 0;
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
  TRand rand;
  bool started_underline = false;
  PangoAttribute* und_attr = nullptr;

  while (offset < page_text.length()) {
    offset += SpanUTF8Whitespace(text + offset);
    if (offset == page_text.length()) break;

    int word_start = offset;
    int word_len = SpanUTF8NotWhitespace(text + offset);
    offset += word_len;
    if (started_underline) {
      // Should we continue the underline to the next word?
      if (RandBool(underline_continuation_prob_, &rand)) {
        // Continue the current underline to this word.
        und_attr->end_index = word_start + word_len;
      } else {
        // Otherwise end the current underline attribute at the end of the
        // previous word.
        pango_attr_list_insert(attr_list, und_attr);
        started_underline = false;
        und_attr = nullptr;
      }
    }
    if (!started_underline && RandBool(underline_start_prob_, &rand)) {
      // Start a new underline attribute
      und_attr = pango_attr_underline_new(underline_style_);
      und_attr->start_index = word_start;
      und_attr->end_index = word_start + word_len;
      started_underline = true;
    }
  }
  // Finish the current underline attribute at the end of the page.
  if (started_underline) {
    und_attr->end_index = page_text.length();
    pango_attr_list_insert(attr_list, und_attr);
  }
}
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293

// Returns offset in utf8 bytes to first page.
int StringRenderer::FindFirstPageBreakOffset(const char* text,
                                             int text_length) {
  if (!text_length) return 0;
  const int max_height = (page_height_ - 2 * v_margin_);
  const int max_width = (page_width_ - 2 * h_margin_);
  const int max_layout_height = vertical_text_ ? max_width : max_height;

  UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
  const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
  const int kMaxUnicodeBufLength = 15000;
  for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
  int buf_length = it.utf8_data() - text;
  tlog(1, "len = %d  buf_len = %d\n", text_length, buf_length);
  pango_layout_set_text(layout_, text, buf_length);

S
Stefan Weil 已提交
294
  PangoLayoutIter* line_iter = nullptr;
295 296 297 298 299 300 301 302 303 304
  { // Fontconfig caches some info here that is not freed before exit.
    DISABLE_HEAP_LEAK_CHECK;
    line_iter = pango_layout_get_iter(layout_);
  }
  bool first_page = true;
  int page_top = 0;
  int offset = buf_length;
  do {
    // Get bounding box of the current line
    PangoRectangle line_ink_rect;
S
Stefan Weil 已提交
305 306
    pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, nullptr);
    pango_extents_to_pixels(&line_ink_rect, nullptr);
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
    PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
    if (first_page) {
      page_top = line_ink_rect.y;
      first_page = false;
    }
    int line_bottom = line_ink_rect.y + line_ink_rect.height;
    if (line_bottom - page_top > max_layout_height) {
      offset = line->start_index;
      tlog(1, "Found offset = %d\n", offset);
      break;
    }
  } while (pango_layout_iter_next_line(line_iter));
  pango_layout_iter_free(line_iter);
  return offset;
}

R
Ray Smith 已提交
323
const std::vector<BoxChar*>& StringRenderer::GetBoxes() const {
324 325 326 327 328 329 330 331 332 333 334 335 336 337
    return boxchars_;
}

// Returns the page_boxes_ pointer as stored; no copy is made.
Boxa* StringRenderer::GetPageBoxes() const {
    return page_boxes_;
}

// Rotates the boxes from index start_box_ to the end of boxchars_ by
// 'rotation' about the centre of the page.
void StringRenderer::RotatePageBoxes(float rotation) {
  const auto center_x = page_width_ / 2;
  const auto center_y = page_height_ / 2;
  BoxChar::RotateBoxes(rotation, center_x, center_y, start_box_,
                       boxchars_.size(), &boxchars_);
}


void StringRenderer::ClearBoxes() {
338
  for (size_t i = 0; i < boxchars_.size(); ++i) delete boxchars_[i];
339 340 341 342
  boxchars_.clear();
  boxaDestroy(&page_boxes_);
}

343
// Runs BoxChar::PrepareToWrite on the stored boxes and returns them formatted
// by BoxChar::GetTesseractBoxStr for a page of height page_height_.
std::string StringRenderer::GetBoxesStr() {
  BoxChar::PrepareToWrite(&boxchars_);
  std::string box_str = BoxChar::GetTesseractBoxStr(page_height_, boxchars_);
  return box_str;
}

348
void StringRenderer::WriteAllBoxes(const std::string& filename) {
349
  BoxChar::PrepareToWrite(&boxchars_);
350 351 352 353
  BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
}

// Returns cluster strings in logical order.
354 355
bool StringRenderer::GetClusterStrings(std::vector<std::string>* cluster_text) {
  std::map<int, std::string> start_byte_to_text;
356 357 358 359 360
  PangoLayoutIter* run_iter = pango_layout_get_iter(layout_);
  const char* full_text = pango_layout_get_text(layout_);
  do {
    PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
    if (!run) {
S
Stefan Weil 已提交
361
      // End of line nullptr run marker
362 363 364 365 366 367 368 369 370 371 372
      tlog(2, "Found end of line marker\n");
      continue;
    }
    PangoGlyphItemIter cluster_iter;
    gboolean have_cluster;
    for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
                                                          run, full_text);
         have_cluster;
         have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
      const int start_byte_index = cluster_iter.start_index;
      const int end_byte_index = cluster_iter.end_index;
373 374
      std::string text = std::string(full_text + start_byte_index,
                                     end_byte_index - start_byte_index);
375 376 377 378 379 380 381 382 383
      if (IsUTF8Whitespace(text.c_str())) {
        tlog(2, "Found whitespace\n");
        text = " ";
      }
      tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
           end_byte_index, text.c_str());
      if (add_ligatures_) {
        // Make sure the output box files have ligatured text in case the font
        // decided to use an unmapped glyph.
S
Stefan Weil 已提交
384
        text = LigatureTable::Get()->AddLigatures(text, nullptr);
385 386 387 388 389 390 391
      }
      start_byte_to_text[start_byte_index] = text;
    }
  } while (pango_layout_iter_next_run(run_iter));
  pango_layout_iter_free(run_iter);

  cluster_text->clear();
392
  for (std::map<int, std::string>::const_iterator it = start_byte_to_text.begin();
393 394 395
       it != start_byte_to_text.end(); ++it) {
    cluster_text->push_back(it->second);
  }
R
Ray Smith 已提交
396
  return !cluster_text->empty();
397 398 399 400 401 402 403 404 405 406 407 408
}

// Merges an array of BoxChars into words based on the identification of
// BoxChars containing the space character as inter-word separators.
//
// Sometime two adjacent characters in the sequence may be detected as lying on
// different lines based on their spatial positions. This may be the result of a
// newline character at end of the last word on a line in the source text, or of
// a discretionary line-break created by Pango at intra-word locations like
// hyphens. When this is detected the word is split at that location into
// multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
// its bounding box.
R
Ray Smith 已提交
409 410
static void MergeBoxCharsToWords(std::vector<BoxChar*>* boxchars) {
  std::vector<BoxChar*> result;
411
  bool started_word = false;
412
  for (size_t i = 0; i < boxchars->size(); ++i) {
R
Ray Smith 已提交
413
    if (boxchars->at(i)->ch() == " " || boxchars->at(i)->box() == nullptr) {
414
      result.push_back(boxchars->at(i));
S
Stefan Weil 已提交
415
      boxchars->at(i) = nullptr;
416 417 418 419 420 421 422 423
      started_word = false;
      continue;
    }

    if (!started_word) {
      // Begin new word
      started_word = true;
      result.push_back(boxchars->at(i));
S
Stefan Weil 已提交
424
      boxchars->at(i) = nullptr;
425 426 427 428 429
    } else {
      BoxChar* last_boxchar = result.back();
      // Compute bounding box union
      const Box* box = boxchars->at(i)->box();
      Box* last_box = last_boxchar->mutable_box();
430 431 432 433
      int left = std::min(last_box->x, box->x);
      int right = std::max(last_box->x + last_box->w, box->x + box->w);
      int top = std::min(last_box->y, box->y);
      int bottom = std::max(last_box->y + last_box->h, box->y + box->h);
434 435 436 437 438 439 440 441 442
      // Conclude that the word was broken to span multiple lines based on the
      // size of the merged bounding box in relation to those of the individual
      // characters seen so far.
      if (right - left > last_box->w + 5 * box->w) {
        tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
        // Insert a fake interword space and start a new word with the current
        // boxchar.
        result.push_back(new BoxChar(" ", 1));
        result.push_back(boxchars->at(i));
S
Stefan Weil 已提交
443
        boxchars->at(i) = nullptr;
444 445 446 447 448 449 450 451 452
        continue;
      }
      // Append to last word
      last_boxchar->mutable_ch()->append(boxchars->at(i)->ch());
      last_box->x = left;
      last_box->w = right - left;
      last_box->y = top;
      last_box->h = bottom - top;
      delete boxchars->at(i);
S
Stefan Weil 已提交
453
      boxchars->at(i) = nullptr;
454 455 456 457 458 459 460 461 462 463 464
    }
  }
  boxchars->swap(result);
}


void StringRenderer::ComputeClusterBoxes() {
  const char* text = pango_layout_get_text(layout_);
  PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_);

  // Do a first pass to store cluster start indexes.
R
Ray Smith 已提交
465
  std::vector<int> cluster_start_indices;
466 467 468 469 470 471 472 473
  do {
    cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
    tlog(3, "Added %d\n", cluster_start_indices.back());
  } while (pango_layout_iter_next_cluster(cluster_iter));
  pango_layout_iter_free(cluster_iter);
  cluster_start_indices.push_back(strlen(text));
  tlog(3, "Added last index %d\n", cluster_start_indices.back());
  // Sort the indices and create a map from start to end indices.
R
Ray Smith 已提交
474 475
  std::sort(cluster_start_indices.begin(), cluster_start_indices.end());
  std::map<int, int> cluster_start_to_end_index;
476
  for (size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) {
477 478 479 480 481 482 483 484
    cluster_start_to_end_index[cluster_start_indices[i]]
        = cluster_start_indices[i + 1];
  }

  // Iterate again to compute cluster boxes and their text with the obtained
  // cluster extent information.
  cluster_iter = pango_layout_get_iter(layout_);
  // Store BoxChars* sorted by their byte start positions
R
Ray Smith 已提交
485
  std::map<int, BoxChar*> start_byte_to_box;
486 487
  do {
    PangoRectangle cluster_rect;
R
Ray Smith 已提交
488
    pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr);
S
Stefan Weil 已提交
489
    pango_extents_to_pixels(&cluster_rect, nullptr);
490 491
    const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
    const int end_byte_index = cluster_start_to_end_index[start_byte_index];
492 493
    std::string cluster_text = std::string(text + start_byte_index,
                                           end_byte_index - start_byte_index);
R
Ray Smith 已提交
494
    if (!cluster_text.empty() && cluster_text[0] == '\n') {
495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519
      tlog(2, "Skipping newlines at start of text.\n");
      continue;
    }
    if (!cluster_rect.width || !cluster_rect.height ||
        IsUTF8Whitespace(cluster_text.c_str())) {
      tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n",
           cluster_rect.width, cluster_rect.height, cluster_text.c_str());
      BoxChar* boxchar = new BoxChar(" ", 1);
      boxchar->set_page(page_);
      start_byte_to_box[start_byte_index] = boxchar;
      continue;
    }
    // Prepare a boxchar for addition at this byte position.
    tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
         cluster_rect.x, cluster_rect.y,
         cluster_rect.width, cluster_rect.height,
         start_byte_index, end_byte_index,
         cluster_text.c_str());
    ASSERT_HOST_MSG(cluster_rect.width,
                    "cluster_text:%s  start_byte_index:%d\n",
                    cluster_text.c_str(), start_byte_index);
    ASSERT_HOST_MSG(cluster_rect.height,
                    "cluster_text:%s  start_byte_index:%d\n",
                    cluster_text.c_str(), start_byte_index);
    if (box_padding_) {
520
      cluster_rect.x = std::max(0, cluster_rect.x - box_padding_);
521
      cluster_rect.width += 2 * box_padding_;
522
      cluster_rect.y = std::max(0, cluster_rect.y - box_padding_);
523 524 525 526 527
      cluster_rect.height += 2 * box_padding_;
    }
    if (add_ligatures_) {
      // Make sure the output box files have ligatured text in case the font
      // decided to use an unmapped glyph.
S
Stefan Weil 已提交
528
      cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr);
529 530 531 532 533 534 535 536 537 538 539 540 541 542 543
    }
    BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
    boxchar->set_page(page_);
    boxchar->AddBox(cluster_rect.x, cluster_rect.y,
                    cluster_rect.width, cluster_rect.height);
    start_byte_to_box[start_byte_index] = boxchar;
  } while (pango_layout_iter_next_cluster(cluster_iter));
  pango_layout_iter_free(cluster_iter);

  // There is a subtle bug in the cluster text reported by the PangoLayoutIter
  // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
  // around this, we use text reported using the PangoGlyphIter which is
  // accurate.
  // TODO(ranjith): Revisit whether this is still needed in newer versions of
  // pango.
544
  std::vector<std::string> cluster_text;
545 546 547
  if (GetClusterStrings(&cluster_text)) {
    ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
    int ind = 0;
R
Ray Smith 已提交
548
    for (std::map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
549 550 551 552 553 554
         it != start_byte_to_box.end(); ++it, ++ind) {
      it->second->mutable_ch()->swap(cluster_text[ind]);
    }
  }

  // Append to the boxchars list in byte order.
R
Ray Smith 已提交
555
  std::vector<BoxChar*> page_boxchars;
556
  page_boxchars.reserve(start_byte_to_box.size());
557
  std::string last_ch;
R
Ray Smith 已提交
558
  for (std::map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
559 560 561 562 563 564 565 566 567 568 569
       it != start_byte_to_box.end(); ++it) {
    if (it->second->ch() == kWordJoinerUTF8) {
      // Skip zero-width joiner characters (ZWJs) here.
      delete it->second;
    } else {
      page_boxchars.push_back(it->second);
    }
  }
  CorrectBoxPositionsToLayout(&page_boxchars);

  if (render_fullwidth_latin_) {
R
Ray Smith 已提交
570
    for (std::map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
571 572
         it != start_byte_to_box.end(); ++it) {
      // Convert fullwidth Latin characters to their halfwidth forms.
573
      std::string half(ConvertFullwidthLatinToBasicLatin(it->second->ch()));
574 575 576 577 578 579 580 581 582 583 584 585
      it->second->mutable_ch()->swap(half);
    }
  }

  // Merge the character boxes into word boxes if we are rendering n-grams.
  if (output_word_boxes_) {
    MergeBoxCharsToWords(&page_boxchars);
  }

  boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());

  // Compute the page bounding box
S
Stefan Weil 已提交
586 587
  Box* page_box = nullptr;
  Boxa* all_boxes = nullptr;
588
  for (size_t i = 0; i < page_boxchars.size(); ++i) {
S
Stefan Weil 已提交
589
    if (page_boxchars[i]->box() == nullptr) continue;
R
Ray Smith 已提交
590
    if (all_boxes == nullptr) all_boxes = boxaCreate(0);
591 592
    boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
  }
S
Stefan Weil 已提交
593 594
  if (all_boxes != nullptr) {
    boxaGetExtent(all_boxes, nullptr, nullptr, &page_box);
R
Ray Smith 已提交
595
    boxaDestroy(&all_boxes);
S
Stefan Weil 已提交
596
    if (page_boxes_ == nullptr) page_boxes_ = boxaCreate(0);
R
Ray Smith 已提交
597 598
    boxaAddBox(page_boxes_, page_box, L_INSERT);
  }
599 600 601
}


R
Ray Smith 已提交
602 603
// Moves 'boxchars' from layout coordinates to page coordinates.
// Horizontal text is only translated by the margins. Vertical text is
// translated to the top-right margin corner and then rotated about that corner
// by the negated rotation of the layout's base gravity.
void StringRenderer::CorrectBoxPositionsToLayout(
    std::vector<BoxChar*>* boxchars) {
  if (!vertical_text_) {
    BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);
    return;
  }
  const double rotation = - pango_gravity_to_rotation(
      pango_context_get_base_gravity(pango_layout_get_context(layout_)));
  const auto corner_x = page_width_ - h_margin_;
  BoxChar::TranslateBoxes(corner_x, v_margin_, boxchars);
  BoxChar::RotateBoxes(rotation, corner_x, v_margin_,
                       0, boxchars->size(), boxchars);
}

615 616
// Removes from '*utf8_text' every whitespace-delimited word for which
// font_.CanRenderString() is false. All whitespace is kept as is.
// Returns the number of words removed and reports it if it is non-zero.
int StringRenderer::StripUnrenderableWords(std::string* utf8_text) const {
  const char* text = utf8_text->c_str();
  const size_t text_length = utf8_text->length();
  std::string kept_text;
  int num_dropped = 0;

  size_t offset = 0;
  while (offset < text_length) {
    // Copy the whitespace in front of the next word unchanged.
    const int space_len = SpanUTF8Whitespace(text + offset);
    kept_text.append(text + offset, space_len);
    offset += space_len;
    if (offset == text_length) break;

    // Keep the word only if the font can render it.
    const int word_len = SpanUTF8NotWhitespace(text + offset);
    if (!font_.CanRenderString(text + offset, word_len)) {
      ++num_dropped;
    } else {
      kept_text.append(text + offset, word_len);
    }
    offset += word_len;
  }
  utf8_text->swap(kept_text);

  if (num_dropped > 0) {
    tprintf("Stripped %d unrenderable words\n", num_dropped);
  }
  return num_dropped;
}

642 643
int StringRenderer::RenderToGrayscaleImage(const char* text, int text_length,
                                           Pix** pix) {
R
Ray Smith 已提交
644
  Pix* orig_pix = nullptr;
645 646 647 648 649 650 651 652
  int offset = RenderToImage(text, text_length, &orig_pix);
  if (orig_pix) {
    *pix = pixConvertTo8(orig_pix, false);
    pixDestroy(&orig_pix);
  }
  return offset;
}

653 654
int StringRenderer::RenderToBinaryImage(const char* text, int text_length,
                                        int threshold, Pix** pix) {
R
Ray Smith 已提交
655
  Pix* orig_pix = nullptr;
656 657 658 659 660 661 662 663 664 665 666 667 668 669 670
  int offset = RenderToImage(text, text_length, &orig_pix);
  if (orig_pix) {
    Pix* gray_pix = pixConvertTo8(orig_pix, false);
    pixDestroy(&orig_pix);
    *pix = pixThresholdToBinary(gray_pix, threshold);
    pixDestroy(&gray_pix);
  } else {
    *pix = orig_pix;
  }
  return offset;
}

// Add word joiner (WJ) characters between adjacent non-space characters except
// immediately before a combiner.
/* static */
671 672
std::string StringRenderer::InsertWordJoiners(const std::string& text) {
  std::string out_str;
673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693
  const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(),
                                                      text.length());
  for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length());
       it != it_end; ++it) {
    // Add the symbol to the output string.
    out_str.append(it.utf8_data(), it.utf8_len());
    // Check the next symbol.
    UNICHAR::const_iterator next_it = it;
    ++next_it;
    bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
    bool next_char_is_combiner = (next_it == it_end) ?
        false : IsCombiner(*next_it);
    if (*it != ' ' && *it != '\n' && !next_char_is_boundary &&
        !next_char_is_combiner) {
      out_str += kWordJoinerUTF8;
    }
  }
  return out_str;
}

// Convert halfwidth Basic Latin characters to their fullwidth forms.
694 695
std::string StringRenderer::ConvertBasicLatinToFullwidthLatin(const std::string& str) {
  std::string full_str;
696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713
  const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(),
                                                      str.length());
  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
       it != it_end; ++it) {
    // Convert printable and non-space 7-bit ASCII characters to
    // their fullwidth forms.
    if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
      // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
      char32 full_char = *it + 0xFEE0;
      full_str.append(EncodeAsUTF8(full_char));
    } else {
      full_str.append(it.utf8_data(), it.utf8_len());
    }
  }
  return full_str;
}

// Convert fullwidth Latin characters to their halfwidth forms.
714 715
std::string StringRenderer::ConvertFullwidthLatinToBasicLatin(const std::string& str) {
  std::string half_str;
716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762
  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
       it != it_end; ++it) {
    char32 half_char = FullwidthToHalfwidth(*it);
    // Convert fullwidth Latin characters to their halfwidth forms
    // only if halfwidth forms are printable and non-space 7-bit ASCII.
    if (IsInterchangeValid7BitAscii(half_char) &&
        isprint(half_char) && !isspace(half_char)) {
      half_str.append(EncodeAsUTF8(half_char));
    } else {
      half_str.append(it.utf8_data(), it.utf8_len());
    }
  }
  return half_str;
}

// Returns offset to end of text substring rendered in this method.
int StringRenderer::RenderToImage(const char* text, int text_length,
                                  Pix** pix) {
  if (pix && *pix) pixDestroy(pix);
  InitPangoCairo();

  const int page_offset = FindFirstPageBreakOffset(text, text_length);
  if (!page_offset) {
    return 0;
  }
  start_box_ = boxchars_.size();

  if (!vertical_text_) {
    // Translate by the specified margin
    cairo_translate(cr_, h_margin_, v_margin_);
  } else {
    // Vertical text rendering is achieved by a two-step process of first
    // performing regular horizontal layout with character orientation set to
    // EAST, and then translating and rotating the layout before rendering onto
    // the desired image surface. The settings required for the former step are
    // done within InitPangoCairo().
    //
    // Translate to the top-right margin of page
    cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
    // Rotate the layout
    double rotation = - pango_gravity_to_rotation(
        pango_context_get_base_gravity(pango_layout_get_context(layout_)));
    tlog(2, "Rotating by %f radians\n", rotation);
    cairo_rotate(cr_, rotation);
    pango_cairo_update_layout(cr_, layout_);
  }
763
  std::string page_text(text, page_offset);
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781
  if (render_fullwidth_latin_) {
    // Convert Basic Latin to their fullwidth forms.
    page_text = ConvertBasicLatinToFullwidthLatin(page_text);
  }
  if (strip_unrenderable_words_) {
    StripUnrenderableWords(&page_text);
  }
  if (drop_uncovered_chars_ &&
      !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
    int num_dropped = font_.DropUncoveredChars(&page_text);
    if (num_dropped) {
      tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
    }
  }
  if (add_ligatures_) {
    // Add ligatures wherever possible, including custom ligatures.
    page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
  }
782 783 784
  if (underline_start_prob_ > 0) {
    SetWordUnderlineAttributes(page_text);
  }
785 786 787 788 789 790 791 792 793 794 795 796 797 798

  pango_layout_set_text(layout_, page_text.c_str(), page_text.length());

  if (pix) {
    // Set a white background for the target image surface.
    cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0);  // sets drawing colour to white
    // Fill the surface with the active colour (if you don't do this, you will
    // be given a surface with a transparent background to draw on)
    cairo_paint(cr_);
    // Set the ink color to black
    cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
    // If the target surface or transformation properties of the cairo instance
    // have changed, update the pango layout to reflect this
    pango_cairo_update_layout(cr_, layout_);
799 800 801 802 803
    {
      DISABLE_HEAP_LEAK_CHECK;  // for Fontconfig
      // Draw the pango layout onto the cairo surface
      pango_cairo_show_layout(cr_, layout_);
    }
804 805 806 807 808 809 810 811 812 813 814 815 816
    *pix = CairoARGB32ToPixFormat(surface_);
  }
  ComputeClusterBoxes();
  FreePangoCairo();
  // Update internal state variables.
  ++page_;
  return page_offset;
}

// Render a string to an image, returning it as an 8 bit pix.  Behaves as
// RenderString, except that it ignores the font set at construction and works
// through all the fonts, returning 0 until they are exhausted, at which point
// it returns the value it should have returned all along, but no pix this time.
T
theraysmith@gmail.com 已提交
817
// Fonts that don't contain a given proportion of the characters in the string
818 819 820 821 822 823 824 825 826 827
// get skipped.
// Fonts that work each get rendered and the font name gets added
// to the image.
// NOTE that no boxes are produced by this function.
//
// Example usage: To render a null terminated char-array "txt"
//
// int offset = 0;
// do {
//   Pix *pix;
T
theraysmith@gmail.com 已提交
828
//   offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset,
R
Ray Smith 已提交
829 830
//                                            strlen(txt + offset), nullptr,
//                                            &pix);
831 832 833
//   ...
// } while (offset < strlen(text));
//
T
theraysmith@gmail.com 已提交
834 835
int StringRenderer::RenderAllFontsToImage(double min_coverage,
                                          const char* text, int text_length,
836
                                          std::string* font_used, Pix** image) {
S
Stefan Weil 已提交
837
  *image = nullptr;
838 839
  // Select a suitable font to render the title with.
  const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
840
  std::string title_font;
841
  if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate),
S
Stefan Weil 已提交
842
                             &title_font, nullptr)) {
843 844 845 846 847 848 849
    tprintf("WARNING: Could not find a font to render image title with!\n");
    title_font = "Arial";
  }
  title_font += " 8";
  tlog(1, "Selected title font: %s\n", title_font.c_str());
  if (font_used) font_used->clear();

850
  std::string orig_font = font_.DescriptionName();
851 852 853 854 855 856 857 858 859 860
  if (char_map_.empty()) {
    total_chars_ = 0;
    // Fill the hash table and use that for computing which fonts to use.
    for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
         it != UNICHAR::end(text, text_length); ++it) {
      ++total_chars_;
      ++char_map_[*it];
    }
    tprintf("Total chars = %d\n", total_chars_);
  }
861
  const std::vector<std::string>& all_fonts = FontUtils::ListAvailableFonts();
862 863

  for (size_t i = font_index_; i < all_fonts.size(); ++i) {
864 865
    ++font_index_;
    int raw_score = 0;
R
Ray Smith 已提交
866 867
    int ok_chars =
        FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr);
T
theraysmith@gmail.com 已提交
868
    if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887
      set_font(all_fonts[i]);
      int offset = RenderToBinaryImage(text, text_length, 128, image);
      ClearBoxes();  // Get rid of them as they are garbage.
      const int kMaxTitleLength = 1024;
      char title[kMaxTitleLength];
      snprintf(title, kMaxTitleLength, kTitleTemplate,
               all_fonts[i].c_str(), ok_chars,
               100.0 * ok_chars / total_chars_, raw_score,
               100.0 * raw_score / char_map_.size());
      tprintf("%s\n", title);
      // This is a good font! Store the offset to return once we've tried all
      // the fonts.
      if (offset) {
        last_offset_ = offset;
        if (font_used) *font_used = all_fonts[i];
      }
      // Add the font to the image.
      set_font(title_font);
      v_margin_ /= 8;
S
Stefan Weil 已提交
888
      Pix* title_image = nullptr;
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903
      RenderToBinaryImage(title, strlen(title), 128, &title_image);
      pixOr(*image, *image, title_image);
      pixDestroy(&title_image);

      v_margin_ *= 8;
      set_font(orig_font);
      // We return the real offset only after cycling through the list of fonts.
      return 0;
    } else {
      tprintf("Font %s failed with %d hits = %.2f%%\n",
              all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
    }
  }
  font_index_ = 0;
  char_map_.clear();
904
  return last_offset_ == 0 ? -1 : last_offset_;
905 906 907
}

}  // namespace tesseract