Various fixes, including memory leak in fixspace, font labels on output,...

Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@567 d0cd1f9f-072b-0410-8dd7-cf729c803f20

Various fixes, including memory leak in fixspace, font labels on output,...
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@567 d0cd1f9f-072b-0410-8dd7-cf729c803f20
3e8c0bc2 · theraysmith · c81483f7 · 3e8c0bc2 · 3e8c0bc2 · 3e8c0bc2
12 changed files
--- a/ccmain/Makefile.in
+++ b/ccmain/Makefile.in
@@ -72,13 +72,13 @@ am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"
 LTLIBRARIES = $(lib_LTLIBRARIES)
 libtesseract_main_la_DEPENDENCIES =  \
 	../wordrec/libtesseract_wordrec.la
-am_libtesseract_main_la_OBJECTS = adaptions.lo applybox.lo charcut.lo \
-	control.lo cube_control.lo cube_reco_context.lo docqual.lo \
-	fixspace.lo fixxht.lo imgscale.lo osdetect.lo output.lo \
-	pagesegmain.lo pagewalk.lo paramsd.lo pgedit.lo reject.lo \
-	scaleimg.lo recogtraining.lo tesseract_cube_combiner.lo \
-	tessbox.lo tessedit.lo tesseractclass.lo tessvars.lo \
-	tfacepp.lo thresholder.lo tstruct.lo werdit.lo
+am_libtesseract_main_la_OBJECTS = adaptions.lo applybox.lo control.lo \
+	cube_control.lo cube_reco_context.lo docqual.lo fixspace.lo \
+	fixxht.lo imgscale.lo osdetect.lo output.lo pagesegmain.lo \
+	pagewalk.lo paramsd.lo pgedit.lo reject.lo scaleimg.lo \
+	recogtraining.lo tesseract_cube_combiner.lo tessbox.lo \
+	tessedit.lo tesseractclass.lo tessvars.lo tfacepp.lo \
+	thresholder.lo werdit.lo
 libtesseract_main_la_OBJECTS = $(am_libtesseract_main_la_OBJECTS)
 libtesseract_main_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
@@ -280,25 +280,25 @@ AM_CPPFLAGS = \

 EXTRA_DIST = tessembedded.cpp
 include_HEADERS = \
-    charcut.h control.h cube_reco_context.h \
+    control.h cube_reco_context.h \
    docqual.h fixspace.h \
    imgscale.h osdetect.h output.h \
    paramsd.h pgedit.h reject.h scaleimg.h \
    tessbox.h tessedit.h tessembedded.h tesseractclass.h \
    tesseract_cube_combiner.h \
-    tessvars.h tfacep.h tfacepp.h thresholder.h tstruct.h \
+    tessvars.h tfacep.h tfacepp.h thresholder.h \
    werdit.h

 lib_LTLIBRARIES = libtesseract_main.la
 libtesseract_main_la_SOURCES = \
    adaptions.cpp applybox.cpp \
-    charcut.cpp control.cpp cube_control.cpp cube_reco_context.cpp \
+    control.cpp cube_control.cpp cube_reco_context.cpp \
    docqual.cpp fixspace.cpp fixxht.cpp \
    imgscale.cpp osdetect.cpp output.cpp pagesegmain.cpp \
    pagewalk.cpp paramsd.cpp pgedit.cpp reject.cpp scaleimg.cpp \
    recogtraining.cpp tesseract_cube_combiner.cpp \
    tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \
-    tfacepp.cpp thresholder.cpp tstruct.cpp \
+    tfacepp.cpp thresholder.cpp \
    werdit.cpp

 libtesseract_main_la_LIBADD = \
@@ -381,7 +381,6 @@ distclean-compile:

 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/adaptions.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/applybox.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/charcut.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/control.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cube_control.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cube_reco_context.Plo@am__quote@
@@ -405,7 +404,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessvars.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tfacepp.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/thresholder.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tstruct.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/werdit.Plo@am__quote@

 .cpp.o:

--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
@@ -486,7 +486,7 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
 // Returns false if the re-segmentation fails.
 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
 // applies a full search on the classifier results to find the best classified
-// segmentation. As a compromise to obtain better recall, 1-1 ambigiguity
+// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
 // substitutions ARE used.
 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
                                 WERD_RES* word_res) {

--- a/ccmain/control.cpp
+++ b/ccmain/control.cpp
@@ -170,9 +170,22 @@ void Tesseract::recog_all_words(PAGE_RES* page_res,
                                const TBOX* target_word_box,
                                const char* word_config,
                                int dopasses) {
+  // TODO(rays): Normalize the "classify word" interface.  For instance:
+  //   (1) word.denorm gets set in word->SetupForRecognition() but does
+  //       not get invoked for cube alone.  Maybe it should?
+  //   (2) run_cube() checks whether word->best_choice is NULL, and if
+  //       so determines that "neither cube nor tess have an answer."
+  //       However, if tess gets run at all, the first thing it does is
+  //       call word->SetupForRecognition which inserts a poorly scoring
+  //       best_answer.  So what is the way that an engine (tess or cube)
+  //       says "I don't have an answer": an empty list or a single
+  //       poorly scoring best_answer?
+
                                 // reset page iterator
  // If we only intend to run cube - run it and return.
  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
+    PrepareForCubeOCR();
+    mutable_splitter()->Clear();
    run_cube(page_res);
    return;
  }
@@ -394,6 +407,8 @@ void Tesseract::recog_all_words(PAGE_RES* page_res,
  // ****************** Pass 5 *******************
  // If cube is loaded and its combiner is present, run it.
  if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
+    PrepareForCubeOCR();
+    mutable_splitter()->Clear();
    run_cube(page_res);
  }

@@ -520,6 +535,7 @@ void Tesseract::classify_word_pass1(WERD_RES *word,  // word to do
        }
        // Send word to adaptive classifier for training.
        word->BestChoiceToCorrectText(unicharset);
+        set_word_fonts(word, blob_choices);
        LearnWord(NULL, rejmap, word);
      }

@@ -1164,14 +1180,14 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
 */
 static void find_modal_font(           //good chars in word
                     STATS *fonts,     //font stats
-                     inT8 *font_out,   //output font
+                     inT16 *font_out,   //output font
                     inT8 *font_count  //output count
                    ) {
-  inT8 font;                     //font index
+  inT16 font;                     //font index
  inT32 count;                   //pile couat

  if (fonts->get_total () > 0) {
-    font = (inT8) fonts->mode ();
+    font = (inT16) fonts->mode ();
    *font_out = font;
    count = fonts->pile_count (font);
    *font_count = count < MAX_INT8 ? count : MAX_INT8;
@@ -1216,68 +1232,57 @@ void Tesseract::set_word_fonts(WERD_RES *word,
    if (word_ch_id >= PreTrainedTemplates->NumClasses)
      return;  // This must be a cube word.
    choice_it.set_to_list(char_it.data());
+    if (tessedit_debug_fonts) {
+      tprintf("Examining fonts in %s\n", word->best_choice->debug_string(
+          getDict().getUnicharset()).string());
+    }
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
      if (blob_ch_id == word_ch_id) {
-        int config = choice_it.data()->config();
-        int config2 = choice_it.data()->config2();
-        int font_set_id = PreTrainedTemplates->Class[blob_ch_id]->font_set_id;
-        if (font_set_id >= 0 && config >= 0 && font_set_id < fontset_size) {
-          FontSet font_set = get_fontset_table().get(font_set_id);
-          if (tessedit_debug_fonts) {
-            tprintf("%s(%d/%d)", unicharset.id_to_unichar(blob_ch_id),
-                    config, config2);
-            const char* fontname;
-            if (config >= font_set.size) {
-              fontname = "Unknown";
-            } else {
-              fontname = get_fontinfo_table().get(
-                font_set.configs[config]).name;
-            }
-            tprintf("%s(%d,%d=%s)\n",
-                    unicharset.id_to_unichar(choice_it.data()->unichar_id()),
-                    font_set_id, config, fontname);
-          }
-          // 1st choice config gets 2 pts, 2nd choice 1 pt.
-          if (config < font_set.size) {
-            int fontinfo_id = font_set.configs[config];
-            if (fontinfo_id < fontinfo_size) {
-              fonts.add(fontinfo_id, 2);
-            }
-          }
-          if (config2 >= 0 && config2 < font_set.size) {
-            int fontinfo_id = font_set.configs[config2];
-            if (fontinfo_id < fontinfo_size) {
-              fonts.add(fontinfo_id, 1);
-            }
-          }
+        if (tessedit_debug_fonts) {
+          tprintf("%s font %s (%d) font2 %s (%d)\n",
+                  getDict().getUnicharset().id_to_unichar(blob_ch_id),
+                  choice_it.data()->fontinfo_id() < 0 ? "unknown" :
+                  fontinfo_table_.get(choice_it.data()->fontinfo_id()).name,
+                  choice_it.data()->fontinfo_id(),
+                  choice_it.data()->fontinfo_id2() < 0 ? "unknown" :
+                  fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name,
+                  choice_it.data()->fontinfo_id2());
+        }
+        // 1st choice font gets 2 pts, 2nd choice 1 pt.
+        if (choice_it.data()->fontinfo_id() >= 0) {
+          fonts.add(choice_it.data()->fontinfo_id(), 2);
+        }
+        if (choice_it.data()->fontinfo_id2() >= 0) {
+          fonts.add(choice_it.data()->fontinfo_id2(), 1);
        }
        break;
      }
    }
  }
-  find_modal_font(&fonts, &word->font1, &word->font1_count);
-  find_modal_font(&fonts, &word->font2, &word->font2_count);
+  find_modal_font(&fonts, &word->fontinfo_id, &word->fontinfo_id_count);
+  find_modal_font(&fonts, &word->fontinfo_id2, &word->fontinfo_id2_count);
  // All the blobs get the word's best choice font.
  for (int i = 0; i < word->best_choice->length(); ++i) {
-    word->best_choice_fontinfo_ids.push_back(word->font1);
+    word->best_choice_fontinfo_ids.push_back(word->fontinfo_id);
  }
-  if (word->font1_count > 0) {
-    FontInfo fi = fontinfo_table_.get(word->font1);
+  if (word->fontinfo_id_count > 0) {
+    FontInfo fi = fontinfo_table_.get(word->fontinfo_id);
    if (tessedit_debug_fonts) {
-      if (word->font2_count > 0) {
+      if (word->fontinfo_id2_count > 0) {
        tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
-                fi.name, word->font1_count,
-                fontinfo_table_.get(word->font2).name, word->font2_count);
+                fi.name, word->fontinfo_id_count,
+                fontinfo_table_.get(word->fontinfo_id2).name,
+                word->fontinfo_id2_count);
      } else {
        tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
-                fi.name, word->font1_count);
+                fi.name, word->fontinfo_id_count);
      }
    }
    // 1st choices got 2 pts, so we need to halve the score for the mode.
-    word->italic = (fi.is_italic() ? 1 : -1) * (word->font1_count + 1) / 2;
-    word->bold = (fi.is_bold() ? 1 : -1) * (word->font1_count + 1) / 2;
+    word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
+    word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
  }
 }

@@ -1292,7 +1297,7 @@ void Tesseract::font_recognition_pass(  //good chars in word
                                      PAGE_RES_IT &page_res_it) {
  inT32 length;                  //of word
  inT32 count;                   //of a feature
-  inT8 doc_font;                 //modal font
+  inT16 doc_font;                 //modal font
  inT8 doc_font_count;           //modal font
  WERD_RES *word;                //current word
  STATS doc_fonts (0, get_fontinfo_table().size() ?
@@ -1305,8 +1310,8 @@ void Tesseract::font_recognition_pass(  //good chars in word
    if (!save_best_choices) {  // set_blob_choices() does a deep clear
      word->best_choice->set_blob_choices(NULL);
    }
-    doc_fonts.add(word->font1, word->font1_count);
-    doc_fonts.add(word->font2, word->font2_count);
+    doc_fonts.add(word->fontinfo_id, word->fontinfo_id_count);
+    doc_fonts.add(word->fontinfo_id2, word->fontinfo_id2_count);
    page_res_it.forward();
  }
  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
@@ -1320,11 +1325,11 @@ void Tesseract::font_recognition_pass(  //good chars in word
    length = word->best_choice->length();

    // 1st choices got 2 pts, so we need to halve the score for the mode.
-    count = (word->font1_count + 1) / 2;
+    count = (word->fontinfo_id_count + 1) / 2;
    if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
-      word->font1 = doc_font;
+      word->fontinfo_id = doc_font;
      // Counts only get 1 as it came from the doc.
-      word->font1_count = 1;
+      word->fontinfo_id_count = 1;
      word->italic = fi.is_italic() ? 1 : -1;
      word->bold = fi.is_bold() ? 1 : -1;
    }

--- a/ccmain/cube_control.cpp
+++ b/ccmain/cube_control.cpp
@@ -196,7 +196,7 @@ static WERD_CHOICE *create_werd_choice(
 /**********************************************************************
 * init_cube_objects
 *
- * Instantitates Tesseract object's CubeRecoContext and TesseractCubeCombiner.
+ * Instantiates Tesseract object's CubeRecoContext and TesseractCubeCombiner.
 * Returns false if cube context could not be created or if load_combiner is
 * true, but the combiner could not be loaded.
 **********************************************************************/
@@ -260,7 +260,10 @@ void Tesseract::run_cube(
       page_res_it.forward()) {
    WERD_RES* word = page_res_it.word();
    TBOX word_box = word->word->bounding_box();
-    const BLOCK* block = word->denorm.block();
+    // TODO(rays): Instead of page_res_it.block()->block maybe use
+    //             word->denorm.block() once TODO in
+    //             Tesseract::recog_all_words() is addressed.
+    const BLOCK* block = page_res_it.block()->block;
    if (block != NULL && (block->re_rotation().x() != 1.0f ||
          block->re_rotation().y() != 0.0f)) {
      // TODO(rays) We have to rotate the bounding box to get the true coords.

--- a/ccmain/docqual.cpp
+++ b/ccmain/docqual.cpp
@@ -737,12 +737,15 @@ void Tesseract::merge_tess_fails(WERD_RES *word_res) {
      unicharset,
      NewPermanentTessCallback(this, &Tesseract::BothSpaces), NULL,
      word_res->best_choice->blob_choices())) {
-    tprintf("Post:bc len=%d, rejmap=%d, boxword=%d, chopword=%d, rebuild=%d\n",
-            word_res->best_choice->length(),
-            word_res->reject_map.length(),
-            word_res->box_word->length(),
-            word_res->chopped_word->NumBlobs(),
-            word_res->rebuild_word->NumBlobs());
+    if (crunch_debug) {
+      tprintf("Post:bc len=%d, rejmap=%d, boxword=%d, chopword=%d,"
+              " rebuild=%d\n",
+              word_res->best_choice->length(),
+              word_res->reject_map.length(),
+              word_res->box_word->length(),
+              word_res->chopped_word->NumBlobs(),
+              word_res->rebuild_word->NumBlobs());
+    }
    int len = word_res->best_choice->length();
    ASSERT_HOST(word_res->reject_map.length() == len);
    ASSERT_HOST(word_res->box_word->length() == len);

--- a/ccmain/fixspace.cpp
+++ b/ccmain/fixspace.cpp
@@ -668,7 +668,6 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
  old_word_res->combination = TRUE;   // Kludge to force deep copy
  *new_word_res = *old_word_res;      // deep copy
  old_word_res->combination = FALSE;  // Undo kludge
-  new_word_res->combination = FALSE;  // Undo kludge
  current_perm_it.add_to_end(new_word_res);

  break_noisiest_blob_word(current_perm);
@@ -754,7 +753,9 @@ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
  }

-  worst_word_it.add_before_then_move(new WERD_RES(new_word));
+  WERD_RES* new_word_res = new WERD_RES(new_word);
+  new_word_res->combination = TRUE;
+  worst_word_it.add_before_then_move(new_word_res);

  word_res->ClearResults();
 }

--- a/ccmain/osdetect.cpp
+++ b/ccmain/osdetect.cpp
@@ -403,7 +403,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
    int prev_id = -1;
    int prev_script;
    int prev_class_id = -1;
-    int prev_config = -1;
+    int prev_fontinfo_id = -1;
    const char* prev_unichar = "";
    const char* unichar = "";
    float next_best_score = -1.0;
@@ -427,7 +427,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
        prev_script = choice->script_id();
        prev_unichar = unichar;
        prev_class_id = choice->unichar_id();
-        prev_config = choice->config();
+        prev_fontinfo_id = choice->fontinfo_id();
      } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
        ++script_count;
        next_best_score = -choice->certainty();
@@ -451,11 +451,9 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {

      // Workaround for Fraktur
      if (prev_id == latin_id_) {
-        int font_set_id = tess_->PreTrainedTemplates->
-            Class[prev_class_id]->font_set_id;
-        if (font_set_id >= 0 && prev_config >= 0) {
-          FontInfo fi = tess_->get_fontinfo_table().get(
-              tess_->get_fontset_table().get(font_set_id).configs[prev_config]);
+        if (prev_fontinfo_id >= 0) {
+          const FontInfo &fi =
+              tess_->get_fontinfo_table().get(prev_fontinfo_id);
          //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
          //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
          //       fi.is_serif(), fi.is_fraktur(),

--- a/ccmain/pagesegmain.cpp
+++ b/ccmain/pagesegmain.cpp
@@ -102,8 +102,7 @@ static Pix* RemoveEnclosingCircle(Pix* pixs) {

 /**
 * Segment the page according to the current value of tessedit_pageseg_mode.
- * If the pix_binary_ member is not NULL, it is used as the source image,
- * and copied to image, otherwise it just uses image as the input.
+ * pix_binary_ is used as the source image and should not be NULL.
 * On return the blocks list owns all the constructed page layout.
 */
 int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
@@ -169,7 +168,8 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
  }

  if (blocks->empty()) {
-    tprintf("Empty page\n");
+    if (textord_debug_tabfind)
+      tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }


--- a/ccmain/pgedit.cpp
+++ b/ccmain/pgedit.cpp
@@ -721,7 +721,7 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
    BoxWord* box_word = word_res->box_word;
    int length = box_word->length();
-    int font_id = word_res->font1;
+    int font_id = word_res->fontinfo_id;
    if (font_id < 0) font_id = 0;
    const UnicityTable<FontInfo> &font_table = get_fontinfo_table();
    FontInfo font_info = font_table.get(font_id);

--- a/ccmain/tessedit.cpp
+++ b/ccmain/tessedit.cpp
@@ -97,7 +97,9 @@ void Tesseract::read_config_file(const char *filename, bool init_only) {
 bool Tesseract::init_tesseract_lang_data(
    const char *arg0, const char *textbase, const char *language,
    OcrEngineMode oem, char **configs, int configs_size,
-    bool configs_init_only) {
+    const GenericVector<STRING> *vars_vec,
+    const GenericVector<STRING> *vars_values,
+    bool set_only_init_params) {
  // Set the basename, compute the data directory.
  main_setup(arg0, textbase);

@@ -129,7 +131,20 @@ bool Tesseract::init_tesseract_lang_data(
  // language-specific variables from [lang].traineddata file, so that custom
  // config files can override values in [lang].traineddata file.
  for (int i = 0; i < configs_size; ++i) {
-    read_config_file(configs[i], configs_init_only);
+    read_config_file(configs[i], set_only_init_params);
+  }
+
+  // Set params specified in vars_vec (done after setting params from config
+  // files, so that params in vars_vec can override those from files).
+  if (vars_vec != NULL && vars_values != NULL) {
+    for (int i = 0; i < vars_vec->size(); ++i) {
+      if (!ParamUtils::SetParam((*vars_vec)[i].string(),
+                                (*vars_values)[i].string(),
+                                set_only_init_params, this->params())) {
+        tprintf("Error setting param %s\n", (*vars_vec)[i].string());
+        exit(1);
+      }
+    }
  }

  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
@@ -192,9 +207,12 @@ bool Tesseract::init_tesseract_lang_data(
 int Tesseract::init_tesseract(
    const char *arg0, const char *textbase, const char *language,
    OcrEngineMode oem, char **configs, int configs_size,
-    bool configs_init_only) {
+    const GenericVector<STRING> *vars_vec,
+    const GenericVector<STRING> *vars_values,
+    bool set_only_init_params) {
  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
-                                configs_size, configs_init_only)) {
+                                configs_size, vars_vec, vars_values,
+                                set_only_init_params)) {
    return -1;
  }
  // If only Cube will be used, skip loading Tesseract classifier's
@@ -216,8 +234,8 @@ int Tesseract::init_tesseract(
 int Tesseract::init_tesseract_lm(const char *arg0,
                   const char *textbase,
                   const char *language) {
-  if (!init_tesseract_lang_data(arg0, textbase, language,
-                                OEM_TESSERACT_ONLY, NULL, 0, false))
+  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
+                                NULL, 0, NULL, NULL, false))
    return -1;
  getDict().Load();
  tessdata_manager.End();

--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@@ -68,6 +68,14 @@ Tesseract::Tesseract()
                  "Whitelist of chars to recognize", this->params()),
    BOOL_INIT_MEMBER(tessedit_ambigs_training, false,
                "Perform training for ambiguities", this->params()),
+    INT_MEMBER(pageseg_devanagari_split_strategy,
+              tesseract::ShiroRekhaSplitter::NO_SPLIT,
+              "Whether to use the top-line splitting process for Devanagari "
+              "documents while performing page-segmentation.", this->params()),
+    INT_MEMBER(ocr_devanagari_split_strategy,
+              tesseract::ShiroRekhaSplitter::NO_SPLIT,
+              "Whether to use the top-line splitting process for Devanagari "
+              "documents while performing ocr.", this->params()),
    STRING_MEMBER(tessedit_write_params_to_file, "",
                  "Write all parameters to the given file.", this->params()),
    BOOL_MEMBER(tessedit_adapt_to_char_fragments, true,
@@ -383,6 +391,7 @@ void Tesseract::Clear() {
  deskew_ = FCOORD(1.0f, 0.0f);
  reskew_ = FCOORD(1.0f, 0.0f);
  orig_image_changed_ = false;
+  splitter_.Clear();
 }

 void Tesseract::SetBlackAndWhitelist() {
@@ -391,4 +400,61 @@ void Tesseract::SetBlackAndWhitelist() {
                                     tessedit_char_whitelist.string());
 }

+// Prepares the binary image for page segmentation by running the splitter
+// with the strategy held in pageseg_devanagari_split_strategy.
+void Tesseract::PrepareForPageseg() {
+  // Perform shiro-rekha (top-line) splitting and, if Split() reports success,
+  // replace the current binary image with the newly split image.
+  splitter_.set_orig_pix(pix_binary());
+  splitter_.set_pageseg_split_strategy(
+      (ShiroRekhaSplitter::SplitStrategy)
+      ((inT32)pageseg_devanagari_split_strategy));
+  if (splitter_.Split(true)) {
+    ASSERT_HOST(splitter_.splitted_image());
+    splitter_.CopySplittedImageTo(NULL, &pix_binary_);
+    orig_image_changed_ = true;
+  }
+}
+
+// Prepares the binary image and the segmentation in block_list for Tesseract
+// OCR. The current segmentation is required by this method.
+void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
+                                  Tesseract* osd_tess, OSResults* osr) {
+  // Create the blobs to OCR, making use of the segmentation already
+  // available in block_list.
+  splitter_.set_segmentation_block_list(block_list);
+  splitter_.set_ocr_split_strategy(
+      (ShiroRekhaSplitter::SplitStrategy)
+      ((inT32)ocr_devanagari_split_strategy));
+  if (splitter_.Split(false)) {
+    ASSERT_HOST(splitter_.splitted_image());
+    splitter_.CopySplittedImageTo(NULL, &pix_binary_);
+    orig_image_changed_ = true;
+    // If the split strategies used before pageseg and ocr are the same, the
+    // segmentation obtained from the second round can be used going forward.
+    // Otherwise, the page-segmentation (& importantly, the word segmentation)
+    // of the first round is kept and only its blobs are refreshed.
+    if (splitter_.HasDifferentSplitStrategies()) {
+      // Segment into a scratch list and feed its blobs to the splitter.
+      BLOCK_LIST new_segmentation;
+      SegmentPage(NULL, &new_segmentation, osd_tess, osr);
+      C_BLOB_LIST new_blobs;
+      ExtractBlobsFromSegmentation(&new_segmentation, &new_blobs);
+      splitter_.RefreshSegmentationWithNewBlobs(&new_blobs);
+    } else {
+      block_list->clear();
+      SegmentPage(NULL, block_list, osd_tess, osr);
+    }
+  }
+}
+
+// Prepares the binary image for Cube OCR: if an earlier split replaced
+// pix_binary_, the original image is copied back and the flag is reset.
+void Tesseract::PrepareForCubeOCR() {
+  if (orig_image_changed_) {
+    // Revert to the original image, as Cube prefers it to the split one.
+    splitter_.CopyOriginalImageTo(NULL, &pix_binary_);
+    orig_image_changed_ = false;
+  }
+}
 }  // namespace tesseract
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@@ -22,11 +22,13 @@
 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__

 #include "allheaders.h"
+#include "genericvector.h"
 #include "params.h"
 #include "wordrec.h"
 #include "ocrclass.h"
 #include "control.h"
 #include "docqual.h"
+#include "devanagari_processing.h"
 #include "textord.h"

 class PAGE_RES;
@@ -159,6 +161,12 @@ class Tesseract : public Wordrec {
    return pixGetHeight(pix_binary_);
  }

+  const ShiroRekhaSplitter& splitter() const {
+    return splitter_;
+  }
+  ShiroRekhaSplitter* mutable_splitter() {
+    return &splitter_;
+  }
  const Textord& textord() const {
    return textord_;
  }
@@ -172,6 +180,24 @@ class Tesseract : public Wordrec {

  void SetBlackAndWhitelist();

+  // Perform steps to prepare underlying binary image/other data structures for
+  // page segmentation. Uses the strategy specified in the parameter
+  // pageseg_devanagari_split_strategy to perform splitting while preparing for
+  // page segmentation.
+  void PrepareForPageseg();
+
+  // Perform steps to prepare underlying binary image/other data structures for
+  // Tesseract OCR. The current segmentation is required by this method.
+  // Uses the strategy specified in the parameter
+  // ocr_devanagari_split_strategy to perform splitting while preparing for
+  // Tesseract OCR.
+  void PrepareForTessOCR(BLOCK_LIST* block_list,
+                         Tesseract* osd_tess, OSResults* osr);
+
+  // Perform steps to prepare underlying binary image/other data structures for
+  // Cube OCR.
+  void PrepareForCubeOCR();
+
  int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                  Tesseract* osd_tess, OSResults* osr);
  void SetupWordScripts(BLOCK_LIST* blocks);
@@ -228,11 +254,12 @@ class Tesseract : public Wordrec {
  void fix_hyphens(WERD_RES* word_res,
                   BLOB_CHOICE_LIST_CLIST *blob_choices);
  void set_word_fonts(
-      WERD_RES *word,  // word to adapt to
+      WERD_RES *word,  // set fonts of this word
      BLOB_CHOICE_LIST_CLIST *blob_choices);  // detailed results
  void font_recognition_pass(  //good chars in word
                             PAGE_RES_IT &page_res_it);
  BOOL8 check_debug_pt(WERD_RES *word, int location);
+
  //// cube_control.cpp ///////////////////////////////////////////////////
  bool init_cube_objects(bool load_combiner,
                         TessdataManager *tessdata_manager);
@@ -267,11 +294,14 @@ class Tesseract : public Wordrec {
                     OcrEngineMode oem,
                     char **configs,
                     int configs_size,
-                     bool configs_init_only);
+                     const GenericVector<STRING> *vars_vec,
+                     const GenericVector<STRING> *vars_values,
+                     bool set_only_init_params);
  int init_tesseract(const char *datapath,
                     const char *language,
                     OcrEngineMode oem) {
-    return init_tesseract(datapath, NULL, language, oem, NULL, 0, false);
+    return init_tesseract(datapath, NULL, language, oem,
+                          NULL, 0, NULL, NULL, false);
  }

  int init_tesseract_lm(const char *arg0,
@@ -287,7 +317,9 @@ class Tesseract : public Wordrec {
                                OcrEngineMode oem,
                                char **configs,
                                int configs_size,
-                                bool configs_init_only);
+                                const GenericVector<STRING> *vars_vec,
+                                const GenericVector<STRING> *vars_values,
+                                bool set_only_init_params);

  //// pgedit.h //////////////////////////////////////////////////////////
  SVMenuNode *build_menu_new();
@@ -555,6 +587,14 @@ class Tesseract : public Wordrec {
               "Whitelist of chars to recognize");
  BOOL_VAR_H(tessedit_ambigs_training, false,
             "Perform training for ambiguities");
+  INT_VAR_H(pageseg_devanagari_split_strategy,
+            tesseract::ShiroRekhaSplitter::NO_SPLIT,
+            "Whether to use the top-line splitting process for Devanagari "
+            "documents while performing page-segmentation.");
+  INT_VAR_H(ocr_devanagari_split_strategy,
+            tesseract::ShiroRekhaSplitter::NO_SPLIT,
+            "Whether to use the top-line splitting process for Devanagari "
+            "documents while performing ocr.");
  STRING_VAR_H(tessedit_write_params_to_file, "",
               "Write all parameters to the given file.");
  BOOL_VAR_H(tessedit_adapt_to_char_fragments, true,
@@ -781,6 +821,9 @@ class Tesseract : public Wordrec {
  STRING word_config_;
  Pix* pix_binary_;
  Pix* pix_grey_;
+  // The shiro-rekha splitter object which is used to split top-lines in
+  // Devanagari words to provide a better word and grapheme segmentation.
+  ShiroRekhaSplitter splitter_;
  // The boolean records if the currently set
  // pix_binary_ member has been modified due to any processing so that this
  // may hurt Cube's recognition phase.