summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'tesseract/src/ccmain')
-rw-r--r--tesseract/src/ccmain/adaptions.cpp114
-rw-r--r--tesseract/src/ccmain/applybox.cpp807
-rw-r--r--tesseract/src/ccmain/control.cpp2110
-rw-r--r--tesseract/src/ccmain/control.h38
-rw-r--r--tesseract/src/ccmain/docqual.cpp981
-rw-r--r--tesseract/src/ccmain/docqual.h43
-rw-r--r--tesseract/src/ccmain/equationdetect.cpp1516
-rw-r--r--tesseract/src/ccmain/equationdetect.h273
-rw-r--r--tesseract/src/ccmain/fixspace.cpp885
-rw-r--r--tesseract/src/ccmain/fixspace.h36
-rw-r--r--tesseract/src/ccmain/fixxht.cpp216
-rw-r--r--tesseract/src/ccmain/linerec.cpp307
-rw-r--r--tesseract/src/ccmain/ltrresultiterator.cpp492
-rw-r--r--tesseract/src/ccmain/mutableiterator.cpp24
-rw-r--r--tesseract/src/ccmain/mutableiterator.h63
-rw-r--r--tesseract/src/ccmain/osdetect.cpp579
-rw-r--r--tesseract/src/ccmain/output.cpp418
-rw-r--r--tesseract/src/ccmain/output.h37
-rw-r--r--tesseract/src/ccmain/pageiterator.cpp635
-rw-r--r--tesseract/src/ccmain/pagesegmain.cpp420
-rw-r--r--tesseract/src/ccmain/pagewalk.cpp43
-rw-r--r--tesseract/src/ccmain/par_control.cpp73
-rw-r--r--tesseract/src/ccmain/paragraphs.cpp2590
-rw-r--r--tesseract/src/ccmain/paragraphs.h110
-rw-r--r--tesseract/src/ccmain/paragraphs_internal.h314
-rw-r--r--tesseract/src/ccmain/paramsd.cpp365
-rw-r--r--tesseract/src/ccmain/paramsd.h134
-rw-r--r--tesseract/src/ccmain/pgedit.cpp981
-rw-r--r--tesseract/src/ccmain/pgedit.h71
-rw-r--r--tesseract/src/ccmain/recogtraining.cpp238
-rw-r--r--tesseract/src/ccmain/reject.cpp792
-rw-r--r--tesseract/src/ccmain/reject.h39
-rw-r--r--tesseract/src/ccmain/resultiterator.cpp752
-rw-r--r--tesseract/src/ccmain/superscript.cpp610
-rw-r--r--tesseract/src/ccmain/tessbox.cpp75
-rw-r--r--tesseract/src/ccmain/tessedit.cpp474
-rw-r--r--tesseract/src/ccmain/tesseractclass.cpp707
-rw-r--r--tesseract/src/ccmain/tesseractclass.h1163
-rw-r--r--tesseract/src/ccmain/tessvars.cpp24
-rw-r--r--tesseract/src/ccmain/tessvars.h27
-rw-r--r--tesseract/src/ccmain/tfacepp.cpp322
-rw-r--r--tesseract/src/ccmain/thresholder.cpp334
-rw-r--r--tesseract/src/ccmain/werdit.cpp68
-rw-r--r--tesseract/src/ccmain/werdit.h34
44 files changed, 20334 insertions, 0 deletions
diff --git a/tesseract/src/ccmain/adaptions.cpp b/tesseract/src/ccmain/adaptions.cpp
new file mode 100644
index 00000000..e07bf58c
--- /dev/null
+++ b/tesseract/src/ccmain/adaptions.cpp
@@ -0,0 +1,114 @@
+/**********************************************************************
+ * File: adaptions.cpp (Formerly adaptions.c)
+ * Description: Functions used to adapt to blobs already confidently
+ * identified
+ * Author: Chris Newton
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cctype>
+#include <cstring>
+#include "tessvars.h"
+#include "reject.h"
+#include "control.h"
+#include "stopper.h"
+#include "tesseractclass.h"
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+namespace tesseract {
+// Decides whether the classifier should adapt to (learn from) this word.
+// |mode| is a bit set indexed by the MODES enum below: the first two bits
+// select the source of acceptance (chopper would adapt / tess accepted),
+// the remaining bits enable veto checks that can reject the word.
+// Returns true only if an enabled source bit is set on the word and no
+// enabled veto check fires.
+bool Tesseract::word_adaptable( //should we adapt?
+ WERD_RES* word,
+ uint16_t mode) {
+ if (tessedit_adaption_debug) {
+ tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
+ word->best_choice->unichar_string().c_str(),
+ word->best_choice->rating(), word->best_choice->certainty());
+ }
+
+ bool status = false;
+ // View the mode word as individual flag bits, indexed by MODES.
+ BITS16 flags(mode);
+
+ enum MODES
+ {
+ ADAPTABLE_WERD,
+ ACCEPTABLE_WERD,
+ CHECK_DAWGS,
+ CHECK_SPACES,
+ CHECK_ONE_ELL_CONFLICT,
+ CHECK_AMBIG_WERD
+ };
+
+ /*
+ 0: NO adaption
+ */
+ if (mode == 0) {
+ if (tessedit_adaption_debug) tprintf("adaption disabled\n");
+ return false;
+ }
+
+ if (flags[ADAPTABLE_WERD]) {
+ status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
+ if (tessedit_adaption_debug && !status) {
+ tprintf("tess_would_adapt bit is false\n");
+ }
+ }
+
+ if (flags[ACCEPTABLE_WERD]) {
+ status |= word->tess_accepted;
+ if (tessedit_adaption_debug && !status) {
+ tprintf("tess_accepted bit is false\n");
+ }
+ }
+
+ if (!status) { // If not set then
+ return false; // ignore other checks
+ }
+
+ // Veto: only adapt to words validated by a dictionary or number permuter.
+ if (flags[CHECK_DAWGS] &&
+ (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
+ (word->best_choice->permuter () != NUMBER_PERM)) {
+ if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
+ return false;
+ }
+
+ // Veto: words with a one/ell/I confusion are unreliable training samples.
+ if (flags[CHECK_ONE_ELL_CONFLICT] && one_ell_conflict (word, false)) {
+ if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
+ return false;
+ }
+
+ // Veto: embedded spaces mean the segmentation is suspect.
+ if (flags[CHECK_SPACES] &&
+ (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
+ if (tessedit_adaption_debug) tprintf("word contains spaces\n");
+ return false;
+ }
+
+ // Veto: dangerous ambiguities could teach the classifier the wrong shape.
+ if (flags[CHECK_AMBIG_WERD] &&
+ word->best_choice->dangerous_ambig_found()) {
+ if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
+ return false;
+ }
+
+ if (tessedit_adaption_debug) {
+ tprintf("returning status %d\n", status);
+ }
+ return status;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/applybox.cpp b/tesseract/src/ccmain/applybox.cpp
new file mode 100644
index 00000000..a8d1bbcd
--- /dev/null
+++ b/tesseract/src/ccmain/applybox.cpp
@@ -0,0 +1,807 @@
+/**********************************************************************
+ * File: applybox.cpp (Formerly applybox.c)
+ * Description: Re segment rows according to box file data
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef DISABLED_LEGACY_ENGINE
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+#include "allheaders.h"
+#include "boxread.h"
+#endif // ndef DISABLED_LEGACY_ENGINE
+#include "pageres.h"
+#include <tesseract/unichar.h>
+#include "unicharset.h"
+#include "tesseractclass.h"
+#include "genericvector.h"
+
+#ifndef DISABLED_LEGACY_ENGINE
+/** Max number of blobs to classify together in FindSegmentation. */
+const int kMaxGroupSize = 4;
+/// Max fraction of median allowed as deviation in xheight before switching
+/// to median.
+const double kMaxXHeightDeviationFraction = 0.125;
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/**
+ * The box file is assumed to contain box definitions, one per line, of the
+ * following format for blob-level boxes:
+ * @verbatim
+ * <UTF8 str> <left> <bottom> <right> <top> <page id>
+ * @endverbatim
+ * and for word/line-level boxes:
+ * @verbatim
+ * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
+ * @endverbatim
+ * NOTES:
+ * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
+ *
+ * <page id> is 0-based, and the page number is used for multipage input (tiff).
+ *
+ * In the blob-level form, each line represents a recognizable unit, which may
+ * be several UTF-8 bytes, but there is a bounding box around each recognizable
+ * unit, and no classifier is needed to train in this mode (bootstrapping.)
+ *
+ * In the word/line-level form, the line begins with the literal "WordStr", and
+ * the bounding box bounds either a whole line or a whole word. The recognizable
+ * units in the word/line are listed after the # at the end of the line and
+ * are space delimited, ignoring any original spaces on the line.
+ * Eg.
+ * @verbatim
+ * word -> #w o r d
+ * multi word line -> #m u l t i w o r d l i n e
+ * @endverbatim
+ * The recognizable units must be space-delimited in order to allow multiple
+ * unicodes to be used for a single recognizable unit, eg Hindi.
+ *
+ * In this mode, the classifier must have been pre-trained with the desired
+ * character set, or it will not be able to find the character segmentations.
+ */
+
+namespace tesseract {
+
+#ifndef DISABLED_LEGACY_ENGINE
+// Clears the text label of every word in every row of every block, so
+// that box application starts from a clean slate and previously-labelled
+// words are not mistaken for already-processed ones.
+static void clear_any_old_text(BLOCK_LIST *block_list) {
+ BLOCK_IT block_it(block_list);
+ for (block_it.mark_cycle_pt();
+ !block_it.cycled_list(); block_it.forward()) {
+ ROW_IT row_it(block_it.data()->row_list());
+ for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+ WERD_IT word_it(row_it.data()->word_list());
+ for (word_it.mark_cycle_pt();
+ !word_it.cycled_list(); word_it.forward()) {
+ word_it.data()->set_text("");
+ }
+ }
+ }
+}
+
+// Applies the box file based on the image name filename, and resegments
+// the words in the block_list (page), with:
+// blob-mode: one blob per line in the box file, words as input.
+// word/line-mode: one blob per space-delimited unit after the #, and one word
+// per line in the box file. (See comment above for box file format.)
+// If find_segmentation is true, (word/line mode) then the classifier is used
+// to re-segment words/lines to match the space-delimited truth string for
+// each box. In this case, the input box may be for a word or even a whole
+// text line, and the output words will contain multiple blobs corresponding
+// to the space-delimited input string.
+// With find_segmentation false, no classifier is needed, but the chopper
+// can still be used to correctly segment touching characters with the help
+// of the input boxes.
+// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
+// from normal classification, ie. with a word, chopped_word, rebuild_word,
+// seam_array, denorm, box_word, and best_state, but NO best_choice or
+// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
+// Instead, the correct_text member of WERD_RES is set, and this may be later
+// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
+// is not required before calling ApplyBoxTraining.
+PAGE_RES* Tesseract::ApplyBoxes(const char* filename,
+ bool find_segmentation,
+ BLOCK_LIST *block_list) {
+ std::vector<TBOX> boxes;
+ std::vector<STRING> texts, full_texts;
+ // Read every box for the requested page; failure means no usable boxfile.
+ if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts,
+ nullptr)) {
+ return nullptr; // Can't do it.
+ }
+
+ const int box_count = boxes.size();
+ int box_failures = 0;
+
+ // In word mode, we use the boxes to make a word for each box, but
+ // in blob mode we use the existing words and maximally chop them first.
+ PAGE_RES* page_res = find_segmentation ?
+ nullptr : SetupApplyBoxes(boxes, block_list);
+ clear_any_old_text(block_list);
+
+ // Walk the boxes in file order; neighbours are passed so disputed blobs
+ // can be assigned to the best-overlapping box.
+ for (int i = 0; i < box_count; i++) {
+ bool foundit = false;
+ if (page_res != nullptr) {
+ // Blob mode: match each box to chopped blobs within existing words.
+ foundit = ResegmentCharBox(page_res,
+ (i == 0) ? nullptr : &boxes[i - 1],
+ boxes[i],
+ (i == box_count - 1) ? nullptr : &boxes[i + 1],
+ full_texts[i].c_str());
+ } else {
+ // Word/line mode: gather overlapping blobs into a new word per box.
+ foundit = ResegmentWordBox(block_list, boxes[i],
+ (i == box_count - 1) ? nullptr : &boxes[i + 1],
+ texts[i].c_str());
+ }
+ if (!foundit) {
+ box_failures++;
+ ReportFailedBox(i, boxes[i], texts[i].c_str(),
+ "FAILURE! Couldn't find a matching blob");
+ }
+ }
+
+ if (page_res == nullptr) {
+ // In word/line mode, we now maximally chop all the words and resegment
+ // them with the classifier.
+ page_res = SetupApplyBoxes(boxes, block_list);
+ ReSegmentByClassification(page_res);
+ }
+ if (applybox_debug > 0) {
+ tprintf("APPLY_BOXES:\n");
+ tprintf(" Boxes read from boxfile: %6d\n", box_count);
+ if (box_failures > 0)
+ tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
+ }
+ TidyUp(page_res);
+ return page_res;
+}
+
+// Helper computes median xheight in the image.
+// Returns the median of the per-row x-heights across all blocks.
+// NOTE(review): block_it.data() is read before the loop to size the STATS
+// histogram from the first block's bounding box — assumes block_list is
+// non-empty; confirm callers guarantee that.
+static double MedianXHeight(BLOCK_LIST *block_list) {
+ BLOCK_IT block_it(block_list);
+ STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
+ for (block_it.mark_cycle_pt();
+ !block_it.cycled_list(); block_it.forward()) {
+ ROW_IT row_it(block_it.data()->row_list());
+ for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+ xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
+ }
+ }
+ return xheights.median();
+}
+
+/// Any row xheight that is significantly different from the median is set
+/// to the median.
+void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
+ const double median_xheight = MedianXHeight(block_list);
+ // Rows deviating from the median by more than this are considered bogus.
+ const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
+ // Clamp each outlier row's xheight to the page median.
+ BLOCK_IT b_it(block_list);
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ BLOCK* block = b_it.data();
+ ROW_IT r_it(block->row_list());
+ for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
+ ROW* row = r_it.data();
+ const double diff = fabs(row->x_height() - median_xheight);
+ if (diff > max_deviation) {
+ if (applybox_debug) {
+ tprintf("row xheight=%g, but median xheight = %g\n",
+ row->x_height(), median_xheight);
+ }
+ row->set_x_height(static_cast<float>(median_xheight));
+ }
+ }
+ }
+}
+
+/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
+/// All fuzzy spaces are removed, and all the words are maximally chopped.
+PAGE_RES* Tesseract::SetupApplyBoxes(const std::vector<TBOX>& boxes,
+ BLOCK_LIST *block_list) {
+ // Normalize outlier row xheights first so chopping sees sane metrics.
+ PreenXHeights(block_list);
+ // Strip all fuzzy space markers to simplify the PAGE_RES.
+ BLOCK_IT b_it(block_list);
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ BLOCK* block = b_it.data();
+ ROW_IT r_it(block->row_list());
+ for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
+ ROW* row = r_it.data();
+ WERD_IT w_it(row->word_list());
+ for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+ WERD* word = w_it.data();
+ if (word->cblob_list()->empty()) {
+ // Empty words contribute nothing; drop them.
+ delete w_it.extract();
+ } else {
+ word->set_flag(W_FUZZY_SP, false);
+ word->set_flag(W_FUZZY_NON, false);
+ }
+ }
+ }
+ }
+ // Build the PAGE_RES and maximally chop every word in it.
+ auto* page_res = new PAGE_RES(false, block_list, nullptr);
+ PAGE_RES_IT pr_it(page_res);
+ WERD_RES* word_res;
+ while ((word_res = pr_it.word()) != nullptr) {
+ MaximallyChopWord(boxes, pr_it.block()->block,
+ pr_it.row()->row, word_res);
+ pr_it.forward();
+ }
+ return page_res;
+}
+
+/// Tests the chopper by exhaustively running chop_one_blob.
+/// The word_res will contain filled chopped_word, seam_array, denorm,
+/// box_word and best_state for the maximally chopped word.
+void Tesseract::MaximallyChopWord(const std::vector<TBOX>& boxes,
+ BLOCK* block, ROW* row,
+ WERD_RES* word_res) {
+ // If recognition setup fails, still populate rebuild data and bail out.
+ if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
+ tessedit_ocr_engine_mode, nullptr,
+ classify_bln_numeric_mode,
+ textord_use_cjk_fp_model,
+ poly_allow_detailed_fx,
+ row, block)) {
+ word_res->CloneChoppedToRebuild();
+ return;
+ }
+ if (chop_debug) {
+ tprintf("Maximally chopping word at:");
+ word_res->word->bounding_box().print();
+ }
+ GenericVector<BLOB_CHOICE*> blob_choices;
+ ASSERT_HOST(!word_res->chopped_word->blobs.empty());
+ // Seed one fake choice per existing blob with strictly distinct ratings.
+ auto rating = static_cast<float>(INT8_MAX);
+ for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
+ // The rating and certainty are not quite arbitrary. Since
+ // select_blob_to_chop uses the worst certainty to choose, they all have
+ // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
+ // in here, and then divide by e each time they are chopped, which
+ // should guarantee a set of unequal values for the whole tree of blobs
+ // produced, however much chopping is required. The chops are thus only
+ // limited by the ability of the chopper to find suitable chop points,
+ // and not by the value of the certainties.
+ auto* choice =
+ new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
+ blob_choices.push_back(choice);
+ rating -= 0.125f;
+ }
+ const double e = exp(1.0); // The base of natural logs.
+ int blob_number;
+ int right_chop_index = 0;
+ if (!assume_fixed_pitch_char_segment) {
+ // We only chop if the language is not fixed pitch like CJK.
+ SEAM* seam = nullptr;
+ // Keep chopping until the chopper can find no further chop point.
+ while ((seam = chop_one_blob(boxes, blob_choices, word_res,
+ &blob_number)) != nullptr) {
+ word_res->InsertSeam(blob_number, seam);
+ BLOB_CHOICE* left_choice = blob_choices[blob_number];
+ // Each chop divides the parent rating by e to keep ratings unequal.
+ rating = left_choice->rating() / e;
+ left_choice->set_rating(rating);
+ left_choice->set_certainty(-rating);
+ // combine confidence w/ serial #
+ auto* right_choice = new BLOB_CHOICE(++right_chop_index,
+ rating - 0.125f, -rating, -1,
+ 0.0f, 0.0f, 0.0f, BCC_FAKE);
+ blob_choices.insert(right_choice, blob_number + 1);
+ }
+ }
+ word_res->CloneChoppedToRebuild();
+ word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
+}
+
+/// Helper to compute the dispute resolution metric.
+/// Disputed blob resolution. The aim is to give the blob to the most
+/// appropriate boxfile box. Most of the time it is obvious, but if
+/// two boxfile boxes overlap significantly it is not. If a small boxfile
+/// box takes most of the blob, and a large boxfile box does too, then
+/// we want the small boxfile box to get it, but if the small box
+/// is much smaller than the blob, we don't want it to get it.
+/// Details of the disputed blob resolution:
+/// Given a box with area A, and a blob with area B, with overlap area C,
+/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
+/// miss metric gets the blob.
+// Computes the miss metric (A-C)(B-C)/(AB) for box areas A, B with
+// intersection area C. 0 means perfect containment both ways; values
+// grow toward 1 as the overlap shrinks relative to either box.
+static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
+ const int overlap_area = box1.intersection(box2).area();
+ const int a = box1.area();
+ const int b = box2.area();
+ ASSERT_HOST(a != 0 && b != 0);
+ return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
+}
+
+/// Gather consecutive blobs that match the given box into the best_state
+/// and corresponding correct_text.
+///
+/// Fights over which box owns which blobs are settled by pre-chopping and
+/// applying the blobs to box or next_box with the least non-overlap.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an appropriate blob for a box.
+///
+/// This means that occasionally, blobs may be incorrectly segmented if the
+/// chopper fails to find a suitable chop point.
+bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
+ const TBOX& box, const TBOX* next_box,
+ const char* correct_text) {
+ if (applybox_debug > 1) {
+ tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
+ }
+ PAGE_RES_IT page_res_it(page_res);
+ WERD_RES* word_res;
+ for (word_res = page_res_it.word(); word_res != nullptr;
+ word_res = page_res_it.forward()) {
+ // Only consider words whose bounding box substantially overlaps the box.
+ if (!word_res->box_word->bounding_box().major_overlap(box))
+ continue;
+ if (applybox_debug > 1) {
+ tprintf("Checking word box:");
+ word_res->box_word->bounding_box().print();
+ }
+ int word_len = word_res->box_word->length();
+ for (int i = 0; i < word_len; ++i) {
+ TBOX char_box = TBOX();
+ // Accumulate a run of consecutive unclaimed blobs that match the box.
+ int blob_count = 0;
+ for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
+ TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
+ if (!blob_box.major_overlap(box))
+ break;
+ if (word_res->correct_text[i + blob_count].length() > 0)
+ break; // Blob is claimed already.
+ if (next_box != nullptr) {
+ const double current_box_miss_metric = BoxMissMetric(blob_box, box);
+ const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
+ if (applybox_debug > 2) {
+ tprintf("Checking blob:");
+ blob_box.print();
+ tprintf("Current miss metric = %g, next = %g\n",
+ current_box_miss_metric, next_box_miss_metric);
+ }
+ if (current_box_miss_metric > next_box_miss_metric)
+ break; // Blob is a better match for next box.
+ }
+ char_box += blob_box;
+ }
+ if (blob_count > 0) {
+ if (applybox_debug > 1) {
+ tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
+ }
+ // Reject if the gathered blobs differ from the box AND the box
+ // overlaps a neighbour by more than 3 pixels: likely a bad box.
+ if (!char_box.almost_equal(box, 3) &&
+ ((next_box != nullptr && box.x_gap(*next_box) < -3)||
+ (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
+ return false;
+ }
+ // We refine just the box_word, best_state and correct_text here.
+ // The rebuild_word is made in TidyUp.
+ // blob_count blobs are put together to match the box. Merge the
+ // box_word boxes, save the blob_count in the state and the text.
+ word_res->box_word->MergeBoxes(i, i + blob_count);
+ word_res->best_state[i] = blob_count;
+ word_res->correct_text[i] = correct_text;
+ if (applybox_debug > 2) {
+ tprintf("%d Blobs match: blob box:", blob_count);
+ word_res->box_word->BlobBox(i).print();
+ tprintf("Matches box:");
+ box.print();
+ if (next_box != nullptr) {
+ tprintf("With next box:");
+ next_box->print();
+ }
+ }
+ // Eliminate best_state and correct_text entries for the consumed
+ // blobs.
+ for (int j = 1; j < blob_count; ++j) {
+ word_res->best_state.remove(i + 1);
+ word_res->correct_text.remove(i + 1);
+ }
+ // Assume that no box spans multiple source words, so we are done with
+ // this box.
+ if (applybox_debug > 1) {
+ tprintf("Best state = ");
+ for (int j = 0; j < word_res->best_state.size(); ++j) {
+ tprintf("%d ", word_res->best_state[j]);
+ }
+ tprintf("\n");
+ tprintf("Correct text = [[ ");
+ for (int j = 0; j < word_res->correct_text.size(); ++j) {
+ tprintf("%s ", word_res->correct_text[j].c_str());
+ }
+ tprintf("]]\n");
+ }
+ return true;
+ }
+ }
+ }
+ if (applybox_debug > 0) {
+ tprintf("FAIL!\n");
+ }
+ return false; // Failure.
+}
+
+/// Consume all source blobs that strongly overlap the given box,
+/// putting them into a new word, with the correct_text label.
+/// Fights over which box owns which blobs are settled by
+/// applying the blobs to box or next_box with the least non-overlap.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an overlapping blob for a box.
+bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
+ const TBOX& box, const TBOX* next_box,
+ const char* correct_text) {
+ if (applybox_debug > 1) {
+ tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
+ }
+ // The new word is created lazily on the first blob claimed by this box.
+ WERD* new_word = nullptr;
+ BLOCK_IT b_it(block_list);
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ BLOCK* block = b_it.data();
+ // Prune by overlap at each level: block, row, word, then blob.
+ if (!box.major_overlap(block->pdblk.bounding_box()))
+ continue;
+ ROW_IT r_it(block->row_list());
+ for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
+ ROW* row = r_it.data();
+ if (!box.major_overlap(row->bounding_box()))
+ continue;
+ WERD_IT w_it(row->word_list());
+ for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+ WERD* word = w_it.data();
+ if (applybox_debug > 2) {
+ tprintf("Checking word:");
+ word->bounding_box().print();
+ }
+ if (word->text() != nullptr && word->text()[0] != '\0')
+ continue; // Ignore words that are already done.
+ if (!box.major_overlap(word->bounding_box()))
+ continue;
+ C_BLOB_IT blob_it(word->cblob_list());
+ for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
+ blob_it.forward()) {
+ C_BLOB* blob = blob_it.data();
+ TBOX blob_box = blob->bounding_box();
+ if (!blob_box.major_overlap(box))
+ continue;
+ if (next_box != nullptr) {
+ // Settle the dispute: give the blob to whichever box it
+ // overlaps best, per BoxMissMetric.
+ const double current_box_miss_metric = BoxMissMetric(blob_box, box);
+ const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
+ if (applybox_debug > 2) {
+ tprintf("Checking blob:");
+ blob_box.print();
+ tprintf("Current miss metric = %g, next = %g\n",
+ current_box_miss_metric, next_box_miss_metric);
+ }
+ if (current_box_miss_metric > next_box_miss_metric)
+ continue; // Blob is a better match for next box.
+ }
+ if (applybox_debug > 2) {
+ tprintf("Blob match: blob:");
+ blob_box.print();
+ tprintf("Matches box:");
+ box.print();
+ if (next_box != nullptr) {
+ tprintf("With next box:");
+ next_box->print();
+ }
+ }
+ if (new_word == nullptr) {
+ // Make a new word with a single blob.
+ new_word = word->shallow_copy();
+ new_word->set_text(correct_text);
+ w_it.add_to_end(new_word);
+ }
+ // Move (not copy) the blob from the source word to the new word.
+ C_BLOB_IT new_blob_it(new_word->cblob_list());
+ new_blob_it.add_to_end(blob_it.extract());
+ }
+ }
+ }
+ }
+ if (new_word == nullptr && applybox_debug > 0) tprintf("FAIL!\n");
+ return new_word != nullptr;
+}
+
+/// Resegments the words by running the classifier in an attempt to find the
+/// correct segmentation that produces the required string.
+void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
+ PAGE_RES_IT pr_it(page_res);
+ WERD_RES* word_res;
+ for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
+ const WERD* word = word_res->word;
+ if (word->text() == nullptr || word->text()[0] == '\0')
+ continue; // Ignore words that have no text.
+ // Convert the correct text to a vector of UNICHAR_ID
+ GenericVector<UNICHAR_ID> target_text;
+ // Words whose truth text can't be mapped or segmented are deleted from
+ // the page rather than left half-labelled.
+ if (!ConvertStringToUnichars(word->text(), &target_text)) {
+ tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
+ word->text());
+ pr_it.DeleteCurrentWord();
+ continue;
+ }
+ if (!FindSegmentation(target_text, word_res)) {
+ tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
+ word->text());
+ pr_it.DeleteCurrentWord();
+ continue;
+ }
+ }
+}
+
+/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
+/// @return false if an invalid UNICHAR_ID is encountered.
+bool Tesseract::ConvertStringToUnichars(const char* utf8,
+ GenericVector<UNICHAR_ID>* class_ids) {
+ // Each space-delimited token is one recognizable unit (possibly several
+ // UTF-8 bytes); look up its class id in the unicharset.
+ for (int step = 0; *utf8 != '\0'; utf8 += step) {
+ const char* next_space = strchr(utf8, ' ');
+ if (next_space == nullptr)
+ next_space = utf8 + strlen(utf8);
+ step = next_space - utf8;
+ UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
+ if (class_id == INVALID_UNICHAR_ID) {
+ return false;
+ }
+ // Skip any run of spaces to the start of the next token.
+ while (utf8[step] == ' ')
+ ++step;
+ class_ids->push_back(class_id);
+ }
+ return true;
+}
+
+/// Resegments the word to achieve the target_text from the classifier.
+/// Returns false if the re-segmentation fails.
+/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
+/// applies a full search on the classifier results to find the best classified
+/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
+/// substitutions ARE used.
+bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
+ WERD_RES* word_res) {
+ // Classify all required combinations of blobs and save results in choices.
+ // choices[i][j-1] holds the classification of blobs [i, i+j) for
+ // j = 1..kMaxGroupSize.
+ const int word_length = word_res->box_word->length();
+ auto* choices =
+ new GenericVector<BLOB_CHOICE_LIST*>[word_length];
+ for (int i = 0; i < word_length; ++i) {
+ for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
+ BLOB_CHOICE_LIST* match_result = classify_piece(
+ word_res->seam_array, i, i + j - 1, "Applybox",
+ word_res->chopped_word, word_res->blamer_bundle);
+ if (applybox_debug > 2) {
+ tprintf("%d+%d:", i, j);
+ print_ratings_list("Segment:", match_result, unicharset);
+ }
+ choices[i].push_back(match_result);
+ }
+ }
+ // Search the segmentation graph for the target text. Must be an exact
+ // match. Using wildcards makes it difficult to find the correct
+ // segmentation even when it is there.
+ word_res->best_state.clear();
+ GenericVector<int> search_segmentation;
+ float best_rating = 0.0f;
+ SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
+ &search_segmentation, &best_rating, &word_res->best_state);
+ for (int i = 0; i < word_length; ++i)
+ choices[i].delete_data_pointers();
+ delete [] choices;
+ if (word_res->best_state.empty()) {
+ // Build the original segmentation and if it is the same length as the
+ // truth, assume it will do.
+ // A seam without splits marks an original blob boundary; seams with
+ // splits were introduced by maximal chopping and group into one unit.
+ int blob_count = 1;
+ for (int s = 0; s < word_res->seam_array.size(); ++s) {
+ SEAM* seam = word_res->seam_array[s];
+ if (!seam->HasAnySplits()) {
+ word_res->best_state.push_back(blob_count);
+ blob_count = 1;
+ } else {
+ ++blob_count;
+ }
+ }
+ word_res->best_state.push_back(blob_count);
+ if (word_res->best_state.size() != target_text.size()) {
+ word_res->best_state.clear(); // No good. Original segmentation bad size.
+ return false;
+ }
+ }
+ // Record the truth text per segmented unit for later training use.
+ word_res->correct_text.clear();
+ for (int i = 0; i < target_text.size(); ++i) {
+ word_res->correct_text.push_back(
+ STRING(unicharset.id_to_unichar(target_text[i])));
+ }
+ return true;
+}
+
+/// Recursive helper to find a match to the target_text (from text_index
+/// position) in the choices (from choices_pos position).
+/// @param choices is an array of GenericVectors, of length choices_length,
+/// with each element representing a starting position in the word, and the
+/// #GenericVector holding classification results for a sequence of consecutive
+/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
+/// @param choices_pos
+/// @param choices_length
+/// @param target_text
+/// @param text_index
+/// @param rating
+/// @param segmentation
+/// @param best_rating
+/// @param best_segmentation
+void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
+ int choices_pos, int choices_length,
+ const GenericVector<UNICHAR_ID>& target_text,
+ int text_index,
+ float rating, GenericVector<int>* segmentation,
+ float* best_rating,
+ GenericVector<int>* best_segmentation) {
+ const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
+ // Try every group length available at this position (1..kMaxGroupSize).
+ for (int length = 1; length <= choices[choices_pos].size(); ++length) {
+ // Rating of matching choice or worst choice if no match.
+ float choice_rating = 0.0f;
+ // Find the corresponding best BLOB_CHOICE.
+ BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
+ for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
+ choice_it.forward()) {
+ const BLOB_CHOICE* choice = choice_it.data();
+ choice_rating = choice->rating();
+ UNICHAR_ID class_id = choice->unichar_id();
+ if (class_id == target_text[text_index]) {
+ break;
+ }
+ // Search ambigs table.
+ if (class_id < table.size() && table[class_id] != nullptr) {
+ AmbigSpec_IT spec_it(table[class_id]);
+ for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
+ spec_it.forward()) {
+ const AmbigSpec *ambig_spec = spec_it.data();
+ // We'll only do 1-1.
+ if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
+ ambig_spec->correct_ngram_id == target_text[text_index])
+ break;
+ }
+ if (!spec_it.cycled_list())
+ break; // Found an ambig.
+ }
+ }
+ if (choice_it.cycled_list())
+ continue; // No match.
+ // Tentatively take this group and either record a complete match or
+ // recurse for the remaining blobs/characters.
+ segmentation->push_back(length);
+ if (choices_pos + length == choices_length &&
+ text_index + 1 == target_text.size()) {
+ // This is a complete match. If the rating is good record a new best.
+ if (applybox_debug > 2) {
+ tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
+ rating + choice_rating, *best_rating, segmentation->size(),
+ best_segmentation->size());
+ }
+ if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
+ *best_segmentation = *segmentation;
+ *best_rating = rating + choice_rating;
+ }
+ } else if (choices_pos + length < choices_length &&
+ text_index + 1 < target_text.size()) {
+ if (applybox_debug > 3) {
+ tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
+ target_text[text_index],
+ unicharset.id_to_unichar(target_text[text_index]),
+ choice_it.data()->unichar_id() == target_text[text_index]
+ ? "Match" : "Ambig",
+ choices_pos, length);
+ }
+ SearchForText(choices, choices_pos + length, choices_length, target_text,
+ text_index + 1, rating + choice_rating, segmentation,
+ best_rating, best_segmentation);
+ if (applybox_debug > 3) {
+ tprintf("End recursion for %d=%s\n", target_text[text_index],
+ unicharset.id_to_unichar(target_text[text_index]));
+ }
+ }
+ // Backtrack: undo the tentative group before trying the next length.
+ segmentation->truncate(segmentation->size() - 1);
+ }
+}
+
+/// - Counts up the labelled words and the blobs within.
+/// - Deletes all unused or emptied words, counting the unused ones.
+/// - Resets W_BOL and W_EOL flags correctly.
+/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
+void Tesseract::TidyUp(PAGE_RES* page_res) {
+  int ok_blob_count = 0;     // Labelled blobs inside kept words.
+  int bad_blob_count = 0;    // Unlabelled blobs left inside kept words.
+  int ok_word_count = 0;     // Words kept: have at least one labelled blob.
+  int unlabelled_words = 0;  // Words deleted: no labelled blobs at all.
+  PAGE_RES_IT pr_it(page_res);
+  WERD_RES* word_res;
+  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
+    int ok_in_word = 0;
+    int blob_count = word_res->correct_text.size();
+    auto* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
+    word_choice->set_permuter(TOP_CHOICE_PERM);
+    for (int c = 0; c < blob_count; ++c) {
+      if (word_res->correct_text[c].length() > 0) {
+        ++ok_in_word;
+      }
+      // Since we only need a fake word_res->best_choice, the actual
+      // unichar_ids do not matter. Which is fortunate, since TidyUp()
+      // can be called while training Tesseract, at the stage where
+      // unicharset is not meaningful yet.
+      word_choice->append_unichar_id_space_allocated(
+          INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
+    }
+    if (ok_in_word > 0) {
+      // Bug fix: ok_word_count was declared and reported below but never
+      // incremented, so the "in %d words" report always printed 0.
+      ++ok_word_count;
+      ok_blob_count += ok_in_word;
+      bad_blob_count += word_res->correct_text.size() - ok_in_word;
+      word_res->LogNewRawChoice(word_choice);
+      word_res->LogNewCookedChoice(1, false, word_choice);
+    } else {
+      ++unlabelled_words;
+      if (applybox_debug > 0) {
+        tprintf("APPLY_BOXES: Unlabelled word at :");
+        word_res->word->bounding_box().print();
+      }
+      pr_it.DeleteCurrentWord();
+      delete word_choice;
+    }
+  }
+  pr_it.restart_page();
+  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
+    // Denormalize back to a BoxWord.
+    word_res->RebuildBestState();
+    word_res->SetupBoxWord();
+    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
+    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
+  }
+  if (applybox_debug > 0) {
+    tprintf(" Found %d good blobs.\n", ok_blob_count);
+    if (bad_blob_count > 0) {
+      tprintf(" Leaving %d unlabelled blobs in %d words.\n",
+              bad_blob_count, ok_word_count);
+    }
+    if (unlabelled_words > 0)
+      tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
+  }
+}
+
+/// Reports a box that could not be applied: its (1-based) line number in the
+/// box file, its character string, its coordinates and the failure reason.
+void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
+                                const char *box_ch, const char *err_msg) {
+  const int line_number = boxfile_lineno + 1;  // Box file lines are 1-based.
+  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
+          line_number, box_ch, box.left(), box.bottom(), box.right(),
+          box.top(), err_msg);
+}
+
+/// Calls #LearnWord on every word of page_res to extract features for its
+/// labelled blobs, tagging them with the given font name. Features are
+/// stored in an internal buffer; a summary count is printed at the end.
+void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
+  PAGE_RES_IT pr_it(page_res);
+  int trained_words = 0;
+  WERD_RES* word_res = pr_it.word();
+  while (word_res != nullptr) {
+    LearnWord(fontname.c_str(), word_res);
+    ++trained_words;
+    word_res = pr_it.forward();
+  }
+  tprintf("Generated training data for %d words\n", trained_words);
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
+void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
+  PAGE_RES_IT pr_it(page_res);
+  WERD_RES* word_res = pr_it.word();
+  while (word_res != nullptr) {
+    const int num_blobs = word_res->correct_text.size();
+    auto* fake_choice = new WERD_CHOICE(word_res->uch_set, num_blobs);
+    for (int b = 0; b < num_blobs; ++b) {
+      // The part before the first space is the real ground truth, and the
+      // rest is the bounding box location and page number.
+      std::vector<STRING> tokens;
+      word_res->correct_text[b].split(' ', &tokens);
+      const UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
+      fake_choice->append_unichar_id_space_allocated(
+          char_id, word_res->best_state[b], 0.0f, 0.0f);
+    }
+    word_res->ClearWordChoices();
+    word_res->LogNewRawChoice(fake_choice);
+    word_res->LogNewCookedChoice(1, false, fake_choice);
+    word_res = pr_it.forward();
+  }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/control.cpp b/tesseract/src/ccmain/control.cpp
new file mode 100644
index 00000000..50b0fb05
--- /dev/null
+++ b/tesseract/src/ccmain/control.cpp
@@ -0,0 +1,2110 @@
+/******************************************************************
+ * File: control.cpp (Formerly control.c)
+ * Description: Module-independent matcher controller.
+ * Author: Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include <cmath>
+#include <cstdint> // for int16_t, int32_t
+#include <cstdio> // for fclose, fopen, FILE
+#include <ctime> // for clock
+#include <cctype>
+#include "control.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "docqual.h"
+#include "drawfx.h"
+#include "fixspace.h"
+#endif
+#include "lstmrecognizer.h"
+#include <tesseract/ocrclass.h>
+#include "output.h"
+#include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
+#ifndef DISABLED_LEGACY_ENGINE
+#include "reject.h"
+#endif
+#include "sorthelper.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#include "werdit.h"
+
+const char* const kBackUpConfigFile = "tempconfigdata.config";
+// Min believable x-height for any text when refitting as a fraction of
+// original x-height
+const double kMinRefitXHeightFraction = 0.5;
+
+
+/**
+ * Make a temporary word from the selected blobs, run the interactive
+ * recognizer on it, then remove the temporary word again.
+ *
+ * @param page_res page to take the blobs from
+ * @param selection_box only blobs within this box are used
+ */
+namespace tesseract {
+
+void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
+                                  TBOX &selection_box) {
+  PAGE_RES_IT* pseudo_it = make_pseudo_word(page_res, selection_box);
+  if (pseudo_it == nullptr) return;  // No blobs inside the selection.
+  recog_interactive(pseudo_it);
+  pseudo_it->DeleteCurrentWord();
+  delete pseudo_it;
+}
+
+/**
+ * Recognize a single word in interactive mode.
+ *
+ * Runs the legacy engine's pass 2 when no LSTM recognizer is loaded,
+ * otherwise pass 1 (LSTM has no pass 2). Optionally prints quality metrics.
+ *
+ * @param pr_it the page results iterator
+ * @return always true (kept for interface compatibility).
+ */
+bool Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
+  WordData word_data(*pr_it);
+  SetupWordPassN(2, &word_data);
+  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
+  if (lstm_recognizer_ == nullptr) {
+#ifndef DISABLED_LEGACY_ENGINE
+    classify_word_and_language(2, pr_it, &word_data);
+#endif // ndef DISABLED_LEGACY_ENGINE
+  } else {
+    classify_word_and_language(1, pr_it, &word_data);
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  // Optionally report per-word quality statistics (legacy builds only).
+  if (tessedit_debug_quality_metrics) {
+    int16_t char_qual;
+    int16_t good_char_qual;
+    WERD_RES* word_res = pr_it->word();
+    word_char_quality(word_res, &char_qual, &good_char_qual);
+    tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
+            "char_quality: %d; good_char_quality: %d\n",
+            word_res->reject_map.length(),
+            word_blob_quality(word_res),
+            word_outline_errs(word_res), char_qual, good_char_qual);
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+  return true;
+}
+
+// Helper function to check for a target word and handle it appropriately.
+// Inspired by Jetsoft's requirement to process only single words on pass2
+// and beyond.
+// If word_config is not null:
+// If the word_box and target_word_box overlap, read the word_config file
+// else reset to previous config data.
+// return true.
+// else
+// If the word_box and target_word_box overlap or pass <= 1, return true.
+// Note that this function uses a fixed temporary file for storing the previous
+// configs, so it is neither thread-safe, nor process-safe, but the assumption
+// is that it will only be used for one debug window at a time.
+//
+// Since this function is used for debugging (and not to change OCR results)
+// set only debug params from the word config file.
+bool Tesseract::ProcessTargetWord(const TBOX& word_box,
+                                  const TBOX& target_word_box,
+                                  const char* word_config,
+                                  int pass) {
+  if (word_config != nullptr) {
+    if (word_box.major_overlap(target_word_box)) {
+      // Entering the target word: save the current params to the backup
+      // config file (once), then apply the debug params from word_config.
+      if (backup_config_file_ == nullptr) {
+        backup_config_file_ = kBackUpConfigFile;
+        FILE* config_fp = fopen(backup_config_file_, "wb");
+        if (config_fp == nullptr) {
+          tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
+        } else {
+          ParamUtils::PrintParams(config_fp, params());
+          fclose(config_fp);
+        }
+        ParamUtils::ReadParamsFile(word_config,
+                                   SET_PARAM_CONSTRAINT_DEBUG_ONLY,
+                                   params());
+      }
+    } else {
+      // Outside the target word: restore the previously saved params.
+      if (backup_config_file_ != nullptr) {
+        ParamUtils::ReadParamsFile(backup_config_file_,
+                                   SET_PARAM_CONSTRAINT_DEBUG_ONLY,
+                                   params());
+        backup_config_file_ = nullptr;
+      }
+    }
+  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
+    // No debug config: beyond pass 1, only the target word is processed.
+    return false;
+  }
+  return true;
+}
+
+/** If tesseract is to be run, sets the words up ready for it. */
+void Tesseract::SetupAllWordsPassN(int pass_n,
+                                   const TBOX* target_word_box,
+                                   const char* word_config,
+                                   PAGE_RES* page_res,
+                                   std::vector<WordData>* words) {
+  // Collect every word on the page, unless a target box restricts the set.
+  PAGE_RES_IT page_res_it(page_res);
+  page_res_it.restart_page();
+  while (page_res_it.word() != nullptr) {
+    const bool wanted =
+        target_word_box == nullptr ||
+        ProcessTargetWord(page_res_it.word()->word->bounding_box(),
+                          *target_word_box, word_config, 1);
+    if (wanted) words->push_back(WordData(page_res_it));
+    page_res_it.forward();
+  }
+  // Set each word up for recognition (polygonal approximation) and chain
+  // it to its predecessor.
+  for (int index = 0; index < words->size(); ++index) {
+    SetupWordPassN(pass_n, &(*words)[index]);
+    if (index > 0) (*words)[index].prev_word = &(*words)[index - 1];
+  }
+}
+
+// Sets up the single word ready for whichever engine is to be run.
+// Pass 1 performs a full SetupForRecognition; pass 2 only refreshes the
+// cap/x heights. One retry copy of the word is made per sub-language,
+// plus one for the master language.
+void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
+  if (pass_n == 1 || !word->word->done) {
+    if (pass_n == 1) {
+      word->word->SetupForRecognition(unicharset, this, BestPix(),
+                                      tessedit_ocr_engine_mode, nullptr,
+                                      classify_bln_numeric_mode,
+                                      textord_use_cjk_fp_model,
+                                      poly_allow_detailed_fx,
+                                      word->row, word->block);
+    } else if (pass_n == 2) {
+      // TODO(rays) Should we do this on pass1 too?
+      word->word->caps_height = 0.0;
+      // Fall back to the row's x-height if the word has none of its own.
+      if (word->word->x_height == 0.0f)
+        word->word->x_height = word->row->x_height();
+    }
+    word->lang_words.truncate(0);
+    for (int s = 0; s <= sub_langs_.size(); ++s) {
+      // The sub_langs_.size() entry is for the master language.
+      Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
+      auto* word_res = new WERD_RES;
+      word_res->InitForRetryRecognition(*word->word);
+      word->lang_words.push_back(word_res);
+      // LSTM doesn't get setup for pass2.
+      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
+        word_res->SetupForRecognition(
+              lang_t->unicharset, lang_t, BestPix(),
+              lang_t->tessedit_ocr_engine_mode, nullptr,
+              lang_t->classify_bln_numeric_mode,
+              lang_t->textord_use_cjk_fp_model,
+              lang_t->poly_allow_detailed_fx, word->row, word->block);
+      }
+    }
+  }
+}
+
+// Runs word recognition on all the words. If a monitor is supplied it is
+// kept alive, given progress updates, and consulted for timeout/cancel; on
+// cancellation the remaining words are faked out and false is returned.
+bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
+                                   PAGE_RES_IT* pr_it,
+                                   std::vector<WordData>* words) {
+  // TODO(rays) Before this loop can be parallelized (it would yield a massive
+  // speed-up) all remaining member globals need to be converted to local/heap
+  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
+  // added. The results will be significantly different with adaption on, and
+  // deterioration will need investigation.
+  pr_it->restart_page();
+  for (int w = 0; w < words->size(); ++w) {
+    WordData* word = &(*words)[w];
+    if (w > 0) word->prev_word = &(*words)[w - 1];
+    if (monitor != nullptr) {
+      monitor->ocr_alive = true;
+      // Pass 1 covers progress 0-70%, pass 2 covers 70-100%.
+      if (pass_n == 1) {
+        monitor->progress = 70 * w / words->size();
+      } else {
+        monitor->progress = 70 + 30 * w / words->size();
+      }
+      if (monitor->progress_callback2 != nullptr) {
+        TBOX box = pr_it->word()->word->bounding_box();
+        (*monitor->progress_callback2)(monitor, box.left(),
+                                       box.right(), box.top(), box.bottom());
+      }
+      if (monitor->deadline_exceeded() ||
+          (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
+                                                            words->size()))) {
+        // Timeout. Fake out the rest of the words.
+        for (; w < words->size(); ++w) {
+          (*words)[w].word->SetupFake(unicharset);
+        }
+        return false;
+      }
+    }
+    if (word->word->tess_failed) {
+      int s;
+      for (s = 0; s < word->lang_words.size() &&
+           word->lang_words[s]->tess_failed; ++s) {}
+      // If all are failed, skip it. Image words are skipped by this test.
+      // Bug fix: the condition was "s > word->lang_words.size()", which can
+      // never be true (the loop exits with s <= size()), so fully-failed
+      // words were never skipped as the comment above intends.
+      if (s == word->lang_words.size()) continue;
+    }
+    // Sync pr_it with the wth WordData.
+    while (pr_it->word() != nullptr && pr_it->word() != word->word)
+      pr_it->forward();
+    ASSERT_HOST(pr_it->word() != nullptr);
+    bool make_next_word_fuzzy = false;
+    #ifndef DISABLED_LEGACY_ENGINE
+    if (!AnyLSTMLang() &&
+        ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
+      // Needs to be setup again to see the new outlines in the chopped_word.
+      SetupWordPassN(pass_n, word);
+    }
+    #endif // ndef DISABLED_LEGACY_ENGINE
+
+    classify_word_and_language(pass_n, pr_it, word);
+    if (tessedit_dump_choices || debug_noise_removal) {
+      tprintf("Pass%d: %s [%s]\n", pass_n,
+              word->word->best_choice->unichar_string().c_str(),
+              word->word->best_choice->debug_string().c_str());
+    }
+    pr_it->forward();
+    if (make_next_word_fuzzy && pr_it->word() != nullptr) {
+      pr_it->MakeCurrentWordFuzzy();
+    }
+  }
+  return true;
+}
+
+/**
+ * recog_all_words()
+ *
+ * Walk the page_res, recognizing all the words.
+ * If monitor is not null, it is used as a progress monitor/timeout/cancel.
+ * If dopasses is 0, all recognition passes are run,
+ * 1 just pass 1, 2 passes2 and higher.
+ * If target_word_box is not null, special things are done to words that
+ * overlap the target_word_box:
+ * if word_config is not null, the word config file is read for just the
+ * target word(s), otherwise, on pass 2 and beyond ONLY the target words
+ * are processed (Jetsoft modification.)
+ * Returns false if we cancelled prematurely.
+ *
+ * @param page_res page structure
+ * @param monitor progress monitor
+ * @param word_config word_config file
+ * @param target_word_box specifies just to extract a rectangle
+ * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher
+ */
+
+bool Tesseract::recog_all_words(PAGE_RES* page_res,
+                                ETEXT_DESC* monitor,
+                                const TBOX* target_word_box,
+                                const char* word_config,
+                                int dopasses) {
+  PAGE_RES_IT page_res_it(page_res);
+
+  // Minimal rejection on pass 1 implies test adaption as well.
+  if (tessedit_minimal_rej_pass1) {
+    tessedit_test_adaption.set_value (true);
+    tessedit_minimal_rejection.set_value (true);
+  }
+
+  if (dopasses==0 || dopasses==1) {
+    page_res_it.restart_page();
+    // ****************** Pass 1 *******************
+
+  #ifndef DISABLED_LEGACY_ENGINE
+    // If the adaptive classifier is full switch to one we prepared earlier,
+    // ie on the previous page. If the current adaptive classifier is non-empty,
+    // prepare a backup starting at this page, in case it fills up. Do all this
+    // independently for each language.
+    if (AdaptiveClassifierIsFull()) {
+      SwitchAdaptiveClassifier();
+    } else if (!AdaptiveClassifierIsEmpty()) {
+      StartBackupAdaptiveClassifier();
+    }
+    // Now check the sub-langs as well.
+    for (int i = 0; i < sub_langs_.size(); ++i) {
+      if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
+        sub_langs_[i]->SwitchAdaptiveClassifier();
+      } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
+        sub_langs_[i]->StartBackupAdaptiveClassifier();
+      }
+    }
+
+  #endif // ndef DISABLED_LEGACY_ENGINE
+
+    // Set up all words ready for recognition, so that if parallelism is on
+    // all the input and output classes are ready to run the classifier.
+    std::vector<WordData> words;
+    SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
+  #ifndef DISABLED_LEGACY_ENGINE
+    if (tessedit_parallelize) {
+      PrerecAllWordsPar(words);
+    }
+  #endif // ndef DISABLED_LEGACY_ENGINE
+
+    // Reset the per-document statistics before the pass fills them in.
+    stats_.word_count = words.size();
+
+    stats_.dict_words = 0;
+    stats_.doc_blob_quality = 0;
+    stats_.doc_outline_errs = 0;
+    stats_.doc_char_quality = 0;
+    stats_.good_char_count = 0;
+    stats_.doc_good_char_quality = 0;
+
+    most_recently_used_ = this;
+    // Run pass 1 word recognition.
+    if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
+    // Pass 1 post-processing.
+    for (page_res_it.restart_page(); page_res_it.word() != nullptr;
+         page_res_it.forward()) {
+      if (page_res_it.word()->word->flag(W_REP_CHAR)) {
+        fix_rep_char(&page_res_it);
+        continue;
+      }
+
+      // Count dict words.
+      if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
+        ++(stats_.dict_words);
+
+      // Update misadaption log (we only need to do it on pass 1, since
+      // adaption only happens on this pass).
+      if (page_res_it.word()->blamer_bundle != nullptr &&
+          page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
+        page_res->misadaption_log.push_back(
+            page_res_it.word()->blamer_bundle->misadaption_debug());
+      }
+    }
+  }
+
+  // Stop here if only pass 1 was requested.
+  if (dopasses == 1) return true;
+
+  #ifndef DISABLED_LEGACY_ENGINE
+
+  // ****************** Pass 2 *******************
+  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
+      AnyTessLang()) {
+    page_res_it.restart_page();
+    std::vector<WordData> words;
+    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
+    if (tessedit_parallelize) {
+      PrerecAllWordsPar(words);
+    }
+    most_recently_used_ = this;
+    // Run pass 2 word recognition.
+    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
+  }
+
+  // The next passes are only required for Tess-only.
+  if (AnyTessLang() && !AnyLSTMLang()) {
+    // ****************** Pass 3 *******************
+    // Fix fuzzy spaces.
+
+    if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
+        && !tessedit_word_for_word && !right_to_left())
+      fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
+
+    // ****************** Pass 4 *******************
+    if (tessedit_enable_dict_correction) dictionary_correction_pass(page_res);
+    if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res);
+
+    // ****************** Pass 5,6 *******************
+    rejection_passes(page_res, monitor, target_word_box, word_config);
+
+    // ****************** Pass 8 *******************
+    font_recognition_pass(page_res);
+
+    // ****************** Pass 9 *******************
+    // Check the correctness of the final results.
+    blamer_pass(page_res);
+    script_pos_pass(page_res);
+  }
+
+  #endif // ndef DISABLED_LEGACY_ENGINE
+
+  // Write results pass.
+  // This is now redundant, but retained commented so show how to obtain
+  // bounding boxes and style information.
+
+  #ifndef DISABLED_LEGACY_ENGINE
+  // changed by jetsoft
+  // needed for dll to output memory structure
+  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
+    output_pass(page_res_it, target_word_box);
+  // end jetsoft
+  #endif //ndef DISABLED_LEGACY_ENGINE
+
+  const auto pageseg_mode = static_cast<PageSegMode>(
+      static_cast<int>(tessedit_pageseg_mode));
+  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
+
+  // Remove empty words, as these mess up the result iterators.
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
+       page_res_it.forward()) {
+    const WERD_RES* word = page_res_it.word();
+    const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
+                               ? page_res_it.block()->block->pdblk.poly_block()
+                               : nullptr;
+    if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
+        (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
+      page_res_it.DeleteCurrentWord();
+    }
+  }
+
+  if (monitor != nullptr) {
+    monitor->progress = 100;
+  }
+  return true;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Bigram correction pass: walks adjacent pairs of words that share a
+// language model. If the pair of top choices is not a valid dictionary
+// bigram but some pair of alternative choices is, the best such pair (by
+// summed rating) replaces the top choices. Superscript spans are ignored
+// when matching against the bigram dictionary.
+void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
+  PAGE_RES_IT word_it(page_res);
+
+  WERD_RES *w_prev = nullptr;
+  WERD_RES *w = word_it.word();
+  while (true) {
+    w_prev = w;
+    while (word_it.forward() != nullptr &&
+           (!word_it.word() || word_it.word()->part_of_combo)) {
+      // advance word_it, skipping over parts of combos
+    }
+    if (!word_it.word()) break;
+    w = word_it.word();
+    if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
+      continue;
+    }
+    if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
+      if (tessedit_bigram_debug) {
+        tprintf("Skipping because one of the words is W_REP_CHAR\n");
+      }
+      continue;
+    }
+    // Two words sharing the same language model, excellent!
+    GenericVector<WERD_CHOICE *> overrides_word1;
+    GenericVector<WERD_CHOICE *> overrides_word2;
+
+    const STRING orig_w1_str = w_prev->best_choice->unichar_string();
+    const STRING orig_w2_str = w->best_choice->unichar_string();
+    // Strip superscripts from both top choices before the bigram lookup.
+    WERD_CHOICE prev_best(w->uch_set);
+    {
+      int w1start, w1end;
+      w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
+      prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
+    }
+    WERD_CHOICE this_best(w->uch_set);
+    {
+      int w2start, w2end;
+      w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
+      this_best = w->best_choice->shallow_copy(w2start, w2end);
+    }
+
+    if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
+      if (tessedit_bigram_debug) {
+        tprintf("Top choice \"%s %s\" verified by bigram model.\n",
+                orig_w1_str.c_str(), orig_w2_str.c_str());
+      }
+      continue;
+    }
+    if (tessedit_bigram_debug > 2) {
+      tprintf("Examining alt choices for \"%s %s\".\n",
+              orig_w1_str.c_str(), orig_w2_str.c_str());
+    }
+    if (tessedit_bigram_debug > 1) {
+      if (!w_prev->best_choices.singleton()) {
+        w_prev->PrintBestChoices();
+      }
+      if (!w->best_choices.singleton()) {
+        w->PrintBestChoices();
+      }
+    }
+    // Search all choice pairs for valid bigrams, keeping the lowest summed
+    // rating. The first match always wins via the size()==1 test below.
+    float best_rating = 0.0;
+    int best_idx = 0;
+    WERD_CHOICE_IT prev_it(&w_prev->best_choices);
+    for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
+      WERD_CHOICE *p1 = prev_it.data();
+      WERD_CHOICE strip1(w->uch_set);
+      {
+        int p1start, p1end;
+        p1->GetNonSuperscriptSpan(&p1start, &p1end);
+        strip1 = p1->shallow_copy(p1start, p1end);
+      }
+      WERD_CHOICE_IT w_it(&w->best_choices);
+      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+        WERD_CHOICE *p2 = w_it.data();
+        WERD_CHOICE strip2(w->uch_set);
+        {
+          int p2start, p2end;
+          p2->GetNonSuperscriptSpan(&p2start, &p2end);
+          strip2 = p2->shallow_copy(p2start, p2end);
+        }
+        if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
+          overrides_word1.push_back(p1);
+          overrides_word2.push_back(p2);
+          if (overrides_word1.size() == 1 ||
+              p1->rating() + p2->rating() < best_rating) {
+            best_rating = p1->rating() + p2->rating();
+            best_idx = overrides_word1.size() - 1;
+          }
+        }
+      }
+    }
+    if (!overrides_word1.empty()) {
+      // Excellent, we have some bigram matches.
+      if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
+                                            *overrides_word1[best_idx]) &&
+          EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
+                                            *overrides_word2[best_idx])) {
+        if (tessedit_bigram_debug > 1) {
+          tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
+                  "model.\n", orig_w1_str.c_str(), orig_w2_str.c_str());
+        }
+        continue;
+      }
+      const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
+      const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
+      if (new_w1_str != orig_w1_str) {
+        w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
+      }
+      if (new_w2_str != orig_w2_str) {
+        w->ReplaceBestChoice(overrides_word2[best_idx]);
+      }
+      if (tessedit_bigram_debug > 0) {
+        STRING choices_description;
+        int num_bigram_choices
+            = overrides_word1.size() * overrides_word2.size();
+        if (num_bigram_choices == 1) {
+          choices_description = "This was the unique bigram choice.";
+        } else {
+          if (tessedit_bigram_debug > 1) {
+            STRING bigrams_list;
+            const int kMaxChoicesToPrint = 20;
+            for (int i = 0; i < overrides_word1.size() &&
+                 i < kMaxChoicesToPrint; i++) {
+              if (i > 0) { bigrams_list += ", "; }
+              WERD_CHOICE *p1 = overrides_word1[i];
+              WERD_CHOICE *p2 = overrides_word2[i];
+              bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
+            }
+            choices_description = "There were many choices: {";
+            choices_description += bigrams_list;
+            choices_description += "}";
+          } else {
+            choices_description.add_str_int("There were ", num_bigram_choices);
+            choices_description += " compatible bigrams.";
+          }
+        }
+        tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
+                orig_w1_str.c_str(), orig_w2_str.c_str(),
+                new_w1_str.c_str(), new_w2_str.c_str(),
+                choices_description.c_str());
+      }
+    }
+  }
+}
+
+// Passes 5 and 6: gathers per-word reject/quality statistics (pass 5),
+// decides whether the document as a whole is of good quality, then applies
+// document/block-level quality-based rejection (pass 6).
+void Tesseract::rejection_passes(PAGE_RES* page_res,
+                                 ETEXT_DESC* monitor,
+                                 const TBOX* target_word_box,
+                                 const char* word_config) {
+  PAGE_RES_IT page_res_it(page_res);
+  // ****************** Pass 5 *******************
+  // Gather statistics on rejects.
+  int word_index = 0;
+  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
+    WERD_RES* word = page_res_it.word();
+    word_index++;
+    if (monitor != nullptr) {
+      monitor->ocr_alive = true;
+      // This pass covers the final 95-100% of the progress range.
+      monitor->progress = 95 + 5 * word_index / stats_.word_count;
+    }
+    if (word->rebuild_word == nullptr) {
+      // Word was not processed by tesseract.
+      page_res_it.forward();
+      continue;
+    }
+    check_debug_pt(word, 70);
+
+    // changed by jetsoft
+    // specific to its needs to extract one word when need
+    if (target_word_box &&
+        !ProcessTargetWord(word->word->bounding_box(),
+                           *target_word_box, word_config, 4)) {
+      page_res_it.forward();
+      continue;
+    }
+    // end jetsoft
+
+    page_res_it.rej_stat_word();
+    const int chars_in_word = word->reject_map.length();
+    const int rejects_in_word = word->reject_map.reject_count();
+
+    const int blob_quality = word_blob_quality(word);
+    stats_.doc_blob_quality += blob_quality;
+    const int outline_errs = word_outline_errs(word);
+    stats_.doc_outline_errs += outline_errs;
+    int16_t all_char_quality;
+    int16_t accepted_all_char_quality;
+    word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
+    stats_.doc_char_quality += all_char_quality;
+    const uint8_t permuter_type = word->best_choice->permuter();
+    // Only dictionary words contribute to the "good character" statistics.
+    if ((permuter_type == SYSTEM_DAWG_PERM) ||
+        (permuter_type == FREQ_DAWG_PERM) ||
+        (permuter_type == USER_DAWG_PERM)) {
+      stats_.good_char_count += chars_in_word - rejects_in_word;
+      stats_.doc_good_char_quality += accepted_all_char_quality;
+    }
+    check_debug_pt(word, 80);
+    if (tessedit_reject_bad_qual_wds &&
+        (blob_quality == 0) && (outline_errs >= chars_in_word))
+      word->reject_map.rej_word_bad_quality();
+    check_debug_pt(word, 90);
+    page_res_it.forward();
+  }
+
+  if (tessedit_debug_quality_metrics) {
+    tprintf
+      ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
+       " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
+       page_res->char_count, page_res->rej_count,
+       page_res->rej_count / static_cast<float>(page_res->char_count),
+       stats_.doc_blob_quality,
+       stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
+       stats_.doc_outline_errs,
+       stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
+       stats_.doc_char_quality,
+       stats_.doc_char_quality / static_cast<float>(page_res->char_count),
+       stats_.doc_good_char_quality,
+       (stats_.good_char_count > 0) ?
+       (stats_.doc_good_char_quality /
+        static_cast<float>(stats_.good_char_count)) : 0.0);
+  }
+  // The document counts as good quality only if all four per-character
+  // metrics clear their configured thresholds.
+  bool good_quality_doc =
+      ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
+       quality_rej_pc) &&
+      (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
+       quality_blob_pc) &&
+      (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
+       quality_outline_pc) &&
+      (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
+       quality_char_pc);
+
+  // ****************** Pass 6 *******************
+  // Do whole document or whole block rejection pass
+  if (!tessedit_test_adaption) {
+    quality_based_rejection(page_res_it, good_quality_doc);
+  }
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// When wordrec_run_blamer is set, scores every word against its blamer
+// ground truth, tallies the incorrect-result reasons over the page and
+// prints the tally plus any misadaption log entries.
+void Tesseract::blamer_pass(PAGE_RES* page_res) {
+  if (!wordrec_run_blamer) return;
+  PAGE_RES_IT it(page_res);
+  it.restart_page();
+  while (it.word() != nullptr) {
+    WERD_RES* word_res = it.word();
+    BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word_res);
+    page_res->blame_reasons[word_res->blamer_bundle->incorrect_result_reason()]++;
+    it.forward();
+  }
+  tprintf("Blame reasons:\n");
+  for (int reason = 0; reason < IRR_NUM_REASONS; ++reason) {
+    tprintf("%s %d\n",
+            BlamerBundle::IncorrectReasonName(
+                static_cast<IncorrectResultReason>(reason)),
+            page_res->blame_reasons[reason]);
+  }
+  if (page_res->misadaption_log.size() > 0) {
+    tprintf("Misadaption log:\n");
+    for (int entry = 0; entry < page_res->misadaption_log.size(); ++entry) {
+      tprintf("%s\n", page_res->misadaption_log[entry].c_str());
+    }
+  }
+}
+
+// Sets script positions and detects smallcaps on all output words.
+void Tesseract::script_pos_pass(PAGE_RES* page_res) {
+  PAGE_RES_IT page_res_it(page_res);
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
+       page_res_it.forward()) {
+    WERD_RES* word = page_res_it.word();
+    if (word->word->flag(W_REP_CHAR)) {
+      // NOTE(review): this forward() plus the loop's own forward() advances
+      // two words, so the word following a W_REP_CHAR word is skipped as
+      // well - confirm this is intended.
+      page_res_it.forward();
+      continue;
+    }
+    const float x_height = page_res_it.block()->block->x_height();
+    float word_x_height = word->x_height;
+    // Clamp implausible word x-heights to the midpoint of the choice's
+    // allowed range.
+    if (word_x_height < word->best_choice->min_x_height() ||
+        word_x_height > word->best_choice->max_x_height()) {
+      word_x_height = (word->best_choice->min_x_height() +
+                       word->best_choice->max_x_height()) / 2.0f;
+    }
+    // Test for small caps. Word capheight must be close to block xheight,
+    // and word must contain no lower case letters, and at least one upper case.
+    const double small_cap_xheight = x_height * kXHeightCapRatio;
+    const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
+    if (word->uch_set->script_has_xheight() &&
+        small_cap_xheight - small_cap_delta <= word_x_height &&
+        word_x_height <= small_cap_xheight + small_cap_delta) {
+      // Scan for upper/lower.
+      int num_upper = 0;
+      int num_lower = 0;
+      for (int i = 0; i < word->best_choice->length(); ++i) {
+        if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
+          ++num_upper;
+        else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
+          ++num_lower;
+      }
+      if (num_upper > 0 && num_lower == 0)
+        word->small_caps = true;
+    }
+    word->SetScriptPositions();
+  }
+}
+
+// Helper finds the gap between the index word and the next: *right gets the
+// right edge of words[index], *next_left the left edge of words[index + 1].
+// Sentinel values (-INT32_MAX / INT32_MAX) are left where no word exists.
+static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
+                    int* next_left) {
+  *right = -INT32_MAX;
+  *next_left = INT32_MAX;
+  if (index >= words.size()) return;  // Out of range: keep the sentinels.
+  *right = words[index]->word->bounding_box().right();
+  if (index + 1 < words.size()) {
+    *next_left = words[index + 1]->word->bounding_box().left();
+  }
+}
+
+// Factored helper computes the rating, certainty, badness and validity of
+// the permuter of the words in [first_index, end_index).
+// The outputs are accumulated into, not reset, by this function.
+static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
+                             int first_index, int end_index, float* rating,
+                             float* certainty, bool* bad,
+                             bool* valid_permuter) {
+  if (end_index <= first_index) {
+    // An empty span counts as bad with no valid permuter.
+    *bad = true;
+    *valid_permuter = false;
+  }
+  for (int i = first_index; i < end_index && i < words.size(); ++i) {
+    WERD_CHOICE* choice = words[i]->best_choice;
+    if (choice == nullptr) {
+      *bad = true;
+      continue;
+    }
+    *rating += choice->rating();
+    *certainty = std::min(*certainty, choice->certainty());
+    if (!Dict::valid_word_permuter(choice->permuter(), false)) {
+      *valid_permuter = false;
+    }
+  }
+}
+
// Helper chooses the best combination of words, transferring good ones from
// new_words to best_words. To win, a new word must have (better rating and
// certainty) or (better permuter status and rating within rating ratio and
// certainty within certainty margin) than current best.
// All the new_words are consumed (moved to best_words or deleted.)
// The return value is the number of new_words used minus the number of
// best_words that remain in the output.
static int SelectBestWords(double rating_ratio,
                           double certainty_margin,
                           bool debug,
                           PointerVector<WERD_RES>* new_words,
                           PointerVector<WERD_RES>* best_words) {
  // Process the smallest groups of words that have an overlapping word
  // boundary at the end.
  GenericVector<WERD_RES*> out_words;
  // Index into each word vector (best, new).
  int b = 0, n = 0;
  int num_best = 0, num_new = 0;
  while (b < best_words->size() || n < new_words->size()) {
    // Start of the current run in each.
    int start_b = b, start_n = n;
    // Grow the runs [start_b,b] and [start_n,n] until both segmentations
    // agree on a word boundary, i.e. the gap after best word b overlaps the
    // gap after new word n, so both runs cover the same x-range.
    while (b < best_words->size() || n < new_words->size()) {
      int b_right = -INT32_MAX;
      int next_b_left = INT32_MAX;
      WordGap(*best_words, b, &b_right, &next_b_left);
      int n_right = -INT32_MAX;
      int next_n_left = INT32_MAX;
      WordGap(*new_words, n, &n_right, &next_n_left);
      if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
        // The word breaks overlap. [start_b,b] and [start_n, n] match.
        break;
      }
      // Keep searching for the matching word break: advance whichever side
      // currently ends further left (or whichever still has words left).
      if ((b_right < n_right && b < best_words->size()) ||
          n == new_words->size())
        ++b;
      else
        ++n;
    }
    // Rating of the current run in each.
    float b_rating = 0.0f, n_rating = 0.0f;
    // Certainty of the current run in each.
    float b_certainty = 0.0f, n_certainty = 0.0f;
    // True if any word is missing its best choice.
    bool b_bad = false, n_bad = false;
    // True if all words have a valid permuter.
    bool b_valid_permuter = true, n_valid_permuter = true;
    const int end_b = b < best_words->size() ? b + 1 : b;
    const int end_n = n < new_words->size() ? n + 1 : n;
    EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
                     &b_bad, &b_valid_permuter);
    EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
                     &n_bad, &n_valid_permuter);
    bool new_better = false;
    // The new run wins if the old run is bad, or the new beats the old on
    // both rating and certainty, or the new has a valid dictionary permuter
    // where the old does not and is within the allowed rating ratio and
    // certainty margin of the old.
    if (!n_bad && (b_bad || (n_certainty > b_certainty &&
                             n_rating < b_rating) ||
                   (!b_valid_permuter && n_valid_permuter &&
                    n_rating < b_rating * rating_ratio &&
                    n_certainty > b_certainty - certainty_margin))) {
      // New is better. Transfer ownership of the run to out_words.
      for (int i = start_n; i < end_n; ++i) {
        out_words.push_back((*new_words)[i]);
        (*new_words)[i] = nullptr;
        ++num_new;
      }
      new_better = true;
    } else if (!b_bad) {
      // Current best is better. Keep the old run.
      for (int i = start_b; i < end_b; ++i) {
        out_words.push_back((*best_words)[i]);
        (*best_words)[i] = nullptr;
        ++num_best;
      }
    }
    if (debug) {
      tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
              " valid dict: %d v %d\n",
              end_n - start_n, new_better ? "better" : "worse",
              end_b - start_b, n_rating, b_rating,
              n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
    }
    // Move on to the next group.
    b = end_b;
    n = end_n;
  }
  // Transfer from out_words to best_words. Unclaimed words (still owned by
  // the PointerVectors) are deleted when new_words/best_words are cleared.
  best_words->clear();
  for (int i = 0; i < out_words.size(); ++i)
    best_words->push_back(out_words[i]);
  return num_new - num_best;
}
+
+// Helper to recognize the word using the given (language-specific) tesseract.
+// Returns positive if this recognizer found more new best words than the
+// number kept from best_words.
+int Tesseract::RetryWithLanguage(const WordData& word_data,
+ WordRecognizer recognizer, bool debug,
+ WERD_RES** in_word,
+ PointerVector<WERD_RES>* best_words) {
+ if (debug) {
+ tprintf("Trying word using lang %s, oem %d\n",
+ lang.c_str(), static_cast<int>(tessedit_ocr_engine_mode));
+ }
+ // Run the recognizer on the word.
+ PointerVector<WERD_RES> new_words;
+ (this->*recognizer)(word_data, in_word, &new_words);
+ if (new_words.empty()) {
+ // Transfer input word to new_words, as the classifier must have put
+ // the result back in the input.
+ new_words.push_back(*in_word);
+ *in_word = nullptr;
+ }
+ if (debug) {
+ for (int i = 0; i < new_words.size(); ++i)
+ new_words[i]->DebugTopChoice("Lang result");
+ }
+ // Initial version is a bit of a hack based on better certainty and rating
+ // or a dictionary vs non-dictionary word.
+ return SelectBestWords(classify_max_rating_ratio,
+ classify_max_certainty_margin,
+ debug, &new_words, best_words);
+}
+
+// Helper returns true if all the words are acceptable.
+static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
+ for (int w = 0; w < words.size(); ++w) {
+ if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
+ }
+ return true;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Moves good-looking "noise"/diacritics from the reject list to the main
+// blob list on the current word. Returns true if anything was done, and
+// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+ bool* make_next_word_fuzzy) {
+ *make_next_word_fuzzy = false;
+ WERD* real_word = pr_it->word()->word;
+ if (real_word->rej_cblob_list()->empty() ||
+ real_word->cblob_list()->empty() ||
+ real_word->rej_cblob_list()->length() > noise_maxperword)
+ return false;
+ real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
+ // Get the noise outlines into a vector with matching bool map.
+ GenericVector<C_OUTLINE*> outlines;
+ real_word->GetNoiseOutlines(&outlines);
+ GenericVector<bool> word_wanted;
+ GenericVector<bool> overlapped_any_blob;
+ GenericVector<C_BLOB*> target_blobs;
+ AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
+ &word_wanted, &overlapped_any_blob,
+ &target_blobs);
+ // Filter the outlines that overlapped any blob and put them into the word
+ // now. This simplifies the remaining task and also makes it more accurate
+ // as it has more completed blobs to work on.
+ GenericVector<bool> wanted;
+ GenericVector<C_BLOB*> wanted_blobs;
+ GenericVector<C_OUTLINE*> wanted_outlines;
+ int num_overlapped = 0;
+ int num_overlapped_used = 0;
+ for (int i = 0; i < overlapped_any_blob.size(); ++i) {
+ if (overlapped_any_blob[i]) {
+ ++num_overlapped;
+ if (word_wanted[i]) ++num_overlapped_used;
+ wanted.push_back(word_wanted[i]);
+ wanted_blobs.push_back(target_blobs[i]);
+ wanted_outlines.push_back(outlines[i]);
+ outlines[i] = nullptr;
+ }
+ }
+ real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
+ AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
+ &target_blobs);
+ int non_overlapped = 0;
+ int non_overlapped_used = 0;
+ for (int i = 0; i < word_wanted.size(); ++i) {
+ if (word_wanted[i]) ++non_overlapped_used;
+ if (outlines[i] != nullptr) ++non_overlapped_used;
+ }
+ if (debug_noise_removal) {
+ tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
+ num_overlapped_used, num_overlapped, non_overlapped_used,
+ non_overlapped);
+ real_word->bounding_box().print();
+ }
+ // Now we have decided which outlines we want, put them into the real_word.
+ if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
+ make_next_word_fuzzy)) {
+ pr_it->MakeCurrentWordFuzzy();
+ }
+ // TODO(rays) Parts of combos have a deep copy of the real word, and need
+ // to have their noise outlines moved/assigned in the same way!!
+ return num_overlapped_used != 0 || non_overlapped_used != 0;
+}
+
+// Attempts to put noise/diacritic outlines into the blobs that they overlap.
+// Input: a set of noisy outlines that probably belong to the real_word.
+// Output: word_wanted indicates which outlines are to be assigned to a blob,
+// target_blobs indicates which to assign to, and overlapped_any_blob is
+// true for all outlines that overlapped a blob.
+void Tesseract::AssignDiacriticsToOverlappingBlobs(
+ const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+ PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+ GenericVector<bool>* overlapped_any_blob,
+ GenericVector<C_BLOB*>* target_blobs) {
+ std::vector<bool> blob_wanted;
+ word_wanted->resize(outlines.size(), false);
+ overlapped_any_blob->resize(outlines.size(), false);
+ target_blobs->resize(outlines.size(), nullptr);
+ // For each real blob, find the outlines that seriously overlap it.
+ // A single blob could be several merged characters, so there can be quite
+ // a few outlines overlapping, and the full engine needs to be used to chop
+ // and join to get a sensible result.
+ C_BLOB_IT blob_it(real_word->cblob_list());
+ for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+ C_BLOB* blob = blob_it.data();
+ const TBOX blob_box = blob->bounding_box();
+ blob_wanted.resize(outlines.size(), false);
+ int num_blob_outlines = 0;
+ for (int i = 0; i < outlines.size(); ++i) {
+ if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
+ !(*word_wanted)[i]) {
+ blob_wanted[i] = true;
+ (*overlapped_any_blob)[i] = true;
+ ++num_blob_outlines;
+ }
+ }
+ if (debug_noise_removal) {
+ tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
+ blob_box.print();
+ }
+ // If any outlines overlap the blob, and not too many, classify the blob
+ // (using the full engine, languages and all), and choose the maximal
+ // combination of outlines that doesn't hurt the end-result classification
+ // by too much. Mark them as wanted.
+ if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
+ if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
+ outlines, num_blob_outlines,
+ &blob_wanted)) {
+ for (int i = 0; i < blob_wanted.size(); ++i) {
+ if (blob_wanted[i]) {
+ // Claim the outline and record where it is going.
+ (*word_wanted)[i] = true;
+ (*target_blobs)[i] = blob;
+ }
+ }
+ }
+ }
+ }
+}
+
+// Attempts to assign non-overlapping outlines to their nearest blobs or
+// make new blobs out of them.
+void Tesseract::AssignDiacriticsToNewBlobs(
+ const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+ PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+ GenericVector<C_BLOB*>* target_blobs) {
+ std::vector<bool> blob_wanted;
+ word_wanted->resize(outlines.size(), false);
+ target_blobs->resize(outlines.size(), nullptr);
+ // Check for outlines that need to be turned into stand-alone blobs.
+ for (int i = 0; i < outlines.size(); ++i) {
+ if (outlines[i] == nullptr) continue;
+ // Get a set of adjacent outlines that don't overlap any existing blob.
+ blob_wanted.resize(outlines.size(), false);
+ int num_blob_outlines = 0;
+ TBOX total_ol_box(outlines[i]->bounding_box());
+ while (i < outlines.size() && outlines[i] != nullptr) {
+ blob_wanted[i] = true;
+ total_ol_box += outlines[i]->bounding_box();
+ ++i;
+ ++num_blob_outlines;
+ }
+ // Find the insertion point.
+ C_BLOB_IT blob_it(real_word->cblob_list());
+ while (!blob_it.at_last() &&
+ blob_it.data_relative(1)->bounding_box().left() <=
+ total_ol_box.left()) {
+ blob_it.forward();
+ }
+ // Choose which combination of them we actually want and where to put
+ // them.
+ if (debug_noise_removal)
+ tprintf("Num blobless outlines = %d\n", num_blob_outlines);
+ C_BLOB* left_blob = blob_it.data();
+ TBOX left_box = left_blob->bounding_box();
+ C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
+ if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
+ !right_blob->bounding_box().x_overlap(total_ol_box)) &&
+ SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
+ outlines, num_blob_outlines,
+ &blob_wanted)) {
+ if (debug_noise_removal) tprintf("Added to left blob\n");
+ for (int j = 0; j < blob_wanted.size(); ++j) {
+ if (blob_wanted[j]) {
+ (*word_wanted)[j] = true;
+ (*target_blobs)[j] = left_blob;
+ }
+ }
+ } else if (right_blob != nullptr &&
+ (!left_box.x_overlap(total_ol_box) ||
+ right_blob->bounding_box().x_overlap(total_ol_box)) &&
+ SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
+ right_blob, outlines,
+ num_blob_outlines, &blob_wanted)) {
+ if (debug_noise_removal) tprintf("Added to right blob\n");
+ for (int j = 0; j < blob_wanted.size(); ++j) {
+ if (blob_wanted[j]) {
+ (*word_wanted)[j] = true;
+ (*target_blobs)[j] = right_blob;
+ }
+ }
+ } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
+ outlines, num_blob_outlines,
+ &blob_wanted)) {
+ if (debug_noise_removal) tprintf("Fitted between blobs\n");
+ for (int j = 0; j < blob_wanted.size(); ++j) {
+ if (blob_wanted[j]) {
+ (*word_wanted)[j] = true;
+ (*target_blobs)[j] = nullptr;
+ }
+ }
+ }
+ }
+}
+
+// Starting with ok_outlines set to indicate which outlines overlap the blob,
+// chooses the optimal set (approximately) and returns true if any outlines
+// are desired, in which case ok_outlines indicates which ones.
+bool Tesseract::SelectGoodDiacriticOutlines(
+ int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
+ const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
+ std::vector<bool>* ok_outlines) {
+ STRING best_str;
+ float target_cert = certainty_threshold;
+ if (blob != nullptr) {
+ float target_c2;
+ target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
+ if (debug_noise_removal) {
+ tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(),
+ target_cert, target_c2);
+ blob->bounding_box().print();
+ }
+ target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
+ }
+ std::vector<bool> test_outlines = *ok_outlines;
+ // Start with all the outlines in.
+ STRING all_str;
+ std::vector<bool> best_outlines = *ok_outlines;
+ float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+ pr_it, blob, &all_str);
+ if (debug_noise_removal) {
+ TBOX ol_box;
+ for (int i = 0; i < test_outlines.size(); ++i) {
+ if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
+ }
+ tprintf("All Noise blob classified as %s=%g, delta=%g at:",
+ all_str.c_str(), best_cert, best_cert - target_cert);
+ ol_box.print();
+ }
+ // Iteratively zero out the bit that improves the certainty the most, until
+ // we get past the threshold, have zero bits, or fail to improve.
+ int best_index = 0; // To zero out.
+ while (num_outlines > 1 && best_index >= 0 &&
+ (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
+ // Find the best bit to zero out.
+ best_index = -1;
+ for (int i = 0; i < outlines.size(); ++i) {
+ if (test_outlines[i]) {
+ test_outlines[i] = false;
+ STRING str;
+ float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+ pr_it, blob, &str);
+ if (debug_noise_removal) {
+ TBOX ol_box;
+ for (int j = 0; j < outlines.size(); ++j) {
+ if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
+ tprintf("%c", test_outlines[j] ? 'T' : 'F');
+ }
+ tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(),
+ cert, cert - target_cert);
+ ol_box.print();
+ }
+ if (cert > best_cert) {
+ best_cert = cert;
+ best_index = i;
+ best_outlines = test_outlines;
+ }
+ test_outlines[i] = true;
+ }
+ }
+ if (best_index >= 0) {
+ test_outlines[best_index] = false;
+ --num_outlines;
+ }
+ }
+ if (best_cert >= target_cert) {
+ // Save the best combination.
+ *ok_outlines = best_outlines;
+ if (debug_noise_removal) {
+ tprintf("%s noise combination ", blob ? "Adding" : "New");
+ for (int i = 0; i < best_outlines.size(); ++i) {
+ tprintf("%c", best_outlines[i] ? 'T' : 'F');
+ }
+ tprintf(" yields certainty %g, beating target of %g\n", best_cert,
+ target_cert);
+ }
+ return true;
+ }
+
+ return false;
+}
+
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
// If blob is nullptr, a temporary blob is built from the flagged outlines
// and the negated c2 value (certainty^2/rating, see ClassifyBlobAsWord) is
// returned instead of the plain certainty.
float Tesseract::ClassifyBlobPlusOutlines(
    const std::vector<bool>& ok_outlines,
    const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
    C_BLOB* blob, STRING* best_str) {
  C_OUTLINE_IT ol_it;
  C_OUTLINE* first_to_keep = nullptr;
  C_BLOB* local_blob = nullptr;
  if (blob != nullptr) {
    // Add the required outlines to the blob.
    // Remember the original first outline so the added ones can be removed
    // again afterwards.
    ol_it.set_to_list(blob->out_list());
    first_to_keep = ol_it.data();
  }
  for (int i = 0; i < ok_outlines.size(); ++i) {
    if (ok_outlines[i]) {
      // This outline is to be added.
      if (blob == nullptr) {
        // No blob was supplied: build a temporary one around the first
        // flagged outline; subsequent outlines are added to it below.
        local_blob = new C_BLOB(outlines[i]);
        blob = local_blob;
        ol_it.set_to_list(blob->out_list());
      } else {
        ol_it.add_before_stay_put(outlines[i]);
      }
    }
  }
  float c2;
  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
  ol_it.move_to_first();
  if (first_to_keep == nullptr) {
    // We created blob. Empty its outlines and delete it.
    // (The outlines are owned by the caller, so only extract, don't delete.)
    for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
    delete local_blob;
    cert = -c2;
  } else {
    // Remove the outlines that we put in.
    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
      ol_it.extract();
    }
  }
  return cert;
}
+
// Classifies the given blob (part of word_data->word->word) as an individual
// word, using languages, chopper etc, returning only the certainty of the
// best raw choice, and undoing all the work done to fake out the word.
// *c2 is set to certainty^2/rating (0 if there is no usable raw choice),
// and *best_str to the raw choice's text.
float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it,
                                    C_BLOB* blob, STRING* best_str, float* c2) {
  WERD* real_word = pr_it->word()->word;
  // Build a temporary single-blob word from a deep copy of the blob, and
  // insert it into the page as a clone of the current word.
  WERD* word = real_word->ConstructFromSingleBlob(
      real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
  // Get a new iterator that points to the new word.
  PAGE_RES_IT it(pr_it->page_res);
  while (it.word() != word_res && it.word() != nullptr) it.forward();
  ASSERT_HOST(it.word() == word_res);
  WordData wd(it);
  // Force full initialization.
  SetupWordPassN(1, &wd);
  classify_word_and_language(pass_n, &it, &wd);
  if (debug_noise_removal) {
    if (wd.word->raw_choice != nullptr) {
      tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
              wd.row->x_height(), wd.word->raw_choice->min_x_height(),
              wd.word->raw_choice->max_x_height());
    } else {
      tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
              wd.row->x_height());
    }
  }
  float cert = 0.0f;
  if (wd.word->raw_choice != nullptr) {  // This probably shouldn't happen, but...
    cert = wd.word->raw_choice->certainty();
    float rat = wd.word->raw_choice->rating();
    // Guard against division by a non-positive rating.
    *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
    *best_str = wd.word->raw_choice->unichar_string();
  } else {
    *c2 = 0.0f;
    *best_str = "";
  }
  // Undo the fakery: remove the temporary word from the page and re-sync
  // the caller's iterator.
  it.DeleteCurrentWord();
  pr_it->ResetWordIterator();
  return cert;
}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
// Generic function for classifying a word. Can be used either for pass1 or
// pass2 according to the function passed to recognizer.
// word_data holds the word to be recognized, and its block and row, and
// pr_it points to the word as well, in case we are running LSTM and it wants
// to output multiple words.
// Recognizes in the current language, and if successful that is all.
// If recognition was not successful, tries all available languages until
// it gets a successful result or runs out of languages. Keeps the best result.
void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
                                           WordData* word_data) {
#ifdef DISABLED_LEGACY_ENGINE
  WordRecognizer recognizer = &Tesseract::classify_word_pass1;
#else
  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
                                          : &Tesseract::classify_word_pass2;
#endif  // def DISABLED_LEGACY_ENGINE

  // Best result so far.
  PointerVector<WERD_RES> best_words;
  // Points to the best result. May be word or in lang_words.
  const WERD_RES* word = word_data->word;
  clock_t start_t = clock();
  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
  if (debug) {
    tprintf("%s word with lang %s at:",
            word->done ? "Already done" : "Processing",
            most_recently_used_->lang.c_str());
    word->word->bounding_box().print();
  }
  if (word->done) {
    // If done on pass1, leave it as-is.
    if (!word->tess_failed)
      most_recently_used_ = word->tesseract;
    return;
  }
  // sub is the index into word_data->lang_words for the language to try
  // first: sub_langs_.size() means the main language (this), otherwise it
  // is the index of most_recently_used_ in sub_langs_.
  int sub = sub_langs_.size();
  if (most_recently_used_ != this) {
    // Get the index of the most_recently_used_.
    for (sub = 0; sub < sub_langs_.size() &&
         most_recently_used_ != sub_langs_[sub]; ++sub) {}
  }
  most_recently_used_->RetryWithLanguage(
      *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
  Tesseract* best_lang_tess = most_recently_used_;
  if (!WordsAcceptable(best_words)) {
    // Try all the other languages to see if they are any better.
    if (most_recently_used_ != this &&
        this->RetryWithLanguage(*word_data, recognizer, debug,
                                &word_data->lang_words[sub_langs_.size()],
                                &best_words) > 0) {
      best_lang_tess = this;
    }
    for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
         ++i) {
      if (most_recently_used_ != sub_langs_[i] &&
          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
                                           &word_data->lang_words[i],
                                           &best_words) > 0) {
        best_lang_tess = sub_langs_[i];
      }
    }
  }
  most_recently_used_ = best_lang_tess;
  if (!best_words.empty()) {
    if (best_words.size() == 1 && !best_words[0]->combination) {
      // Move the best single result to the main word.
      word_data->word->ConsumeWordResults(best_words[0]);
    } else {
      // Words came from LSTM, and must be moved to the PAGE_RES properly.
      word_data->word = best_words.back();
      pr_it->ReplaceCurrentWord(&best_words);
    }
    ASSERT_HOST(word_data->word->box_word != nullptr);
  } else {
    tprintf("no best words!!\n");
  }
  clock_t ocr_t = clock();
  if (tessedit_timing_debug) {
    tprintf("%s (ocr took %.2f sec)\n",
            word_data->word->best_choice->unichar_string().c_str(),
            static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
  }
}
+
/**
 * classify_word_pass1
 *
 * Baseline normalize the word and pass it to Tess.
 * In LSTM and combined modes, runs the LSTM recognizer first and only falls
 * back to the legacy engine (combined mode) when LSTM fails or the word is
 * odd-sized.
 */

void Tesseract::classify_word_pass1(const WordData& word_data,
                                    WERD_RES** in_word,
                                    PointerVector<WERD_RES>* out_words) {
  ROW* row = word_data.row;
  BLOCK* block = word_data.block;
  prev_word_best_choice_ = word_data.prev_word != nullptr
      ? word_data.prev_word->word->best_choice : nullptr;
#ifdef DISABLED_LEGACY_ENGINE
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif  // def DISABLED_LEGACY_ENGINE
    // Odd-sized words are skipped here in combined mode, leaving them for
    // the legacy fallback below.
    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      LSTMRecognizeWord(*block, row, *in_word, out_words);
      if (!out_words->empty())
        return;  // Successful lstm recognition.
    }
    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      // No fallback allowed, so use a fake.
      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
      return;
    }

  #ifndef DISABLED_LEGACY_ENGINE
    // Fall back to tesseract for failed words or odd words.
    (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
                                    OEM_TESSERACT_ONLY, nullptr,
                                    classify_bln_numeric_mode,
                                    textord_use_cjk_fp_model,
                                    poly_allow_detailed_fx, row, block);
  #endif  // ndef DISABLED_LEGACY_ENGINE
  }

#ifndef DISABLED_LEGACY_ENGINE
  WERD_RES* word = *in_word;
  // Run the legacy classifier.
  match_word_pass_n(1, word, row, block);
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    word->tess_would_adapt = AdaptableWord(word);
    bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);

    if (adapt_ok) {
      // Send word to adaptive classifier for training.
      word->BestChoiceToCorrectText();
      LearnWord(nullptr, word);
      // Mark misadaptions if running blamer.
      if (word->blamer_bundle != nullptr) {
        word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
                                                 wordrec_debug_blamer);
      }
    }

    if (tessedit_enable_doc_dict && !word->IsAmbiguous())
      tess_add_doc_word(word->best_choice);
  }
#endif  // ndef DISABLED_LEGACY_ENGINE
}
+
+// Helper to report the result of the xheight fix.
+void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
+ WERD_RES* word, WERD_RES* new_word) {
+ tprintf("New XHT Match:%s = %s ",
+ word->best_choice->unichar_string().c_str(),
+ word->best_choice->debug_string().c_str());
+ word->reject_map.print(debug_fp);
+ tprintf(" -> %s = %s ",
+ new_word->best_choice->unichar_string().c_str(),
+ new_word->best_choice->debug_string().c_str());
+ new_word->reject_map.print(debug_fp);
+ tprintf(" %s->%s %s %s\n",
+ word->guessed_x_ht ? "GUESS" : "CERT",
+ new_word->guessed_x_ht ? "GUESS" : "CERT",
+ new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
+ accept_new_word ? "ACCEPTED" : "");
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
// Run the x-height fix-up, based on min/max top/bottom information in
// unicharset.
// Returns true if the word was changed.
// See the comment in fixxht.cpp for a description of the overall process.
bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
  int original_misfits = CountMisfitTops(word);
  if (original_misfits == 0)
    return false;
  float baseline_shift = 0.0f;
  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
  if (baseline_shift != 0.0f) {
    // Try the shift on its own first.
    if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
                              word, block, row))
      return false;
    // The shift was accepted, so the word has already changed; re-count the
    // misfits against the shifted baseline.
    original_misfits = CountMisfitTops(word);
    if (original_misfits > 0) {
      float new_baseline_shift;
      // Now recompute the new x_height.
      new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
      if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
        // No test of return value here, as we are definitely making a change
        // to the word by shifting the baseline.
        TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
                             word, block, row);
      }
    }
    return true;
  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
    // No baseline shift: just try the new x-height.
    return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
                                word, block, row);
  } else {
    return false;
  }
}
+
// Runs recognition with the test baseline shift and x-height and returns true
// if there was an improvement in recognition result.
// On acceptance, the trial result is moved into word.
bool Tesseract::TestNewNormalization(int original_misfits,
                                     float baseline_shift, float new_x_ht,
                                     WERD_RES *word, BLOCK* block, ROW *row) {
  bool accept_new_x_ht = false;
  // Build a trial copy of the word with the candidate normalization.
  WERD_RES new_x_ht_word(word->word);
  if (word->blamer_bundle != nullptr) {
    new_x_ht_word.blamer_bundle = new BlamerBundle();
    new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
  }
  new_x_ht_word.x_height = new_x_ht;
  new_x_ht_word.baseline_shift = baseline_shift;
  new_x_ht_word.caps_height = 0.0;
  new_x_ht_word.SetupForRecognition(
      unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
      classify_bln_numeric_mode, textord_use_cjk_fp_model,
      poly_allow_detailed_fx, row, block);
  match_word_pass_n(2, &new_x_ht_word, row, block);
  if (!new_x_ht_word.tess_failed) {
    int new_misfits = CountMisfitTops(&new_x_ht_word);
    if (debug_x_ht_level >= 1) {
      tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
              original_misfits, word->x_height,
              new_misfits, new_x_ht);
      tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
              word->best_choice->rating(), word->best_choice->certainty(),
              new_x_ht_word.best_choice->rating(),
              new_x_ht_word.best_choice->certainty());
    }
    // The misfits must improve and either the rating or certainty.
    accept_new_x_ht = new_misfits < original_misfits &&
                      (new_x_ht_word.best_choice->certainty() >
                           word->best_choice->certainty() ||
                       new_x_ht_word.best_choice->rating() <
                           word->best_choice->rating());
    if (debug_x_ht_level >= 1) {
      ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
    }
  }
  if (accept_new_x_ht) {
    // Adopt the trial result into the original word.
    word->ConsumeWordResults(&new_x_ht_word);
    return true;
  }
  return false;
}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
/**
 * classify_word_pass2
 *
 * Control what to do with the word in pass 2.
 * Re-matches undone words, applies sub/superscript and trained x-height
 * fixes, and optionally displays the result. No-op in LSTM-only mode.
 */

void Tesseract::classify_word_pass2(const WordData& word_data,
                                    WERD_RES** in_word,
                                    PointerVector<WERD_RES>* out_words) {
  // Return if we do not want to run Tesseract.
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
    return;
  }
#ifndef DISABLED_LEGACY_ENGINE
  ROW* row = word_data.row;
  BLOCK* block = word_data.block;
  WERD_RES* word = *in_word;
  prev_word_best_choice_ = word_data.prev_word != nullptr
      ? word_data.prev_word->word->best_choice : nullptr;

  check_debug_pt(word, 30);
  if (!word->done) {
    word->caps_height = 0.0;
    // Fall back to the row's x-height if the word has none of its own.
    if (word->x_height == 0.0f)
      word->x_height = row->x_height();
    match_word_pass_n(2, word, row, block);
    check_debug_pt(word, 40);
  }

  SubAndSuperscriptFix(word);

  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
        block->classify_rotation().y() == 0.0f) {
      // Use the tops and bottoms since they are available.
      TrainedXheightFix(word, block, row);
    }
  }
#ifndef GRAPHICS_DISABLED
  if (tessedit_display_outwords) {
    if (fx_win == nullptr)
      create_fx_win();
    clear_fx_win();
    word->rebuild_word->plot(fx_win);
    TBOX wbox = word->rebuild_word->bounding_box();
    fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
                            wbox.right(), wbox.bottom());
    ScrollView::Update();
  }
#endif
  check_debug_pt(word, 50);
#endif  // ndef DISABLED_LEGACY_ENGINE
}
+
+#ifndef DISABLED_LEGACY_ENGINE
/**
 * match_word_pass_n
 *
 * Baseline normalize the word and pass it to Tess.
 * (Header previously said "match_word_pass2"; this function serves both
 * passes — pass_n is forwarded to tess_segment_pass_n and make_reject_map.)
 */
void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
                                  ROW *row, BLOCK* block) {
  if (word->tess_failed) return;
  tess_segment_pass_n(pass_n, word);

  if (!word->tess_failed) {
    if (!word->word->flag (W_REP_CHAR)) {
      word->fix_quotes();
      if (tessedit_fix_hyphens)
        word->fix_hyphens();
      /* Don't trust fix_quotes! - though I think I've fixed the bug */
      if (word->best_choice->length() != word->box_word->length()) {
        tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
                " #Blobs=%d\n",
                word->best_choice->debug_string().c_str(),
                word->best_choice->length(),
                word->box_word->length());

      }
      word->tess_accepted = tess_acceptable_word(word);

      // Also sets word->done flag
      make_reject_map(word, row, pass_n);
    }
  }
  set_word_fonts(word);

  ASSERT_HOST(word->raw_choice != nullptr);
}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// Helper to return the best rated BLOB_CHOICE in the whole word that matches
+// the given char_id, or nullptr if none can be found.
+static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
+ WERD_RES* word_res) {
+ // Find the corresponding best BLOB_CHOICE from any position in the word_res.
+ BLOB_CHOICE* best_choice = nullptr;
+ for (int i = 0; i < word_res->best_choice->length(); ++i) {
+ BLOB_CHOICE* choice = FindMatchingChoice(char_id,
+ word_res->GetBlobChoices(i));
+ if (choice != nullptr) {
+ if (best_choice == nullptr || choice->rating() < best_choice->rating())
+ best_choice = choice;
+ }
+ }
+ return best_choice;
+}
+
+// Helper to insert blob_choice in each location in the leader word if there is
+// no matching BLOB_CHOICE there already, and correct any incorrect results
+// in the best_choice.
+static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
+ WERD_RES* word_res) {
+ WERD_CHOICE* word = word_res->best_choice;
+ for (int i = 0; i < word_res->best_choice->length(); ++i) {
+ BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
+ word_res->GetBlobChoices(i));
+ if (choice == nullptr) {
+ BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
+ choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
+ }
+ }
+ // Correct any incorrect results in word.
+ for (int i = 0; i < word->length(); ++i) {
+ if (word->unichar_id(i) != blob_choice->unichar_id())
+ word->set_unichar_id(blob_choice->unichar_id(), i);
+ }
+}
+
+/**
+ * fix_rep_char()
+ * The word is a repeated char. (Leader.) Find the repeated char character.
+ * Create the appropriate single-word or multi-word sequence according to
+ * the size of spaces in between blobs, and correct the classifications
+ * where some of the characters disagree with the majority.
+ */
+void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
+ WERD_RES *word_res = page_res_it->word();
+ const WERD_CHOICE &word = *(word_res->best_choice);
+
+ // Find the frequency of each unique character in the word.
+ SortHelper<UNICHAR_ID> rep_ch(word.length());
+ for (int i = 0; i < word.length(); ++i) {
+ rep_ch.Add(word.unichar_id(i), 1);
+ }
+
+ // Find the most frequent result.
+ UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
+ int max_count = rep_ch.MaxCount(&maxch_id);
+ // Find the best exemplar of a classifier result for maxch_id.
+ BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
+ if (best_choice == nullptr) {
+ tprintf("Failed to find a choice for %s, occurring %d times\n",
+ word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
+ return;
+ }
+ word_res->done = true;
+
+ // Measure the mean space.
+ int gap_count = 0;
+ WERD* werd = word_res->word;
+ C_BLOB_IT blob_it(werd->cblob_list());
+ C_BLOB* prev_blob = blob_it.data();
+ for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
+ C_BLOB* blob = blob_it.data();
+ int gap = blob->bounding_box().left();
+ gap -= prev_blob->bounding_box().right();
+ ++gap_count;
+ prev_blob = blob;
+ }
+ // Just correct existing classification.
+ CorrectRepcharChoices(best_choice, word_res);
+ word_res->reject_map.initialise(word.length());
+}
+
/*************************************************************************
 * acceptable_word_string()
 * Classify the UTF-8 string s as one of the ACCEPTABLE_WERD_TYPE shapes
 * (upper case, lower case, initial cap, abbreviation), or AC_UNACCEPTABLE
 * if it matches none of them.
 * Throughout, i indexes unichars while offset indexes bytes of s;
 * lengths[i] is the byte length of the i'th unichar, so both advance in
 * lockstep via "offset += lengths[i++]".
 *************************************************************************/
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
    const UNICHARSET& char_set, const char *s, const char *lengths) {
  int i = 0;       // unichar index
  int offset = 0;  // byte offset of unichar i within s
  int leading_punct_count;
  int upper_count = 0;
  int hyphen_pos = -1;
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

  // Over-long words (> 20 unichars) are never acceptable.
  if (strlen (lengths) > 20)
    return word_type;

  /* Single Leading punctuation char*/

  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
    offset += lengths[i++];
  leading_punct_count = i;

  /* Initial cap */
  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
    offset += lengths[i++];
    upper_count++;
  }
  if (upper_count > 1) {
    word_type = AC_UPPER_CASE;
  } else {
    /* Lower case word, possibly with an initial cap */
    while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
      offset += lengths[i++];
    }
    // Require a minimum number of alphas after any leading punctuation.
    if (i - leading_punct_count < quality_min_initial_alphas_reqd)
      goto not_a_word;
    /*
      Allow a single hyphen in a lower case word
      - don't trust upper case - I've seen several cases of "H" -> "I-I"
    */
    if (lengths[i] == 1 && s[offset] == '-') {
      hyphen_pos = i;
      offset += lengths[i++];
      if (s[offset] != '\0') {
        while ((s[offset] != '\0') &&
               char_set.get_islower(s + offset, lengths[i])) {
          offset += lengths[i++];
        }
        // At least 2 lower case chars must follow the hyphen.
        if (i < hyphen_pos + 3)
          goto not_a_word;
      }
    } else {
      /* Allow "'s" in NON hyphenated lower case words */
      if (lengths[i] == 1 && (s[offset] == '\'') &&
          lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (upper_count > 0)
      word_type = AC_INITIAL_CAP;
    else
      word_type = AC_LOWER_CASE;
  }

  /* Up to two different, constrained trailing punctuation chars */
  if (lengths[i] == 1 && s[offset] != '\0' &&
      STRING(chs_trailing_punct1).contains(s[offset]))
    offset += lengths[i++];
  // Second trailing punct must differ from the first (no doubled punct).
  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
      s[offset - lengths[i - 1]] != s[offset] &&
      STRING(chs_trailing_punct2).contains (s[offset]))
    offset += lengths[i++];

  // Anything left over invalidates the classification so far.
  if (s[offset] != '\0')
    word_type = AC_UNACCEPTABLE;

  not_a_word:

  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    // Rescan from the start for A.B.C. / a.b.c. style abbreviations.
    i = 0;
    offset = 0;
    if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
      word_type = AC_UC_ABBREV;
      while (s[offset] != '\0' &&
             char_set.get_isupper(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
      word_type = AC_LC_ABBREV;
      while (s[offset] != '\0' &&
             char_set.get_islower(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (s[offset] != '\0')
      word_type = AC_UNACCEPTABLE;
  }

  return word_type;
}
+
+bool Tesseract::check_debug_pt(WERD_RES* word, int location) {
+ bool show_map_detail = false;
+ int16_t i;
+
+ if (!test_pt)
+ return false;
+
+ tessedit_rejection_debug.set_value (false);
+ debug_x_ht_level.set_value(0);
+
+ if (word->word->bounding_box().contains(FCOORD (test_pt_x, test_pt_y))) {
+ if (location < 0)
+ return true; // For breakpoint use
+ tessedit_rejection_debug.set_value(true);
+ debug_x_ht_level.set_value(2);
+ tprintf ("\n\nTESTWD::");
+ switch (location) {
+ case 0:
+ tprintf ("classify_word_pass1 start\n");
+ word->word->print();
+ break;
+ case 10:
+ tprintf ("make_reject_map: initial map");
+ break;
+ case 20:
+ tprintf ("make_reject_map: after NN");
+ break;
+ case 30:
+ tprintf ("classify_word_pass2 - START");
+ break;
+ case 40:
+ tprintf ("classify_word_pass2 - Pre Xht");
+ break;
+ case 50:
+ tprintf ("classify_word_pass2 - END");
+ show_map_detail = true;
+ break;
+ case 60:
+ tprintf ("fixspace");
+ break;
+ case 70:
+ tprintf ("MM pass START");
+ break;
+ case 80:
+ tprintf ("MM pass END");
+ break;
+ case 90:
+ tprintf ("After Poor quality rejection");
+ break;
+ case 100:
+ tprintf ("unrej_good_quality_words - START");
+ break;
+ case 110:
+ tprintf ("unrej_good_quality_words - END");
+ break;
+ case 120:
+ tprintf ("Write results pass");
+ show_map_detail = true;
+ break;
+ }
+ if (word->best_choice != nullptr) {
+ tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
+ word->reject_map.print(debug_fp);
+ tprintf("\n");
+ if (show_map_detail) {
+ tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
+ for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
+ tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
+ word->reject_map[i].full_print(debug_fp);
+ }
+ }
+ } else {
+ tprintf("null best choice\n");
+ }
+ tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
+ tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
+ return true;
+ } else {
+ return false;
+ }
+}
+
+/**
+ * find_modal_font
+ *
+ * Find the modal font and remove from the stats.
+ */
+static void find_modal_font( // good chars in word
+ STATS* fonts, // font stats
+ int16_t* font_out, // output font
+ int8_t* font_count // output count
+) {
+ int16_t font; //font index
+ int32_t count; //pile count
+
+ if (fonts->get_total () > 0) {
+ font = static_cast<int16_t>(fonts->mode ());
+ *font_out = font;
+ count = fonts->pile_count (font);
+ *font_count = count < INT8_MAX ? count : INT8_MAX;
+ fonts->add (font, -*font_count);
+ }
+ else {
+ *font_out = -1;
+ *font_count = 0;
+ }
+}
+
+/**
+ * set_word_fonts
+ *
+ * Get the fonts for the word.
+ */
+void Tesseract::set_word_fonts(WERD_RES *word) {
+ // Don't try to set the word fonts for an lstm word, as the configs
+ // will be meaningless.
+ if (word->chopped_word == nullptr) return;
+ ASSERT_HOST(word->best_choice != nullptr);
+
+#ifndef DISABLED_LEGACY_ENGINE
+ const int fontinfo_size = get_fontinfo_table().size();
+ if (fontinfo_size == 0) return;
+ GenericVector<int> font_total_score;
+ font_total_score.init_to_size(fontinfo_size, 0);
+
+ // Compute the font scores for the word
+ if (tessedit_debug_fonts) {
+ tprintf("Examining fonts in %s\n",
+ word->best_choice->debug_string().c_str());
+ }
+ for (int b = 0; b < word->best_choice->length(); ++b) {
+ const BLOB_CHOICE* choice = word->GetBlobChoice(b);
+ if (choice == nullptr) continue;
+ auto &fonts = choice->fonts();
+ for (int f = 0; f < fonts.size(); ++f) {
+ const int fontinfo_id = fonts[f].fontinfo_id;
+ if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
+ font_total_score[fontinfo_id] += fonts[f].score;
+ }
+ }
+ }
+ // Find the top and 2nd choice for the word.
+ int score1 = 0, score2 = 0;
+ int16_t font_id1 = -1, font_id2 = -1;
+ for (int f = 0; f < fontinfo_size; ++f) {
+ if (tessedit_debug_fonts && font_total_score[f] > 0) {
+ tprintf("Font %s, total score = %d\n",
+ fontinfo_table_.get(f).name, font_total_score[f]);
+ }
+ if (font_total_score[f] > score1) {
+ score2 = score1;
+ font_id2 = font_id1;
+ score1 = font_total_score[f];
+ font_id1 = f;
+ } else if (font_total_score[f] > score2) {
+ score2 = font_total_score[f];
+ font_id2 = f;
+ }
+ }
+ word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
+ word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
+ // Each score has a limit of UINT16_MAX, so divide by that to get the number
+ // of "votes" for that font, ie number of perfect scores.
+ word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
+ word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
+ if (score1 > 0) {
+ const FontInfo fi = fontinfo_table_.get(font_id1);
+ if (tessedit_debug_fonts) {
+ if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
+ tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
+ fi.name, word->fontinfo_id_count,
+ fontinfo_table_.get(font_id2).name,
+ word->fontinfo_id2_count);
+ } else {
+ tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
+ fi.name, word->fontinfo_id_count);
+ }
+ }
+ }
+#endif // ndef DISABLED_LEGACY_ENGINE
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
/**
 * font_recognition_pass
 *
 * Smooth the fonts for the document: find the modal font over all words
 * and assign it to words whose own font evidence is weak.
 */
void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
  PAGE_RES_IT page_res_it(page_res);
  WERD_RES *word; // current word
  STATS doc_fonts(0, font_table_size_); // font counters

  // Pass 1: gather font id statistics, weighting each word's top two fonts
  // by the vote counts computed in set_word_fonts().
  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
       page_res_it.forward()) {
    word = page_res_it.word();
    if (word->fontinfo != nullptr) {
      doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
    }
    if (word->fontinfo2 != nullptr) {
      doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
    }
  }
  int16_t doc_font; // modal font
  int8_t doc_font_count; // modal font count
  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
  if (doc_font_count == 0)
    return;
  // Pass 2: get the modal font pointer. The stats only hold universal ids,
  // so find any word already labelled with the modal id and borrow its
  // FontInfo pointer.
  const FontInfo* modal_font = nullptr;
  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
       page_res_it.forward()) {
    word = page_res_it.word();
    if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
      modal_font = word->fontinfo;
      break;
    }
    if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
      modal_font = word->fontinfo2;
      break;
    }
  }
  // The modal id was contributed by some word above, so a match must exist.
  ASSERT_HOST(modal_font != nullptr);

  // Pass 3: assign modal font to weak words - those whose own font votes do
  // not cover the whole word (or at least 3/4 of a word longer than 3).
  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
       page_res_it.forward()) {
    word = page_res_it.word();
    const int length = word->best_choice->length();

    const int count = word->fontinfo_id_count;
    if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
      word->fontinfo = modal_font;
      // Counts only get 1 as it came from the doc.
      word->fontinfo_id_count = 1;
    }
  }
}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// If a word has multiple alternates check if the best choice is in the
+// dictionary. If not, replace it with an alternate that exists in the
+// dictionary.
+void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
+ PAGE_RES_IT word_it(page_res);
+ for (WERD_RES* word = word_it.word(); word != nullptr;
+ word = word_it.forward()) {
+ if (word->best_choices.singleton())
+ continue; // There are no alternates.
+
+ const WERD_CHOICE* best = word->best_choice;
+ if (word->tesseract->getDict().valid_word(*best) != 0)
+ continue; // The best choice is in the dictionary.
+
+ WERD_CHOICE_IT choice_it(&word->best_choices);
+ for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
+ choice_it.forward()) {
+ WERD_CHOICE* alternate = choice_it.data();
+ if (word->tesseract->getDict().valid_word(*alternate)) {
+ // The alternate choice is in the dictionary.
+ if (tessedit_bigram_debug) {
+ tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
+ best->unichar_string().c_str(),
+ alternate->unichar_string().c_str());
+ }
+ // Replace the 'best' choice with a better choice.
+ word->ReplaceBestChoice(alternate);
+ break;
+ }
+ }
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/control.h b/tesseract/src/ccmain/control.h
new file mode 100644
index 00000000..cd57ddba
--- /dev/null
+++ b/tesseract/src/ccmain/control.h
@@ -0,0 +1,38 @@
+/**********************************************************************
+ * File: control.h (Formerly control.h)
+ * Description: Module-independent matcher controller.
+ * Author: Ray Smith
+ * Created: Thu Apr 23 11:09:58 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+/**
+ * @file control.h
+ * Module-independent matcher controller.
+ */
+
+#ifndef CONTROL_H
+#define CONTROL_H
+
// Classification returned by Tesseract::acceptable_word_string(),
// describing the case/abbreviation shape a word string matched
// (AC_UNACCEPTABLE when none matched).
enum ACCEPTABLE_WERD_TYPE
{
  AC_UNACCEPTABLE, ///< Unacceptable word
  AC_LOWER_CASE,   ///< ALL lower case
  AC_UPPER_CASE,   ///< ALL upper case
  AC_INITIAL_CAP,  ///< ALL but initial lc
  AC_LC_ABBREV,    ///< a.b.c.
  AC_UC_ABBREV     ///< A.B.C.
};
+
+#endif
diff --git a/tesseract/src/ccmain/docqual.cpp b/tesseract/src/ccmain/docqual.cpp
new file mode 100644
index 00000000..f74f9ead
--- /dev/null
+++ b/tesseract/src/ccmain/docqual.cpp
@@ -0,0 +1,981 @@
+/******************************************************************
+ * File: docqual.cpp (Formerly docqual.c)
+ * Description: Document Quality Metrics
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cctype>
+#include "docqual.h"
+#include "reject.h"
+#include "tessvars.h"
+#include "tesseractclass.h"
+
+namespace tesseract{
+
// Counting callback for ProcessMatchedBlobs: bumps the matched-blob total
// once per matched blob; the blob index itself is not needed.
static void countMatchingBlobs(int16_t& total_matched, int /*index*/) {
  total_matched += 1;
}
+
+static void countAcceptedBlobs(WERD_RES* word, int16_t& match_count,
+ int16_t& accepted_match_count, int index) {
+ if (word->reject_map[index].accepted()) {
+ ++accepted_match_count;
+ }
+ ++match_count;
+}
+
+static void acceptIfGoodQuality(WERD_RES* word, int index) {
+ if (word->reject_map[index].accept_if_good_quality()) {
+ word->reject_map[index].setrej_quality_accept();
+ }
+}
+
+/*************************************************************************
+ * word_blob_quality()
+ * How many blobs in the box_word are identical to those of the inword?
+ * ASSUME blobs in both initial word and box_word are in ascending order of
+ * left hand blob edge.
+ *************************************************************************/
+int16_t Tesseract::word_blob_quality(WERD_RES* word) {
+ int16_t match_count = 0;
+ if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+ !word->rebuild_word->blobs.empty()) {
+ using namespace std::placeholders; // for _1
+ word->bln_boxes->ProcessMatchedBlobs(
+ *word->rebuild_word,
+ std::bind(countMatchingBlobs, match_count, _1));
+ }
+ return match_count;
+}
+
+int16_t Tesseract::word_outline_errs(WERD_RES *word) {
+ int16_t i = 0;
+ int16_t err_count = 0;
+
+ if (word->rebuild_word != nullptr) {
+ for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
+ TBLOB* blob = word->rebuild_word->blobs[b];
+ err_count += count_outline_errs(word->best_choice->unichar_string()[i],
+ blob->NumOutlines());
+ i++;
+ }
+ }
+ return err_count;
+}
+
+/*************************************************************************
+ * word_char_quality()
+ * Combination of blob quality and outline quality - how many good chars are
+ * there? - I.e chars which pass the blob AND outline tests.
+ *************************************************************************/
+void Tesseract::word_char_quality(WERD_RES* word, int16_t* match_count,
+ int16_t* accepted_match_count) {
+ *match_count = 0;
+ *accepted_match_count = 0;
+ if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+ !word->rebuild_word->blobs.empty()) {
+ using namespace std::placeholders; // for _1
+ word->bln_boxes->ProcessMatchedBlobs(
+ *word->rebuild_word,
+ std::bind(countAcceptedBlobs,
+ word, *match_count, *accepted_match_count, _1));
+ }
+}
+
+/*************************************************************************
+ * unrej_good_chs()
+ * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
+ *************************************************************************/
+void Tesseract::unrej_good_chs(WERD_RES* word) {
+ if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+ word->rebuild_word->blobs.empty()) {
+ using namespace std::placeholders; // for _1
+ word->bln_boxes->ProcessMatchedBlobs(
+ *word->rebuild_word, std::bind(acceptIfGoodQuality, word, _1));
+ }
+}
+
+int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
+ int expected_outline_count;
+
+ if (STRING (outlines_odd).contains (c))
+ return 0; // Don't use this char
+ else if (STRING (outlines_2).contains (c))
+ expected_outline_count = 2;
+ else
+ expected_outline_count = 1;
+ return abs (outline_count - expected_outline_count);
+}
+
+void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
+ bool good_quality_doc) {
+ if ((tessedit_good_quality_unrej && good_quality_doc))
+ unrej_good_quality_words(page_res_it);
+ doc_and_block_rejection(page_res_it, good_quality_doc);
+ if (unlv_tilde_crunching) {
+ tilde_crunch(page_res_it);
+ tilde_delete(page_res_it);
+ }
+}
+
/*************************************************************************
 * unrej_good_quality_words()
 * Accept potential rejects in words which pass the following checks:
 *    - Contains a potential reject
 *    - Word looks like a sensible alpha word.
 *    - Word segmentation is the same as the original image
 *    - All characters have the expected number of outlines
 * NOTE - the rejection counts are recalculated after unrejection
 *      - CAN'T do it in a single pass without a bit of fiddling
 *      - keep it simple but inefficient
 *************************************************************************/
void Tesseract::unrej_good_quality_words( //unreject potential
    PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  ROW_RES *current_row;
  BLOCK_RES *current_block;
  int i;

  // Pass 1: walk the words, unrejecting quality-recoverable rejects.
  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    check_debug_pt(page_res_it.word(), 100);
    if (bland_unrej) {
      // Unconditional mode: accept every recoverable reject in the word.
      word = page_res_it.word();
      for (i = 0; i < word->reject_map.length(); i++) {
        if (word->reject_map[i].accept_if_good_quality())
          word->reject_map[i].setrej_quality_accept();
      }
      page_res_it.forward();
    }
    else if ((page_res_it.row()->char_count > 0) &&
             ((page_res_it.row()->rej_count /
               static_cast<float>(page_res_it.row()->char_count)) <=
              quality_rowrej_pc)) {
      // Row's reject rate is low enough: unreject words that have
      // recoverable rejects and look like sensible text.
      word = page_res_it.word();
      if (word->reject_map.quality_recoverable_rejects() &&
          (tessedit_unrej_any_wd ||
           acceptable_word_string(*word->uch_set,
                                  word->best_choice->unichar_string().c_str(),
                                  word->best_choice->unichar_lengths().c_str())
               != AC_UNACCEPTABLE)) {
        unrej_good_chs(word);
      }
      page_res_it.forward();
    }
    else {
      // Skip to end of dodgy row.
      current_row = page_res_it.row();
      while ((page_res_it.word() != nullptr) &&
             (page_res_it.row() == current_row))
        page_res_it.forward();
    }
    check_debug_pt(page_res_it.word(), 110);
  }
  // Pass 2: recompute page/block/row rejection statistics, which the
  // unrejection above has invalidated.
  page_res_it.restart_page();
  page_res_it.page_res->char_count = 0;
  page_res_it.page_res->rej_count = 0;
  current_block = nullptr;
  current_row = nullptr;
  while (page_res_it.word() != nullptr) {
    if (current_block != page_res_it.block()) {
      current_block = page_res_it.block();
      current_block->char_count = 0;
      current_block->rej_count = 0;
    }
    if (current_row != page_res_it.row()) {
      current_row = page_res_it.row();
      current_row->char_count = 0;
      current_row->rej_count = 0;
      current_row->whole_word_rej_count = 0;
    }
    page_res_it.rej_stat_word();
    page_res_it.forward();
  }
}
+
+
/*************************************************************************
 * doc_and_block_rejection()
 *
 * If the page has too many rejects - reject all of it.
 * If any block has too many rejects - reject all words in the block
 * If any row has too many rejects (but few whole-word rejects) - reject
 * all words in the row.
 *************************************************************************/

void Tesseract::doc_and_block_rejection( //reject big chunks
    PAGE_RES_IT &page_res_it,
    bool good_quality_doc) {
  int16_t block_no = 0;
  int16_t row_no = 0;
  BLOCK_RES *current_block;
  ROW_RES *current_row;

  bool rej_word;
  bool prev_word_rejected;
  int16_t char_quality = 0;
  int16_t accepted_char_quality;

  // Page-level test: reject the whole page if the overall reject rate is
  // too high. NOTE(review): divides by page_res->char_count - assumes a
  // non-empty page; confirm callers guarantee this.
  if (page_res_it.page_res->rej_count * 100.0 /
      page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
    reject_whole_page(page_res_it);
    if (tessedit_debug_doc_rejection) {
      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
              page_res_it.page_res->char_count,
              page_res_it.page_res->rej_count);
    }
  } else {
    if (tessedit_debug_doc_rejection) {
      tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
              page_res_it.page_res->char_count,
              page_res_it.page_res->rej_count);
    }

    /* Walk blocks testing for block rejection */

    page_res_it.restart_page();
    WERD_RES* word;
    while ((word = page_res_it.word()) != nullptr) {
      current_block = page_res_it.block();
      block_no = current_block->block->pdblk.index();
      if (current_block->char_count > 0 &&
          (current_block->rej_count * 100.0 / current_block->char_count) >
          tessedit_reject_block_percent) {
        // Block reject rate too high: reject its words, optionally sparing
        // "perfect" or good-looking ones.
        if (tessedit_debug_block_rejection) {
          tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
                  block_no, current_block->char_count,
                  current_block->rej_count);
        }
        prev_word_rejected = false;
        while ((word = page_res_it.word()) != nullptr &&
               (page_res_it.block() == current_block)) {
          if (tessedit_preserve_blk_rej_perfect_wds) {
            // Spare fully-accepted words of sufficient length; a good
            // looking word may also be spared if every char passes the
            // blob/outline quality test.
            rej_word = word->reject_map.reject_count() > 0 ||
                word->reject_map.length() < tessedit_preserve_min_wd_len;
            if (rej_word && tessedit_dont_blkrej_good_wds &&
                word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                acceptable_word_string(
                    *word->uch_set,
                    word->best_choice->unichar_string().c_str(),
                    word->best_choice->unichar_lengths().c_str()) !=
                AC_UNACCEPTABLE) {
              word_char_quality(word, &char_quality, &accepted_char_quality);
              rej_word = char_quality != word->reject_map.length();
            }
          } else {
            rej_word = true;
          }
          if (rej_word) {
            /*
              Reject spacing if both current and prev words are rejected.
              NOTE - this is NOT restricted to FUZZY spaces. - When tried this
              generated more space errors.
            */
            if (tessedit_use_reject_spaces &&
                prev_word_rejected &&
                page_res_it.prev_row() == page_res_it.row() &&
                word->word->space() == 1)
              word->reject_spaces = true;
            word->reject_map.rej_word_block_rej();
          }
          prev_word_rejected = rej_word;
          page_res_it.forward();
        }
      } else {
        if (tessedit_debug_block_rejection) {
          tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
                  block_no, page_res_it.block()->char_count,
                  page_res_it.block()->rej_count);
        }

        /* Walk rows in block testing for row rejection */
        row_no = 0;
        while (page_res_it.word() != nullptr &&
               page_res_it.block() == current_block) {
          current_row = page_res_it.row();
          row_no++;
          /* Reject whole row if:
            fraction of chars on row which are rejected exceed a limit AND
            fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
            limit
          */
          if (current_row->char_count > 0 &&
              (current_row->rej_count * 100.0 / current_row->char_count) >
              tessedit_reject_row_percent &&
              (current_row->whole_word_rej_count * 100.0 /
               current_row->rej_count) <
              tessedit_whole_wd_rej_row_percent) {
            if (tessedit_debug_block_rejection) {
              tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
                      row_no, current_row->char_count,
                      current_row->rej_count);
            }
            prev_word_rejected = false;
            while ((word = page_res_it.word()) != nullptr &&
                   page_res_it.row() == current_row) {
              /* Preserve words on good docs unless they are mostly rejected*/
              if (!tessedit_row_rej_good_docs && good_quality_doc) {
                rej_word = word->reject_map.reject_count() /
                    static_cast<float>(word->reject_map.length()) >
                    tessedit_good_doc_still_rowrej_wd;
              } else if (tessedit_preserve_row_rej_perfect_wds) {
                /* Preserve perfect words anyway */
                rej_word = word->reject_map.reject_count() > 0 ||
                    word->reject_map.length() < tessedit_preserve_min_wd_len;
                if (rej_word && tessedit_dont_rowrej_good_wds &&
                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                    acceptable_word_string(*word->uch_set,
                        word->best_choice->unichar_string().c_str(),
                        word->best_choice->unichar_lengths().c_str()) !=
                    AC_UNACCEPTABLE) {
                  word_char_quality(word, &char_quality,
                                    &accepted_char_quality);
                  rej_word = char_quality != word->reject_map.length();
                }
              } else {
                rej_word = true;
              }
              if (rej_word) {
                /*
                  Reject spacing if both current and prev words are rejected.
                  NOTE - this is NOT restricted to FUZZY spaces. - When tried
                  this generated more space errors.
                */
                if (tessedit_use_reject_spaces &&
                    prev_word_rejected &&
                    page_res_it.prev_row() == page_res_it.row() &&
                    word->word->space() == 1)
                  word->reject_spaces = true;
                word->reject_map.rej_word_row_rej();
              }
              prev_word_rejected = rej_word;
              page_res_it.forward();
            }
          } else {
            if (tessedit_debug_block_rejection) {
              tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
                      row_no, current_row->char_count, current_row->rej_count);
            }
            // Row is fine: just skip past it.
            while (page_res_it.word() != nullptr &&
                   page_res_it.row() == current_row)
              page_res_it.forward();
          }
        }
      }
    }
  }
}
+
+/*************************************************************************
+ * reject_whole_page()
+ * Don't believe any of it - set the reject map to 00..00 in all words
+ *
+ *************************************************************************/
+
+void reject_whole_page(PAGE_RES_IT &page_res_it) {
+ page_res_it.restart_page ();
+ while (page_res_it.word () != nullptr) {
+ page_res_it.word ()->reject_map.rej_word_doc_rej ();
+ page_res_it.forward ();
+ }
+ //whole page is rejected
+ page_res_it.page_res->rejected = true;
+}
+
// Marks garbage words for crunching (UNLV tilde output). Terrible words are
// crunched immediately; runs of merely "potential" garbage words are
// remembered (via copy_it) and crunched retroactively if a terrible word
// terminates the run. Any word with accepted characters resets the state.
void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  GARBAGE_LEVEL garbage_level;
  PAGE_RES_IT copy_it;  // start of the current run of potential crunches
  bool prev_potential_marked = false;
  bool found_terrible_word = false;
  bool ok_dict_word;

  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    // Non-text blocks are never crunched.
    POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
    if (pb != nullptr && !pb->IsText()) {
      page_res_it.forward();
      continue;
    }
    word = page_res_it.word();

    if (crunch_early_convert_bad_unlv_chs)
      convert_bad_unlv_chs(word);

    if (crunch_early_merge_tess_fails)
      word->merge_tess_fails();

    if (word->reject_map.accept_count() != 0) {
      // Word has accepted chars: it breaks any run of garbage.
      found_terrible_word = false;
      //Forget earlier potential crunches
      prev_potential_marked = false;
    }
    else {
      ok_dict_word = safe_dict_word(word);
      garbage_level = garbage_word(word, ok_dict_word);

      if ((garbage_level != G_NEVER_CRUNCH) &&
          (terrible_word_crunch(word, garbage_level))) {
        // Definitely crunch this word, and retroactively crunch any
        // pending run of potential-crunch words leading up to it.
        if (crunch_debug > 0) {
          tprintf("T CRUNCHING: \"%s\"\n",
                  word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
        if (prev_potential_marked) {
          while (copy_it.word() != word) {
            if (crunch_debug > 0) {
              tprintf("P1 CRUNCHING: \"%s\"\n",
                      copy_it.word()->best_choice->unichar_string().c_str());
            }
            copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
            copy_it.forward();
          }
          prev_potential_marked = false;
        }
        found_terrible_word = true;
      }
      else if ((garbage_level != G_NEVER_CRUNCH) &&
               (potential_word_crunch(word,
                                      garbage_level, ok_dict_word))) {
        if (found_terrible_word) {
          // Potential garbage following a terrible word: crunch it too.
          if (crunch_debug > 0) {
            tprintf("P2 CRUNCHING: \"%s\"\n",
                    word->best_choice->unichar_string().c_str());
          }
          word->unlv_crunch_mode = CR_KEEP_SPACE;
        }
        else if (!prev_potential_marked) {
          // Start of a potential run: remember where it begins.
          copy_it = page_res_it;
          prev_potential_marked = true;
          if (crunch_debug > 1) {
            tprintf("P3 CRUNCHING: \"%s\"\n",
                    word->best_choice->unichar_string().c_str());
          }
        }
      }
      else {
        found_terrible_word = false;
        //Forget earlier potential crunches
        prev_potential_marked = false;
        if (crunch_debug > 2) {
          tprintf("NO CRUNCH: \"%s\"\n",
                  word->best_choice->unichar_string().c_str());
        }
      }
    }
    page_res_it.forward();
  }
}
+
+
+bool Tesseract::terrible_word_crunch(WERD_RES* word,
+ GARBAGE_LEVEL garbage_level) {
+ float rating_per_ch;
+ int adjusted_len;
+ int crunch_mode = 0;
+
+ if ((word->best_choice->unichar_string().length() == 0) ||
+ (strspn(word->best_choice->unichar_string().c_str(), " ") ==
+ word->best_choice->unichar_string().unsigned_size()))
+ crunch_mode = 1;
+ else {
+ adjusted_len = word->reject_map.length ();
+ if (adjusted_len > crunch_rating_max)
+ adjusted_len = crunch_rating_max;
+ rating_per_ch = word->best_choice->rating () / adjusted_len;
+
+ if (rating_per_ch > crunch_terrible_rating)
+ crunch_mode = 2;
+ else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
+ crunch_mode = 3;
+ else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
+ (garbage_level != G_OK))
+ crunch_mode = 4;
+ else if ((rating_per_ch > crunch_poor_garbage_rate) &&
+ (garbage_level != G_OK))
+ crunch_mode = 5;
+ }
+ if (crunch_mode > 0) {
+ if (crunch_debug > 2) {
+ tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
+ crunch_mode, word->best_choice->unichar_string().c_str());
+ }
+ return true;
+ }
+ else
+ return false;
+}
+
// Returns true when enough independent "poor quality" indicators fire on
// the word to make it a potential crunch candidate.  The indicators are:
//   1. average rating per character above crunch_pot_poor_rate;
//   2. certainty below crunch_pot_poor_cert (counted only when the word is
//      "crunchable", i.e. not protected as an acceptable/dictionary word);
//   3. the garbage classifier returned anything other than G_OK.
// The word qualifies when at least crunch_pot_indicators indicators fire.
bool Tesseract::potential_word_crunch(WERD_RES* word,
                                      GARBAGE_LEVEL garbage_level,
                                      bool ok_dict_word) {
  float rating_per_ch;
  int adjusted_len;
  const char *str = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();
  bool word_crunchable;
  int poor_indicator_count = 0;

  // The certainty indicator is suppressed for words that are protected:
  // accept-string leaving enabled AND the word is long enough AND it is
  // either an acceptable string or a good dictionary word.
  word_crunchable = !crunch_leave_accept_strings ||
                    word->reject_map.length() < 3 ||
                    (acceptable_word_string(*word->uch_set,
                                            str, lengths) == AC_UNACCEPTABLE &&
                     !ok_dict_word);

  // Cap the divisor at 10 so long words don't dilute the per-char rating.
  // NOTE(review): assumes reject_map is non-empty here, otherwise the
  // division below is by zero — TODO confirm callers guarantee this.
  adjusted_len = word->reject_map.length();
  if (adjusted_len > 10)
    adjusted_len = 10;
  rating_per_ch = word->best_choice->rating() / adjusted_len;

  if (rating_per_ch > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
      tprintf("Potential poor rating on \"%s\"\n",
              word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }

  if (word_crunchable &&
      word->best_choice->certainty() < crunch_pot_poor_cert) {
    if (crunch_debug > 2) {
      tprintf("Potential poor cert on \"%s\"\n",
              word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }

  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
      tprintf("Potential garbage on \"%s\"\n",
              word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }
  return poor_indicator_count >= crunch_pot_indicators;
}
+
// Walks the whole page and marks deletable ("tilde crunch") words by setting
// their unlv_crunch_mode.  Deletion only happens at line edges: a deletable
// word at (or continuing from) the beginning of a line is deleted
// immediately; a deletable word at the end of a line retroactively deletes
// the run of deletable words recorded since the last non-deletable word.
// Deletable words in mid-line that never reach an EOL word are left alone.
void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  PAGE_RES_IT copy_it;                 // Start of a pending mid-line run.
  bool deleting_from_bol = false;
  bool marked_delete_point = false;
  int16_t debug_delete_mode;           // Reason code, for debug output only.
  CRUNCH_MODE delete_mode;
  int16_t x_debug_delete_mode;
  CRUNCH_MODE x_delete_mode;

  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    word = page_res_it.word();

    delete_mode = word_deletable (word, debug_delete_mode);
    if (delete_mode != CR_NONE) {
      if (word->word->flag (W_BOL) || deleting_from_bol) {
        // Deletable word starting, or continuing, a run from line start.
        if (crunch_debug > 0) {
          tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
                   debug_delete_mode,
                   word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = delete_mode;
        deleting_from_bol = true;
      } else if (word->word->flag(W_EOL)) {
        if (marked_delete_point) {
          // Retroactively crunch the pending run up to (not including) this
          // word; each run member is re-evaluated for its own mode.
          while (copy_it.word() != word) {
            x_delete_mode = word_deletable (copy_it.word (),
                                            x_debug_delete_mode);
            if (crunch_debug > 0) {
              tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
                       x_debug_delete_mode,
                       copy_it.word()->best_choice->unichar_string().c_str());
            }
            copy_it.word ()->unlv_crunch_mode = x_delete_mode;
            copy_it.forward ();
          }
        }
        if (crunch_debug > 0) {
          tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
                   debug_delete_mode,
                   word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = delete_mode;
        deleting_from_bol = false;
        marked_delete_point = false;
      }
      else {
        // Deletable word in mid-line: remember where the run starts in case
        // it extends to the end of the line.
        if (!marked_delete_point) {
          copy_it = page_res_it;
          marked_delete_point = true;
        }
      }
    }
    else {
      deleting_from_bol = false;
      //Forget earlier potential crunches
      marked_delete_point = false;
    }
    /*
      The following step has been left till now as the tess fails are used to
      determine if the word is deletable.
    */
    if (!crunch_early_merge_tess_fails)
      word->merge_tess_fails();
    page_res_it.forward ();
  }
}
+
+
+void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
+ int i;
+ UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
+ UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
+ UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
+ UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
+ for (i = 0; i < word_res->reject_map.length(); ++i) {
+ if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
+ word_res->best_choice->set_unichar_id(unichar_dash, i);
+ if (word_res->reject_map[i].accepted ())
+ word_res->reject_map[i].setrej_unlv_rej ();
+ }
+ if (word_res->best_choice->unichar_id(i) == unichar_pow) {
+ word_res->best_choice->set_unichar_id(unichar_space, i);
+ if (word_res->reject_map[i].accepted ())
+ word_res->reject_map[i].setrej_unlv_rej ();
+ }
+ }
+}
+
+GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
+ enum STATES
+ {
+ JUNK,
+ FIRST_UPPER,
+ FIRST_LOWER,
+ FIRST_NUM,
+ SUBSEQUENT_UPPER,
+ SUBSEQUENT_LOWER,
+ SUBSEQUENT_NUM
+ };
+ const char *str = word->best_choice->unichar_string().c_str();
+ const char *lengths = word->best_choice->unichar_lengths().c_str();
+ STATES state = JUNK;
+ int len = 0;
+ int isolated_digits = 0;
+ int isolated_alphas = 0;
+ int bad_char_count = 0;
+ int tess_rejs = 0;
+ int dodgy_chars = 0;
+ int ok_chars;
+ UNICHAR_ID last_char = -1;
+ int alpha_repetition_count = 0;
+ int longest_alpha_repetition_count = 0;
+ int longest_lower_run_len = 0;
+ int lower_string_count = 0;
+ int longest_upper_run_len = 0;
+ int upper_string_count = 0;
+ int total_alpha_count = 0;
+ int total_digit_count = 0;
+
+ for (; *str != '\0'; str += *(lengths++)) {
+ len++;
+ if (word->uch_set->get_isupper (str, *lengths)) {
+ total_alpha_count++;
+ switch (state) {
+ case SUBSEQUENT_UPPER:
+ case FIRST_UPPER:
+ state = SUBSEQUENT_UPPER;
+ upper_string_count++;
+ if (longest_upper_run_len < upper_string_count)
+ longest_upper_run_len = upper_string_count;
+ if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
+ alpha_repetition_count++;
+ if (longest_alpha_repetition_count < alpha_repetition_count) {
+ longest_alpha_repetition_count = alpha_repetition_count;
+ }
+ }
+ else {
+ last_char = word->uch_set->unichar_to_id(str, *lengths);
+ alpha_repetition_count = 1;
+ }
+ break;
+ case FIRST_NUM:
+ isolated_digits++;
+ // Fall through.
+ default:
+ state = FIRST_UPPER;
+ last_char = word->uch_set->unichar_to_id(str, *lengths);
+ alpha_repetition_count = 1;
+ upper_string_count = 1;
+ break;
+ }
+ }
+ else if (word->uch_set->get_islower (str, *lengths)) {
+ total_alpha_count++;
+ switch (state) {
+ case SUBSEQUENT_LOWER:
+ case FIRST_LOWER:
+ state = SUBSEQUENT_LOWER;
+ lower_string_count++;
+ if (longest_lower_run_len < lower_string_count)
+ longest_lower_run_len = lower_string_count;
+ if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
+ alpha_repetition_count++;
+ if (longest_alpha_repetition_count < alpha_repetition_count) {
+ longest_alpha_repetition_count = alpha_repetition_count;
+ }
+ }
+ else {
+ last_char = word->uch_set->unichar_to_id(str, *lengths);
+ alpha_repetition_count = 1;
+ }
+ break;
+ case FIRST_NUM:
+ isolated_digits++;
+ // Fall through.
+ default:
+ state = FIRST_LOWER;
+ last_char = word->uch_set->unichar_to_id(str, *lengths);
+ alpha_repetition_count = 1;
+ lower_string_count = 1;
+ break;
+ }
+ }
+ else if (word->uch_set->get_isdigit (str, *lengths)) {
+ total_digit_count++;
+ switch (state) {
+ case FIRST_NUM:
+ state = SUBSEQUENT_NUM;
+ case SUBSEQUENT_NUM:
+ break;
+ case FIRST_UPPER:
+ case FIRST_LOWER:
+ isolated_alphas++;
+ // Fall through.
+ default:
+ state = FIRST_NUM;
+ break;
+ }
+ }
+ else {
+ if (*lengths == 1 && *str == ' ')
+ tess_rejs++;
+ else
+ bad_char_count++;
+ switch (state) {
+ case FIRST_NUM:
+ isolated_digits++;
+ break;
+ case FIRST_UPPER:
+ case FIRST_LOWER:
+ isolated_alphas++;
+ default:
+ break;
+ }
+ state = JUNK;
+ }
+ }
+
+ switch (state) {
+ case FIRST_NUM:
+ isolated_digits++;
+ break;
+ case FIRST_UPPER:
+ case FIRST_LOWER:
+ isolated_alphas++;
+ default:
+ break;
+ }
+
+ if (crunch_include_numerals) {
+ total_alpha_count += total_digit_count - isolated_digits;
+ }
+
+ if (crunch_leave_ok_strings && len >= 4 &&
+ 2 * (total_alpha_count - isolated_alphas) > len &&
+ longest_alpha_repetition_count < crunch_long_repetitions) {
+ if ((crunch_accept_ok &&
+ acceptable_word_string(*word->uch_set, str, lengths) !=
+ AC_UNACCEPTABLE) ||
+ longest_lower_run_len > crunch_leave_lc_strings ||
+ longest_upper_run_len > crunch_leave_uc_strings)
+ return G_NEVER_CRUNCH;
+ }
+ if (word->reject_map.length() > 1 &&
+ strpbrk(str, " ") == nullptr &&
+ (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+ word->best_choice->permuter() == FREQ_DAWG_PERM ||
+ word->best_choice->permuter() == USER_DAWG_PERM ||
+ word->best_choice->permuter() == NUMBER_PERM ||
+ acceptable_word_string(*word->uch_set, str, lengths) !=
+ AC_UNACCEPTABLE || ok_dict_word))
+ return G_OK;
+
+ ok_chars = len - bad_char_count - isolated_digits -
+ isolated_alphas - tess_rejs;
+
+ if (crunch_debug > 3) {
+ tprintf("garbage_word: \"%s\"\n",
+ word->best_choice->unichar_string().c_str());
+ tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
+ len,
+ bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
+ }
+ if (bad_char_count == 0 &&
+ tess_rejs == 0 &&
+ (len > isolated_digits + isolated_alphas || len <= 2))
+ return G_OK;
+
+ if (tess_rejs > ok_chars ||
+ (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
+ return G_TERRIBLE;
+
+ if (len > 4) {
+ dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
+ isolated_alphas;
+ if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
+ return G_DODGY;
+ else
+ return G_OK;
+ } else {
+ dodgy_chars = 2 * tess_rejs + bad_char_count;
+ if ((len == 4 && dodgy_chars > 2) ||
+ (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
+ return G_DODGY;
+ else
+ return G_OK;
+ }
+}
+
+
+/*************************************************************************
+ * word_deletable()
+ * DELETE WERDS AT ENDS OF ROWS IF
+ * Word is crunched &&
+ * ( string length = 0 OR
+ * > 50% of chars are "|" (before merging) OR
+ * certainty < -10 OR
+ * rating /char > 60 OR
+ * TOP of word is more than 0.5 xht BELOW baseline OR
+ * BOTTOM of word is more than 0.5 xht ABOVE xht OR
+ * length of word < 3xht OR
+ * height of word < 0.7 xht OR
+ * height of word > 3.0 xht OR
+ * >75% of the outline BBs have longest dimension < 0.5xht
+ *************************************************************************/
+
// Decides whether a (crunched) word should be deleted or treated as loose
// space.  Sets delete_mode to a reason code (for debug output) and returns
// the crunch mode.  Words not already crunched (CR_NONE) are left alone.
CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
  int word_len = word->reject_map.length ();
  float rating_per_ch;
  TBOX box;                      //BB of word
  // NOTE(review): if rebuild_word is nullptr, box stays default-constructed
  // yet is still used by the geometry tests below (modes 9-11, 3) — confirm
  // that is intended for the Cube path.

  if (word->unlv_crunch_mode == CR_NONE) {
    delete_mode = 0;
    return CR_NONE;
  }

  if (word_len == 0) {
    delete_mode = 1;
    return CR_DELETE;
  }

  if (word->rebuild_word != nullptr) {
    // Cube leaves rebuild_word nullptr.
    box = word->rebuild_word->bounding_box();
    // Word too short to be real text.
    if (box.height () < crunch_del_min_ht * kBlnXHeight) {
      delete_mode = 4;
      return CR_DELETE;
    }

    // Mostly tiny outlines: probably noise.
    if (noise_outlines(word->rebuild_word)) {
      delete_mode = 5;
      return CR_DELETE;
    }
  }

  // More than ~2/3 of the characters are Tesseract failures.
  if ((failure_count (word) * 1.5) > word_len) {
    delete_mode = 2;
    return CR_LOOSE_SPACE;
  }

  if (word->best_choice->certainty () < crunch_del_cert) {
    delete_mode = 7;
    return CR_LOOSE_SPACE;
  }

  rating_per_ch = word->best_choice->rating () / word_len;

  if (rating_per_ch > crunch_del_rating) {
    delete_mode = 8;
    return CR_LOOSE_SPACE;
  }

  // Word sits too far below the baseline.
  if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    delete_mode = 9;
    return CR_LOOSE_SPACE;
  }

  // Word sits too far above the x-height.
  if (box.bottom () >
      kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    delete_mode = 10;
    return CR_LOOSE_SPACE;
  }

  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
    delete_mode = 11;
    return CR_LOOSE_SPACE;
  }

  if (box.width () < crunch_del_min_width * kBlnXHeight) {
    delete_mode = 3;
    return CR_LOOSE_SPACE;
  }

  delete_mode = 0;
  return CR_NONE;
}
+
+int16_t Tesseract::failure_count(WERD_RES *word) {
+ const char *str = word->best_choice->unichar_string().c_str();
+ int tess_rejs = 0;
+
+ for (; *str != '\0'; str++) {
+ if (*str == ' ')
+ tess_rejs++;
+ }
+ return tess_rejs;
+}
+
+
+bool Tesseract::noise_outlines(TWERD* word) {
+ TBOX box; // BB of outline
+ int16_t outline_count = 0;
+ int16_t small_outline_count = 0;
+ int16_t max_dimension;
+ float small_limit = kBlnXHeight * crunch_small_outlines_size;
+
+ for (int b = 0; b < word->NumBlobs(); ++b) {
+ TBLOB* blob = word->blobs[b];
+ for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
+ outline_count++;
+ box = ol->bounding_box();
+ if (box.height() > box.width())
+ max_dimension = box.height();
+ else
+ max_dimension = box.width();
+ if (max_dimension < small_limit)
+ small_outline_count++;
+ }
+ }
+ return small_outline_count >= outline_count;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/docqual.h b/tesseract/src/ccmain/docqual.h
new file mode 100644
index 00000000..57fa9aeb
--- /dev/null
+++ b/tesseract/src/ccmain/docqual.h
@@ -0,0 +1,43 @@
+/******************************************************************
+ * File: docqual.h (Formerly docqual.h)
+ * Description: Document Quality Metrics
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
#ifndef DOCQUAL_H
#define DOCQUAL_H

#include <cstdint> // for int16_t

namespace tesseract {

class PAGE_RES_IT;
class ROW;
class WERD_RES;

// Coarse classification of how garbled a recognized word looks, as computed
// by Tesseract::garbage_word() in docqual.cpp, ordered best to worst.
enum GARBAGE_LEVEL
{
  G_NEVER_CRUNCH,  // Word looks good: must never be crunched.
  G_OK,            // Word appears acceptable.
  G_DODGY,         // Word is suspicious.
  G_TERRIBLE       // Word is almost certainly garbage.
};

// Free functions implemented in docqual.cpp.
int16_t word_blob_quality(WERD_RES* word);
void reject_whole_page(PAGE_RES_IT &page_res_it);

} // namespace tesseract

#endif
diff --git a/tesseract/src/ccmain/equationdetect.cpp b/tesseract/src/ccmain/equationdetect.cpp
new file mode 100644
index 00000000..518468b4
--- /dev/null
+++ b/tesseract/src/ccmain/equationdetect.cpp
@@ -0,0 +1,1516 @@
+///////////////////////////////////////////////////////////////////////
+// File: equationdetect.cpp
+// Description: Helper classes to detect equations.
+// Author: Zongyi (Joe) Liu (joeliu@google.com)
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "equationdetect.h"
+
+#include "bbgrid.h"
+#include "classify.h"
+#include "colpartition.h"
+#include "colpartitiongrid.h"
+#include "colpartitionset.h"
+#include "ratngs.h"
+#include "tesseractclass.h"
+
+#include "helpers.h"
+
+#include <algorithm>
+#include <cfloat>
+#include <limits>
+#include <memory>
+
+namespace tesseract {
+
+// Config variables.
+static BOOL_VAR(equationdetect_save_bi_image, false, "Save input bi image");
+static BOOL_VAR(equationdetect_save_spt_image, false, "Save special character image");
+static BOOL_VAR(equationdetect_save_seed_image, false, "Save the seed image");
+static BOOL_VAR(equationdetect_save_merged_image, false, "Save the merged image");
+
+///////////////////////////////////////////////////////////////////////////
+// Utility ColParition sort functions.
+///////////////////////////////////////////////////////////////////////////
+static int SortCPByTopReverse(const void* p1, const void* p2) {
+ const ColPartition* cp1 = *static_cast<ColPartition* const*>(p1);
+ const ColPartition* cp2 = *static_cast<ColPartition* const*>(p2);
+ ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);
+ const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
+ return box2.top() - box1.top();
+}
+
+static int SortCPByBottom(const void* p1, const void* p2) {
+ const ColPartition* cp1 = *static_cast<ColPartition* const*>(p1);
+ const ColPartition* cp2 = *static_cast<ColPartition* const*>(p2);
+ ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);
+ const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
+ return box1.bottom() - box2.bottom();
+}
+
+static int SortCPByHeight(const void* p1, const void* p2) {
+ const ColPartition* cp1 = *static_cast<ColPartition* const*>(p1);
+ const ColPartition* cp2 = *static_cast<ColPartition* const*>(p2);
+ ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);
+ const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
+ return box1.height() - box2.height();
+}
+
// TODO(joeliu): we may want to parameterize these constants.
// Density/count thresholds used by the equation seed detection below
// (see IdentifySeedParts and the CheckSeed* helpers).
const float kMathDigitDensityTh1 = 0.25;
const float kMathDigitDensityTh2 = 0.1;
const float kMathItalicDensityTh = 0.5;
const float kUnclearDensityTh = 0.25;
const int kSeedBlobsCountTh = 10;
const int kLeftIndentAlignmentCountTh = 1;
+
+// Returns true if PolyBlockType is of text type or equation type.
+inline bool IsTextOrEquationType(PolyBlockType type) {
+ return PTIsTextType(type) || type == PT_EQUATION;
+}
+
+inline bool IsLeftIndented(const EquationDetect::IndentType type) {
+ return type == EquationDetect::LEFT_INDENT ||
+ type == EquationDetect::BOTH_INDENT;
+}
+
+inline bool IsRightIndented(const EquationDetect::IndentType type) {
+ return type == EquationDetect::RIGHT_INDENT ||
+ type == EquationDetect::BOTH_INDENT;
+}
+
// Constructs the detector and loads the equation traineddata ("equ" by
// default) from equ_datapath.  A load failure only produces a warning;
// lang_tesseract_ must be supplied later via SetLangTesseract().
EquationDetect::EquationDetect(const char* equ_datapath,
                               const char* equ_name) {
  const char* default_name = "equ";
  if (equ_name == nullptr) {
    equ_name = default_name;
  }
  lang_tesseract_ = nullptr;
  resolution_ = 0;
  page_count_ = 0;

  if (equ_tesseract_.init_tesseract(equ_datapath, equ_name,
                                    OEM_TESSERACT_ONLY)) {
    tprintf("Warning: equation region detection requested,"
            " but %s failed to load from %s\n", equ_name, equ_datapath);
  }

  cps_super_bbox_ = nullptr;
}
+
+EquationDetect::~EquationDetect() { delete (cps_super_bbox_); }
+
// Stores the language-specific Tesseract instance (not owned; the destructor
// does not free it).  Must be set before FindEquationParts() is called.
void EquationDetect::SetLangTesseract(Tesseract* lang_tesseract) {
  lang_tesseract_ = lang_tesseract;
}
+
// Records the source image resolution used by partition-type computations.
void EquationDetect::SetResolution(const int resolution) {
  resolution_ = resolution;
}
+
// Resets the special text type of every blob (both the normal and the large
// blob lists) in to_block to BSTT_NONE.  Returns 0 on success, -1 if
// to_block is nullptr.
int EquationDetect::LabelSpecialText(TO_BLOCK* to_block) {
  if (to_block == nullptr) {
    tprintf("Warning: input to_block is nullptr!\n");
    return -1;
  }

  GenericVector<BLOBNBOX_LIST*> blob_lists;
  blob_lists.push_back(&(to_block->blobs));
  blob_lists.push_back(&(to_block->large_blobs));
  for (int i = 0; i < blob_lists.size(); ++i) {
    BLOBNBOX_IT bbox_it(blob_lists[i]);
    for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
         bbox_it.forward()) {
      bbox_it.data()->set_special_text_type(BSTT_NONE);
    }
  }

  return 0;
}
+
// Classifies a single blob's special text type by running both the equation
// classifier and the language classifier on a normalized copy and comparing
// their best certainties.  Blobs shorter than height_th are set to BSTT_NONE
// without classification.
void EquationDetect::IdentifySpecialText(
    BLOBNBOX *blobnbox, const int height_th) {
  ASSERT_HOST(blobnbox != nullptr);
  if (blobnbox->bounding_box().height() < height_th && height_th > 0) {
    // For small blob, we simply set to BSTT_NONE.
    blobnbox->set_special_text_type(BSTT_NONE);
    return;
  }

  BLOB_CHOICE_LIST ratings_equ, ratings_lang;
  C_BLOB* blob = blobnbox->cblob();
  // TODO(joeliu/rays) Fix this. We may have to normalize separately for
  // each classifier here, as they may require different PolygonalCopy.
  TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
  const TBOX& box = tblob->bounding_box();

  // Normalize the blob. Set the origin to the place we want to be the
  // bottom-middle, and scaling is to make the height the x-height.
  const float scaling = static_cast<float>(kBlnXHeight) / box.height();
  const float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
  std::unique_ptr<TBLOB> normed_blob(new TBLOB(*tblob));
  normed_blob->Normalize(nullptr, nullptr, nullptr, x_orig, y_orig, scaling, scaling,
                         0.0f, static_cast<float>(kBlnBaselineOffset),
                         false, nullptr);
  equ_tesseract_.AdaptiveClassifier(normed_blob.get(), &ratings_equ);
  lang_tesseract_->AdaptiveClassifier(normed_blob.get(), &ratings_lang);
  delete tblob;

  // Get the best choice from ratings_lang and rating_equ. As the choice in the
  // list has already been sorted by the certainty, we simply use the first
  // choice.
  BLOB_CHOICE *lang_choice = nullptr, *equ_choice = nullptr;
  if (ratings_lang.length() > 0) {
    BLOB_CHOICE_IT choice_it(&ratings_lang);
    lang_choice = choice_it.data();
  }
  if (ratings_equ.length() > 0) {
    BLOB_CHOICE_IT choice_it(&ratings_equ);
    equ_choice = choice_it.data();
  }

  // Missing choices score -FLT_MAX so they can never win a comparison.
  const float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX;
  const float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX;

  const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;
  // The scores here are negative, so the max/min == fabs(min/max).
  // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score);
  const float diff = fabs(lang_score - equ_score);
  BlobSpecialTextType type = BSTT_NONE;

  // Classification.
  if (fmax(lang_score, equ_score) < kConfScoreTh) {
    // If both score are very small, then mark it as unclear.
    type = BSTT_UNCLEAR;
  } else if (diff > kConfDiffTh && equ_score > lang_score) {
    // If equ_score is significantly higher, then we classify this character as
    // math symbol.
    type = BSTT_MATH;
  } else if (lang_choice) {
    // For other cases: lang_score is similar or significantly higher.
    type = EstimateTypeForUnichar(
        lang_tesseract_->unicharset, lang_choice->unichar_id());
  }

  // type can only still be BSTT_NONE via the lang_choice branch above, so
  // lang_choice is non-null whenever this dereference happens.
  if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get(
      lang_choice->fontinfo_id()).is_italic()) {
    // For text symbol, we still check if it is italic.
    blobnbox->set_special_text_type(BSTT_ITALIC);
  } else {
    blobnbox->set_special_text_type(type);
  }
}
+
// Maps a unichar id to a special text type based on its character class:
// alphas -> BSTT_NONE; punctuation -> BSTT_MATH unless in an exclusion list
// of quote/bracket-like characters; digits (and "|") -> BSTT_DIGIT;
// everything else -> BSTT_MATH.
BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(
    const UNICHARSET& unicharset, const UNICHAR_ID id) const {
  const STRING s = unicharset.id_to_unichar(id);
  if (unicharset.get_isalpha(id)) {
    return BSTT_NONE;
  }

  if (unicharset.get_ispunctuation(id)) {
    // Exclude some special texts that are likely to be confused as math symbol.
    // NOTE(review): the lazy init of this static vector is not obviously
    // thread-safe if this method is called concurrently — confirm.
    static GenericVector<UNICHAR_ID> ids_to_exclude;
    if (ids_to_exclude.empty()) {
      static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
          "〈", "〉", "《", "》", "」", "「", ""};
      int i = 0;
      while (kCharsToEx[i] != "") {
        ids_to_exclude.push_back(
            unicharset.unichar_to_id(kCharsToEx[i++].c_str()));
      }
      ids_to_exclude.sort();
    }
    return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
  }

  // Check if it is digit. In addition to the isdigit attribute, we also check
  // if this character belongs to those likely to be confused with a digit.
  static const STRING kDigitsChars = "|";
  if (unicharset.get_isdigit(id) ||
      (s.length() == 1 && kDigitsChars.contains(s[0]))) {
    return BSTT_DIGIT;
  } else {
    return BSTT_MATH;
  }
}
+
// Pass 0 of equation detection: classifies the special text type of every
// blob in every text/equation partition.  Temporarily reconfigures both
// classifiers (CN matching on for equ, pruner multipliers zeroed for lang)
// and restores the lang settings afterwards.
void EquationDetect::IdentifySpecialText() {
  // Set configuration for Tesseract::AdaptiveClassifier.
  equ_tesseract_.tess_cn_matching.set_value(1);  // turn it on
  equ_tesseract_.tess_bn_matching.set_value(0);

  // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
  const int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;
  const int classify_integer_matcher =
      lang_tesseract_->classify_integer_matcher_multiplier;
  lang_tesseract_->classify_class_pruner_multiplier.set_value(0);
  lang_tesseract_->classify_integer_matcher_multiplier.set_value(0);

  ColPartitionGridSearch gsearch(part_grid_);
  ColPartition *part = nullptr;
  gsearch.StartFullSearch();
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    if (!IsTextOrEquationType(part->type())) {
      continue;
    }
    IdentifyBlobsToSkip(part);
    BLOBNBOX_C_IT bbox_it(part->boxes());
    // Compute the height threshold: 2/3 of the median non-skipped height.
    // NOTE(review): if every blob were BSTT_SKIP, blob_heights would be
    // empty and the median index below would be out of range — confirm
    // IdentifyBlobsToSkip cannot mark all blobs.
    GenericVector<int> blob_heights;
    for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
         bbox_it.forward()) {
      if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
        blob_heights.push_back(bbox_it.data()->bounding_box().height());
      }
    }
    blob_heights.sort();
    const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
    for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
         bbox_it.forward()) {
      if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
        IdentifySpecialText(bbox_it.data(), height_th);
      }
    }
  }

  // Set the multiplier values back.
  lang_tesseract_->classify_class_pruner_multiplier.set_value(
      classify_class_pruner);
  lang_tesseract_->classify_integer_matcher_multiplier.set_value(
      classify_integer_matcher);

  if (equationdetect_save_spt_image) {  // For debug.
    STRING outfile;
    GetOutputTiffName("_spt", &outfile);
    PaintSpecialTexts(outfile);
  }
}
+
// Marks as BSTT_SKIP any group of horizontally overlapping, similarly sized
// blobs in the partition (e.g. diacritic pieces stacked over a base), so the
// per-blob classification skips them.
void EquationDetect::IdentifyBlobsToSkip(ColPartition* part) {
  ASSERT_HOST(part);
  BLOBNBOX_C_IT blob_it(part->boxes());

  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    // At this moment, no blob should have been joined.
    ASSERT_HOST(!blob_it.data()->joined_to_prev());
  }
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX* blob = blob_it.data();
    if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {
      continue;
    }
    TBOX blob_box = blob->bounding_box();

    // Search if any blob can be merged into blob. If found, then we mark all
    // these blobs as BSTT_SKIP.
    BLOBNBOX_C_IT blob_it2 = blob_it;
    bool found = false;
    while (!blob_it2.at_last()) {
      BLOBNBOX* nextblob = blob_it2.forward();
      const TBOX& nextblob_box = nextblob->bounding_box();
      // Blobs are in x order, so once the gap opens we can stop.
      if (nextblob_box.left() >= blob_box.right()) {
        break;
      }
      const float kWidthR = 0.4, kHeightR = 0.3;
      const bool xoverlap = blob_box.major_x_overlap(nextblob_box),
          yoverlap = blob_box.y_overlap(nextblob_box);
      const float widthR = static_cast<float>(
          std::min(nextblob_box.width(), blob_box.width())) /
          std::max(nextblob_box.width(), blob_box.width());
      const float heightR = static_cast<float>(
          std::min(nextblob_box.height(), blob_box.height())) /
          std::max(nextblob_box.height(), blob_box.height());

      if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
        // Found one, set nextblob type and recompute blob_box.
        found = true;
        nextblob->set_special_text_type(BSTT_SKIP);
        blob_box += nextblob_box;
      }
    }
    if (found) {
      blob->set_special_text_type(BSTT_SKIP);
    }
  }
}
+
// Main entry point: runs the multi-pass equation detection over the page's
// partition grid.  Returns 0 on success, -1 when lang_tesseract_ or the
// inputs are missing.  Passes: 0) classify blob special-text types,
// 1) merge overlapping partitions, 2) find seed partitions, 3) iteratively
// expand block-equation seeds, 4) absorb satellite text partitions.
int EquationDetect::FindEquationParts(
    ColPartitionGrid* part_grid, ColPartitionSet** best_columns) {
  if (!lang_tesseract_) {
    tprintf("Warning: lang_tesseract_ is nullptr!\n");
    return -1;
  }
  if (!part_grid || !best_columns) {
    tprintf("part_grid/best_columns is nullptr!!\n");
    return -1;
  }
  cp_seeds_.clear();
  part_grid_ = part_grid;
  best_columns_ = best_columns;
  resolution_ = lang_tesseract_->source_resolution();
  STRING outfile;
  page_count_++;

  if (equationdetect_save_bi_image) {
    GetOutputTiffName("_bi", &outfile);
    pixWrite(outfile.c_str(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
  }

  // Pass 0: Compute special text type for blobs.
  IdentifySpecialText();

  // Pass 1: Merge parts by overlap.
  MergePartsByLocation();

  // Pass 2: compute the math blob density and find the seed partition.
  IdentifySeedParts();
  // We still need separate seed into block seed and inline seed partition.
  IdentifyInlineParts();

  if (equationdetect_save_seed_image) {
    GetOutputTiffName("_seed", &outfile);
    PaintColParts(outfile);
  }

  // Pass 3: expand block equation seeds.
  // Each round expands what it can; expanded seeds are re-inserted and
  // retried until no seed expands any further.
  while (!cp_seeds_.empty()) {
    GenericVector<ColPartition*> seeds_expanded;
    for (int i = 0; i < cp_seeds_.size(); ++i) {
      if (ExpandSeed(cp_seeds_[i])) {
        // If this seed is expanded, then we add it into seeds_expanded. Note
        // this seed has been removed from part_grid_ if it is expanded.
        seeds_expanded.push_back(cp_seeds_[i]);
      }
    }
    // Add seeds_expanded back into part_grid_ and reset cp_seeds_.
    for (int i = 0; i < seeds_expanded.size(); ++i) {
      InsertPartAfterAbsorb(seeds_expanded[i]);
    }
    cp_seeds_ = seeds_expanded;
  }

  // Pass 4: find math block satellite text partitions and merge them.
  ProcessMathBlockSatelliteParts();

  if (equationdetect_save_merged_image) {  // For debug.
    GetOutputTiffName("_merged", &outfile);
    PaintColParts(outfile);
  }

  return 0;
}
+
// Repeatedly absorbs text/equation partitions that overlap each other
// (per SearchByOverlap) until a full sweep of the grid merges nothing.
void EquationDetect::MergePartsByLocation() {
  while (true) {
    ColPartition* part = nullptr;
    // partitions that have been updated.
    GenericVector<ColPartition*> parts_updated;
    ColPartitionGridSearch gsearch(part_grid_);
    gsearch.StartFullSearch();
    while ((part = gsearch.NextFullSearch()) != nullptr) {
      if (!IsTextOrEquationType(part->type())) {
        continue;
      }
      GenericVector<ColPartition*> parts_to_merge;
      SearchByOverlap(part, &parts_to_merge);
      if (parts_to_merge.empty()) {
        continue;
      }

      // Merge parts_to_merge with part, and remove them from part_grid_.
      part_grid_->RemoveBBox(part);
      for (int i = 0; i < parts_to_merge.size(); ++i) {
        ASSERT_HOST(parts_to_merge[i] != nullptr && parts_to_merge[i] != part);
        part->Absorb(parts_to_merge[i], nullptr);
      }
      // The grid was mutated while iterating: resync the search iterator.
      gsearch.RepositionIterator();

      parts_updated.push_back(part);
    }

    if (parts_updated.empty()) {  // Exit the loop
      break;
    }

    // Re-insert parts_updated into part_grid_.
    for (int i = 0; i < parts_updated.size(); ++i) {
      InsertPartAfterAbsorb(parts_updated[i]);
    }
  }
}
+
// Collects, into parts_overlap, the text/equation partitions near seed that
// should be merged into it: any partition almost fully overlapping the seed,
// or (when the seed is already PT_EQUATION) any with moderate x/y overlap.
// Collected partitions are removed from the grid by the search.
void EquationDetect::SearchByOverlap(
    ColPartition* seed,
    GenericVector<ColPartition*>* parts_overlap) {
  ASSERT_HOST(seed != nullptr && parts_overlap != nullptr);
  if (!IsTextOrEquationType(seed->type())) {
    return;
  }
  ColPartitionGridSearch search(part_grid_);
  const TBOX& seed_box(seed->bounding_box());
  const int kRadNeighborCells = 30;
  search.StartRadSearch((seed_box.left() + seed_box.right()) / 2,
                        (seed_box.top() + seed_box.bottom()) / 2,
                        kRadNeighborCells);
  search.SetUniqueMode(true);

  // Search iteratively.
  ColPartition *part;
  GenericVector<ColPartition*> parts;
  const float kLargeOverlapTh = 0.95;
  const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
  while ((part = search.NextRadSearch()) != nullptr) {
    if (part == seed || !IsTextOrEquationType(part->type())) {
      continue;
    }
    const TBOX& part_box(part->bounding_box());
    bool merge = false;

    const float x_overlap_fraction = part_box.x_overlap_fraction(seed_box),
        y_overlap_fraction = part_box.y_overlap_fraction(seed_box);

    // If part is large overlapped with seed, then set merge to true.
    if (x_overlap_fraction >= kLargeOverlapTh &&
        y_overlap_fraction >= kLargeOverlapTh) {
      merge = true;
    } else if (seed->type() == PT_EQUATION &&
               IsTextOrEquationType(part->type())) {
      // Equation seeds absorb neighbors with weaker overlap in one axis.
      if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||
          (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {
        merge = true;
      }
    }

    if (merge) {  // Remove the part from search and put it into parts.
      search.RemoveBBox();
      parts_overlap->push_back(part);
    }
  }
}
+
// Re-inserts a partition into part_grid_ after it absorbed others:
// recomputes its column attributes via SetPartitionType while preserving its
// original type/blob_type/flow.
void EquationDetect::InsertPartAfterAbsorb(ColPartition* part) {
  ASSERT_HOST(part);

  // Before insert part back into part_grid_, we will need re-compute some
  // of its attributes such as first_column_, last_column_. However, we still
  // want to preserve its type.
  BlobTextFlowType flow_type = part->flow();
  PolyBlockType part_type = part->type();
  BlobRegionType blob_type = part->blob_type();

  // Call SetPartitionType to re-compute the attributes of part.
  const TBOX& part_box(part->bounding_box());
  int grid_x, grid_y;
  part_grid_->GridCoords(
      part_box.left(), part_box.bottom(), &grid_x, &grid_y);
  part->SetPartitionType(resolution_, best_columns_[grid_y]);

  // Reset the types back.
  part->set_type(part_type);
  part->set_blob_type(blob_type);
  part->set_flow(flow_type);
  part->SetBlobTypes();

  // Insert into part_grid_.
  part_grid_->InsertBBox(true, true, part);
}
+
// Pass 2: scans all text/equation partitions, computes their special-blob
// densities, and selects equation seeds.  High-density candidates (seeds1)
// become PT_EQUATION or PT_INLINE_EQUATION; lower-density but left-indented
// candidates (seeds2) are verified by CheckForSeed2.  Non-seed text
// partitions contribute the statistics (left-indent positions, foreground
// density) used to verify the candidates.
void EquationDetect::IdentifySeedParts() {
  ColPartitionGridSearch gsearch(part_grid_);
  ColPartition *part = nullptr;
  gsearch.StartFullSearch();

  GenericVector<ColPartition*> seeds1, seeds2;
  // The left coordinates of indented text partitions.
  GenericVector<int> indented_texts_left;
  // The foreground density of text partitions.
  GenericVector<float> texts_foreground_density;
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    if (!IsTextOrEquationType(part->type())) {
      continue;
    }
    part->ComputeSpecialBlobsDensity();
    const bool blobs_check = CheckSeedBlobsCount(part);
    const int kTextBlobsTh = 20;

    if (CheckSeedDensity(kMathDigitDensityTh1, kMathDigitDensityTh2, part) &&
        blobs_check) {
      // Passed high density threshold test, save into seeds1.
      seeds1.push_back(part);
    } else {
      IndentType indent = IsIndented(part);
      if (IsLeftIndented(indent) && blobs_check &&
          CheckSeedDensity(kMathDigitDensityTh2, kMathDigitDensityTh2, part)) {
        // Passed low density threshold test and is indented, save into seeds2.
        seeds2.push_back(part);
      } else if (!IsRightIndented(indent) &&
                 part->boxes_count() > kTextBlobsTh) {
        // This is likely to be a text part, save the features.
        const TBOX&box = part->bounding_box();
        if (IsLeftIndented(indent)) {
          indented_texts_left.push_back(box.left());
        }
        texts_foreground_density.push_back(ComputeForegroundDensity(box));
      }
    }
  }

  // Sort the features collected from text regions.
  indented_texts_left.sort();
  texts_foreground_density.sort();
  float foreground_density_th = 0.15;  // Default value.
  if (!texts_foreground_density.empty()) {
    // Use the median of the texts_foreground_density.
    foreground_density_th = 0.8 * texts_foreground_density[
        texts_foreground_density.size() / 2];
  }

  for (int i = 0; i < seeds1.size(); ++i) {
    const TBOX& box = seeds1[i]->bounding_box();
    // A dense candidate whose left edge aligns with indented text is more
    // likely an indented paragraph than a displayed equation.
    if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) &&
        !(IsLeftIndented(IsIndented(seeds1[i])) &&
          CountAlignment(indented_texts_left, box.left()) >=
          kLeftIndentAlignmentCountTh)) {
      // Mark as PT_EQUATION type.
      seeds1[i]->set_type(PT_EQUATION);
      cp_seeds_.push_back(seeds1[i]);
    } else {  // Mark as PT_INLINE_EQUATION type.
      seeds1[i]->set_type(PT_INLINE_EQUATION);
    }
  }

  for (int i = 0; i < seeds2.size(); ++i) {
    if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) {
      seeds2[i]->set_type(PT_EQUATION);
      cp_seeds_.push_back(seeds2[i]);
    }
  }
}
+
+float EquationDetect::ComputeForegroundDensity(const TBOX& tbox) {
+ Pix *pix_bi = lang_tesseract_->pix_binary();
+ const int pix_height = pixGetHeight(pix_bi);
+ Box* box = boxCreate(tbox.left(), pix_height - tbox.top(),
+ tbox.width(), tbox.height());
+ Pix *pix_sub = pixClipRectangle(pix_bi, box, nullptr);
+ l_float32 fract;
+ pixForegroundFraction(pix_sub, &fract);
+ pixDestroy(&pix_sub);
+ boxDestroy(&box);
+
+ return fract;
+}
+
+bool EquationDetect::CheckSeedFgDensity(const float density_th,
+ ColPartition* part) {
+ ASSERT_HOST(part);
+
+ // Split part horizontall, and check for each sub part.
+ GenericVector<TBOX> sub_boxes;
+ SplitCPHorLite(part, &sub_boxes);
+ float parts_passed = 0.0;
+ for (int i = 0; i < sub_boxes.size(); ++i) {
+ const float density = ComputeForegroundDensity(sub_boxes[i]);
+ if (density < density_th) {
+ parts_passed++;
+ }
+ }
+
+ // If most sub parts passed, then we return true.
+ const float kSeedPartRatioTh = 0.3;
+ bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);
+
+ return retval;
+}
+
// Splits part into pieces at horizontal gaps wider than 3x its median blob
// width, storing the resulting ColPartitions (which own their boxes' list
// nodes but not the blobs) into parts_splitted. part itself becomes the
// leftmost piece's source via CopyButDontOwnBlobs, so part is not modified.
void EquationDetect::SplitCPHor(ColPartition* part,
                                GenericVector<ColPartition*>* parts_splitted) {
  ASSERT_HOST(part && parts_splitted);
  // Nothing sensible to split: no width statistics or no blobs.
  if (part->median_width() == 0 || part->boxes_count() == 0) {
    return;
  }

  // Make a copy of part, and reset parts_splitted.
  ColPartition* right_part = part->CopyButDontOwnBlobs();
  parts_splitted->delete_data_pointers();
  parts_splitted->clear();

  // A gap wider than 3 median blob widths separates two pieces.
  const double kThreshold = part->median_width() * 3.0;
  bool found_split = true;
  while (found_split) {
    found_split = false;
    BLOBNBOX_C_IT box_it(right_part->boxes());
    // Blobs are sorted left side first. If blobs overlap,
    // the previous blob may have a "more right" right side.
    // Account for this by always keeping the largest "right"
    // so far.
    int previous_right = INT32_MIN;

    // Look for the next split in the partition.
    for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
      const TBOX& box = box_it.data()->bounding_box();
      if (previous_right != INT32_MIN &&
          box.left() - previous_right > kThreshold) {
        // We have a split position. Split the partition in two pieces.
        // Insert the left piece in the grid and keep processing the right.
        const int mid_x = (box.left() + previous_right) / 2;
        ColPartition* left_part = right_part;
        right_part = left_part->SplitAt(mid_x);

        parts_splitted->push_back(left_part);
        left_part->ComputeSpecialBlobsDensity();
        // Restart the scan on the remaining right piece.
        found_split = true;
        break;
      }

      // The right side of the previous blobs.
      previous_right = std::max(previous_right, static_cast<int>(box.right()));
    }
  }

  // Add the last piece.
  right_part->ComputeSpecialBlobsDensity();
  parts_splitted->push_back(right_part);
}
+
// Lightweight variant of SplitCPHor: computes only the bounding boxes of
// the pieces that a horizontal split of part would produce (gaps wider
// than 3x the median blob width), without creating any ColPartitions.
void EquationDetect::SplitCPHorLite(ColPartition* part,
                                    GenericVector<TBOX>* splitted_boxes) {
  ASSERT_HOST(part && splitted_boxes);
  splitted_boxes->clear();
  // No width statistics means no sensible split threshold.
  if (part->median_width() == 0) {
    return;
  }

  const double kThreshold = part->median_width() * 3.0;

  // Blobs are sorted left side first. If blobs overlap,
  // the previous blob may have a "more right" right side.
  // Account for this by always keeping the largest "right"
  // so far.
  TBOX union_box;
  int previous_right = INT32_MIN;
  BLOBNBOX_C_IT box_it(part->boxes());
  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
    const TBOX& box = box_it.data()->bounding_box();
    if (previous_right != INT32_MIN &&
        box.left() - previous_right > kThreshold) {
      // We have a split position: emit the accumulated piece and reset.
      splitted_boxes->push_back(union_box);
      previous_right = INT32_MIN;
    }
    if (previous_right == INT32_MIN) {
      // Start a new piece with the current blob.
      union_box = box;
    } else {
      union_box += box;
    }
    // The right side of the previous blobs.
    previous_right = std::max(previous_right, static_cast<int>(box.right()));
  }

  // Add the last piece.
  if (previous_right != INT32_MIN) {
    splitted_boxes->push_back(union_box);
  }
}
+
+bool EquationDetect::CheckForSeed2(
+ const GenericVector<int>& indented_texts_left,
+ const float foreground_density_th,
+ ColPartition* part) {
+ ASSERT_HOST(part);
+ const TBOX& box = part->bounding_box();
+
+ // Check if it is aligned with any indented_texts_left.
+ if (!indented_texts_left.empty() &&
+ CountAlignment(indented_texts_left, box.left()) >=
+ kLeftIndentAlignmentCountTh) {
+ return false;
+ }
+
+ // Check the foreground density.
+ if (ComputeForegroundDensity(box) > foreground_density_th) {
+ return false;
+ }
+
+ return true;
+}
+
// Returns the number of elements in sorted_vec whose value is within
// kDistTh of val. From the position returned by binary_search, it scans
// leftwards and then rightwards until the distance threshold is exceeded.
int EquationDetect::CountAlignment(
    const GenericVector<int>& sorted_vec, const int val) const {
  if (sorted_vec.empty()) {
    return 0;
  }
  // Alignment tolerance: 0.03 inch at the image resolution.
  const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
  const int pos = sorted_vec.binary_search(val);
  int count = 0;

  // Search left side. Note the post-decrement: the element at index is
  // tested before index moves left.
  int index = pos;
  while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
    count++;
  }

  // Search right side (elements right of pos are >= val, so no abs needed).
  index = pos + 1;
  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
    count++;
  }

  return count;
}
+
// Filters cp_seeds_ down to displayed-equation candidates by detecting and
// re-typing inline equations: first horizontally (seeds embedded in a text
// line), then vertically in both directions using the estimated text line
// spacing.
void EquationDetect::IdentifyInlineParts() {
  ComputeCPsSuperBBox();
  IdentifyInlinePartsHorizontal();
  const int textparts_linespacing = EstimateTextPartLineSpacing();
  IdentifyInlinePartsVertical(true, textparts_linespacing);
  IdentifyInlinePartsVertical(false, textparts_linespacing);
}
+
+void EquationDetect::ComputeCPsSuperBBox() {
+ ColPartitionGridSearch gsearch(part_grid_);
+ ColPartition *part = nullptr;
+ gsearch.StartFullSearch();
+ delete cps_super_bbox_;
+ cps_super_bbox_ = new TBOX();
+ while ((part = gsearch.NextFullSearch()) != nullptr) {
+ (*cps_super_bbox_) += part->bounding_box();
+ }
+}
+
// Scans cp_seeds_ for seeds that sit inside a text line: a seed that is
// left aligned with the page (or lies in the right half) is checked for a
// same-line text/equation neighbor on its open side. Depending on the
// neighbor found, the seed is either re-typed PT_INLINE_EQUATION or kept.
// Surviving seeds are written back into cp_seeds_.
void EquationDetect::IdentifyInlinePartsHorizontal() {
  ASSERT_HOST(cps_super_bbox_);
  GenericVector<ColPartition*> new_seeds;
  // Margin tolerance: 0.5 inch at source resolution.
  const int kMarginDiffTh = IntCastRounded(
      0.5 * lang_tesseract_->source_resolution());
  // Maximum horizontal gap to a same-line neighbor: 1 inch.
  const int kGapTh = static_cast<int>(roundf(
      1.0 * lang_tesseract_->source_resolution()));
  ColPartitionGridSearch search(part_grid_);
  search.SetUniqueMode(true);
  // The center x coordinate of the cp_super_bbox_.
  const int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;
  for (int i = 0; i < cp_seeds_.size(); ++i) {
    ColPartition* part = cp_seeds_[i];
    const TBOX& part_box(part->bounding_box());
    const int left_margin = part_box.left() - cps_super_bbox_->left(),
        right_margin = cps_super_bbox_->right() - part_box.right();
    bool right_to_left;
    if (left_margin + kMarginDiffTh < right_margin &&
        left_margin < kMarginDiffTh) {
      // part is left aligned, so we search if it has any right neighbor.
      search.StartSideSearch(
          part_box.right(), part_box.top(), part_box.bottom());
      right_to_left = false;
    } else if (left_margin > cps_cx) {
      // part locates on the right half on image, so search if it has any left
      // neighbor.
      search.StartSideSearch(
          part_box.left(), part_box.top(), part_box.bottom());
      right_to_left = true;
    } else {  // part is not an inline equation.
      new_seeds.push_back(part);
      continue;
    }
    ColPartition* neighbor = nullptr;
    bool side_neighbor_found = false;
    while ((neighbor = search.NextSideSearch(right_to_left)) != nullptr) {
      const TBOX& neighbor_box(neighbor->bounding_box());
      // Neighbor must be text/equation, close enough, on the same line
      // (major y overlap) and not overlapping part horizontally.
      if (!IsTextOrEquationType(neighbor->type()) ||
          part_box.x_gap(neighbor_box) > kGapTh ||
          !part_box.major_y_overlap(neighbor_box) ||
          part_box.major_x_overlap(neighbor_box)) {
        continue;
      }
      // We have found one. Set the side_neighbor_found flag.
      side_neighbor_found = true;
      break;
    }
    if (!side_neighbor_found) {  // Mark part as PT_INLINE_EQUATION.
      part->set_type(PT_INLINE_EQUATION);
    } else {
      // Check the geometric feature of neighbor.
      const TBOX& neighbor_box(neighbor->bounding_box());
      if (neighbor_box.width() > part_box.width() &&
          neighbor->type() != PT_EQUATION) {  // Mark as PT_INLINE_EQUATION.
        part->set_type(PT_INLINE_EQUATION);
      } else {  // part is not an inline equation type.
        new_seeds.push_back(part);
      }
    }
  }

  // Reset the cp_seeds_ using the new_seeds.
  cp_seeds_ = new_seeds;
}
+
// Estimates the typical vertical spacing between consecutive text
// partitions. Returns the mean of the smaller half of the observed y gaps,
// or -1 if fewer than 8 gaps were collected. Note: this relies on the grid
// search order pairing vertically adjacent partitions as prev/current.
int EquationDetect::EstimateTextPartLineSpacing() {
  ColPartitionGridSearch gsearch(part_grid_);

  // Get the y gap between text partitions;
  ColPartition *current = nullptr, *prev = nullptr;
  gsearch.StartFullSearch();
  GenericVector<int> ygaps;
  while ((current = gsearch.NextFullSearch()) != nullptr) {
    if (!PTIsTextType(current->type())) {
      continue;
    }
    if (prev != nullptr) {
      const TBOX &current_box = current->bounding_box();
      const TBOX &prev_box = prev->bounding_box();
      // prev and current should be x major overlap and non y overlap.
      if (current_box.major_x_overlap(prev_box) &&
          !current_box.y_overlap(prev_box)) {
        int gap = current_box.y_gap(prev_box);
        if (gap < std::min(current_box.height(), prev_box.height())) {
          // The gap should be smaller than the height of the bounding boxes.
          ygaps.push_back(gap);
        }
      }
    }
    prev = current;
  }

  if (ygaps.size() < 8) {  // We do not have enough data.
    return -1;
  }

  // Compute the line spacing from ygaps: use the mean of the first half.
  ygaps.sort();
  int spacing = 0, count;
  for (count = 0; count < ygaps.size() / 2; count++) {
    spacing += ygaps[count];
  }
  return spacing / count;
}
+
// Re-types as PT_INLINE_EQUATION any seed whose nearest vertical text
// neighbor makes it look like a line inside a paragraph. The seeds are
// processed in sorted order (top-to-bottom or bottom-to-top) so that
// chains of connected inline regions are peeled off one end at a time.
// Surviving seeds are written back into cp_seeds_.
void EquationDetect::IdentifyInlinePartsVertical(
    const bool top_to_bottom, const int textparts_linespacing) {
  if (cp_seeds_.empty()) {
    return;
  }

  // Sort cp_seeds_.
  if (top_to_bottom) {  // From top to bottom.
    cp_seeds_.sort(&SortCPByTopReverse);
  } else {  // From bottom to top.
    cp_seeds_.sort(&SortCPByBottom);
  }

  GenericVector<ColPartition*> new_seeds;
  for (int i = 0; i < cp_seeds_.size(); ++i) {
    ColPartition* part = cp_seeds_[i];
    // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
    // for its top neighbors, so that if two/more inline regions are connected
    // to each other, then we will identify the top one, and then use it to
    // identify the bottom one.
    if (IsInline(!top_to_bottom, textparts_linespacing, part)) {
      part->set_type(PT_INLINE_EQUATION);
    } else {
      new_seeds.push_back(part);
    }
  }
  cp_seeds_ = new_seeds;
}
+
// Returns true if part looks like an inline equation: it has a nearby text
// neighbor in the searched vertical direction that overlaps it in x, lies
// within the expected line spacing, and has a similar height.
bool EquationDetect::IsInline(const bool search_bottom,
                              const int textparts_linespacing,
                              ColPartition* part) {
  ASSERT_HOST(part != nullptr);
  // Look for its nearest vertical neighbor that hardly overlaps in y but
  // largely overlaps in x.
  ColPartitionGridSearch search(part_grid_);
  ColPartition *neighbor = nullptr;
  const TBOX& part_box(part->bounding_box());
  // Stop searching once the y gap exceeds the smaller box height.
  const float kYGapRatioTh = 1.0;

  if (search_bottom) {
    search.StartVerticalSearch(part_box.left(), part_box.right(),
                               part_box.bottom());
  } else {
    search.StartVerticalSearch(part_box.left(), part_box.right(),
                               part_box.top());
  }
  search.SetUniqueMode(true);
  while ((neighbor = search.NextVerticalSearch(search_bottom)) != nullptr) {
    const TBOX& neighbor_box(neighbor->bounding_box());
    if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
        std::min(part_box.height(), neighbor_box.height())) {
      // Finished searching.
      break;
    }
    if (!PTIsTextType(neighbor->type())) {
      continue;
    }

    // Check if neighbor and part is inline similar.
    const float kHeightRatioTh = 0.5;
    // Line-spacing threshold: use the estimate plus a small slack when
    // available, otherwise a resolution-based default.
    const int kYGapTh = textparts_linespacing > 0 ?
        textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)):
        static_cast<int>(roundf(0.05 * resolution_));  // Default value.
    if (part_box.x_overlap(neighbor_box) &&  // Location feature.
        part_box.y_gap(neighbor_box) <= kYGapTh &&  // Line spacing.
        // Geo feature.
        static_cast<float>(std::min(part_box.height(), neighbor_box.height())) /
        std::max(part_box.height(), neighbor_box.height()) > kHeightRatioTh) {
      return true;
    }
  }

  return false;
}
+
+bool EquationDetect::CheckSeedBlobsCount(ColPartition* part) {
+ if (!part) {
+ return false;
+ }
+ const int kSeedMathBlobsCount = 2;
+ const int kSeedMathDigitBlobsCount = 5;
+
+ const int blobs = part->boxes_count(),
+ math_blobs = part->SpecialBlobsCount(BSTT_MATH),
+ digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT);
+ if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||
+ math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {
+ return false;
+ }
+
+ return true;
+}
+
+bool EquationDetect::CheckSeedDensity(
+ const float math_density_high,
+ const float math_density_low,
+ const ColPartition* part) const {
+ ASSERT_HOST(part);
+ float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH)
+ + part->SpecialBlobsDensity(BSTT_DIGIT);
+ float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC);
+ if (math_digit_density > math_density_high) {
+ return true;
+ }
+ if (math_digit_density + italic_density > kMathItalicDensityTh &&
+ math_digit_density > math_density_low) {
+ return true;
+ }
+
+ return false;
+}
+
// Determines whether part is indented relative to its vertical neighbors,
// using a radius search around part's center. Returns NO_INDENT early if a
// close same-line neighbor is found (part is then likely a fragment of an
// over-segmented partition), otherwise LEFT/RIGHT/BOTH_INDENT based on how
// the neighbors' edges extend beyond part's.
EquationDetect::IndentType EquationDetect::IsIndented(ColPartition* part) {
  ASSERT_HOST(part);

  ColPartitionGridSearch search(part_grid_);
  ColPartition *neighbor = nullptr;
  const TBOX& part_box(part->bounding_box());
  // Thresholds in pixels: 0.5 inch x/y gaps, 3 inch search radius.
  const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
  const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
  const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));

  // Here we use a simple approximation algorithm: from the center of part, We
  // perform the radius search, and check if we can find a neighboring partition
  // that locates on the top/bottom left of part.
  search.StartRadSearch((part_box.left() + part_box.right()) / 2,
                        (part_box.top() + part_box.bottom()) / 2, kRadiusTh);
  search.SetUniqueMode(true);
  bool left_indented = false, right_indented = false;
  while ((neighbor = search.NextRadSearch()) != nullptr &&
         (!left_indented || !right_indented)) {
    if (neighbor == part) {
      continue;
    }
    const TBOX& neighbor_box(neighbor->bounding_box());

    if (part_box.major_y_overlap(neighbor_box) &&
        part_box.x_gap(neighbor_box) < kXGapTh) {
      // When this happens, it is likely part is a fragment of an
      // over-segmented colpartition. So we return false.
      return NO_INDENT;
    }

    if (!IsTextOrEquationType(neighbor->type())) {
      continue;
    }

    // The neighbor should be above/below part, and overlap in x direction.
    if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {
      continue;
    }

    if (part_box.y_gap(neighbor_box) < kYGapTh) {
      const int left_gap = part_box.left() - neighbor_box.left();
      const int right_gap = neighbor_box.right() - part_box.right();
      if (left_gap > kXGapTh) {
        left_indented = true;
      }
      if (right_gap > kXGapTh) {
        right_indented = true;
      }
    }
  }

  if (left_indented && right_indented) {
    return BOTH_INDENT;
  }
  if (left_indented) {
    return LEFT_INDENT;
  }
  if (right_indented) {
    return RIGHT_INDENT;
  }
  return NO_INDENT;
}
+
// Grows seed by absorbing neighboring partitions in all four directions
// plus any partitions that overlap it. Returns true if anything was
// merged. The merged partitions have already been removed from part_grid_
// by the helper searches; seed itself is removed before absorbing (its
// box grows) and the caller is expected to re-insert it.
bool EquationDetect::ExpandSeed(ColPartition* seed) {
  if (seed == nullptr ||  // This seed has been absorbed by other seeds.
      seed->IsVerticalType()) {  // We skip vertical type right now.
    return false;
  }

  // Expand in four directions.
  GenericVector<ColPartition*> parts_to_merge;
  ExpandSeedHorizontal(true, seed, &parts_to_merge);
  ExpandSeedHorizontal(false, seed, &parts_to_merge);
  ExpandSeedVertical(true, seed, &parts_to_merge);
  ExpandSeedVertical(false, seed, &parts_to_merge);
  SearchByOverlap(seed, &parts_to_merge);

  if (parts_to_merge.empty()) {  // We don't find any partition to merge.
    return false;
  }

  // Merge all partitions in parts_to_merge with seed. We first remove seed
  // from part_grid_ as its bounding box is going to expand. Then we add it
  // back after it absorbs all parts_to_merge partitions.
  part_grid_->RemoveBBox(seed);
  for (int i = 0; i < parts_to_merge.size(); ++i) {
    ColPartition* part = parts_to_merge[i];
    if (part->type() == PT_EQUATION) {
      // If part is in cp_seeds_, then we mark it as nullptr so that we won't
      // process it again.
      for (int j = 0; j < cp_seeds_.size(); ++j) {
        if (part == cp_seeds_[j]) {
          cp_seeds_[j] = nullptr;
          break;
        }
      }
    }

    // part has already been removed from part_grid_ in function
    // ExpandSeedHorizontal/ExpandSeedVertical.
    seed->Absorb(part, nullptr);
  }

  return true;
}
+
// Searches to the left or right of seed for partitions that can be merged
// into it: nearby equation partitions with sufficient y overlap, or small
// near neighbors with math-like blob density. Matching partitions are
// removed from part_grid_ and appended to parts_to_merge.
void EquationDetect::ExpandSeedHorizontal(
    const bool search_left,
    ColPartition* seed,
    GenericVector<ColPartition*>* parts_to_merge) {
  ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr);
  const float kYOverlapTh = 0.6;
  // Maximum horizontal gap: 0.2 inch at the image resolution.
  const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));

  ColPartitionGridSearch search(part_grid_);
  const TBOX& seed_box(seed->bounding_box());
  const int x = search_left ? seed_box.left() : seed_box.right();
  search.StartSideSearch(x, seed_box.bottom(), seed_box.top());
  search.SetUniqueMode(true);

  // Search iteratively.
  ColPartition *part = nullptr;
  while ((part = search.NextSideSearch(search_left)) != nullptr) {
    if (part == seed) {
      continue;
    }
    const TBOX& part_box(part->bounding_box());
    if (part_box.x_gap(seed_box) > kXGapTh) {  // Out of scope.
      break;
    }

    // Check part location: it must lie on the searched side of seed.
    if ((part_box.left() >= seed_box.left() && search_left) ||
        (part_box.right() <= seed_box.right() && !search_left)) {
      continue;
    }

    if (part->type() != PT_EQUATION) {  // Non-equation type.
      // Skip PT_INLINE_EQUATION and non text type.
      if (part->type() == PT_INLINE_EQUATION ||
          (!IsTextOrEquationType(part->type()) &&
           part->blob_type() != BRT_HLINE)) {
        continue;
      }
      // For other types, it should be the near small neighbor of seed.
      if (!IsNearSmallNeighbor(seed_box, part_box) ||
          !CheckSeedNeighborDensity(part)) {
        continue;
      }
    } else {  // Equation type, check the y overlap.
      if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&
          seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {
        continue;
      }
    }

    // Passed the check, delete it from search and add into parts_to_merge.
    search.RemoveBBox();
    parts_to_merge->push_back(part);
  }
}
+
// Searches above or below seed for partitions that can be merged into it,
// analogous to ExpandSeedHorizontal but over the full width of
// cps_super_bbox_. Candidates are collected first, then filtered so that a
// candidate separated from seed by a skipped (rejected) partition is not
// merged across it. Accepted partitions are removed from part_grid_ and
// appended to parts_to_merge.
void EquationDetect::ExpandSeedVertical(
    const bool search_bottom,
    ColPartition* seed,
    GenericVector<ColPartition*>* parts_to_merge) {
  ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr &&
              cps_super_bbox_ != nullptr);
  const float kXOverlapTh = 0.4;
  // Maximum vertical gap: 0.2 inch at the image resolution.
  const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));

  ColPartitionGridSearch search(part_grid_);
  const TBOX& seed_box(seed->bounding_box());
  const int y = search_bottom ? seed_box.bottom() : seed_box.top();
  search.StartVerticalSearch(
      cps_super_bbox_->left(), cps_super_bbox_->right(), y);
  search.SetUniqueMode(true);

  // Search iteratively.
  ColPartition *part = nullptr;
  GenericVector<ColPartition*> parts;
  // Extremes of the skipped partitions, used for the blocking check below.
  int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1;
  while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) {
    if (part == seed) {
      continue;
    }
    const TBOX& part_box(part->bounding_box());

    if (part_box.y_gap(seed_box) > kYGapTh) {  // Out of scope.
      break;
    }

    // Check part location: it must lie on the searched side of seed.
    if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||
        (part_box.top() <= seed_box.top() && !search_bottom)) {
      continue;
    }

    bool skip_part = false;
    if (part->type() != PT_EQUATION) {  // Non-equation type.
      // Skip PT_INLINE_EQUATION and non text type.
      if (part->type() == PT_INLINE_EQUATION ||
          (!IsTextOrEquationType(part->type()) &&
           part->blob_type() != BRT_HLINE)) {
        skip_part = true;
      } else if (!IsNearSmallNeighbor(seed_box, part_box) ||
                 !CheckSeedNeighborDensity(part)) {
        // For other types, it should be the near small neighbor of seed.
        skip_part = true;
      }
    } else {  // Equation type, check the x overlap.
      if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&
          seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {
        skip_part = true;
      }
    }
    if (skip_part) {
      // Remember how far the skipped non-equation partitions extend, so
      // that candidates lying beyond them are rejected below.
      if (part->type() != PT_EQUATION) {
        if (skipped_min_top > part_box.top()) {
          skipped_min_top = part_box.top();
        }
        if (skipped_max_bottom < part_box.bottom()) {
          skipped_max_bottom = part_box.bottom();
        }
      }
    } else {
      parts.push_back(part);
    }
  }

  // For every part in parts, we need verify it is not above skipped_min_top
  // when search top, or not below skipped_max_bottom when search bottom. I.e.,
  // we will skip a part if it looks like:
  //   search bottom      |   search top
  // seed:    ******************   | part:    **********
  // skipped: xxx        |         skipped: xxx
  // part:    **********  |         seed:    ***********
  for (int i = 0; i < parts.size(); i++) {
    const TBOX& part_box(parts[i]->bounding_box());
    if ((search_bottom && part_box.top() <= skipped_max_bottom) ||
        (!search_bottom && part_box.bottom() >= skipped_min_top)) {
      continue;
    }
    // Add parts[i] into parts_to_merge, and delete it from part_grid_.
    parts_to_merge->push_back(parts[i]);
    part_grid_->RemoveBBox(parts[i]);
  }
}
+
+bool EquationDetect::IsNearSmallNeighbor(const TBOX& seed_box,
+ const TBOX& part_box) const {
+ const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
+ const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
+
+ // Check geometric feature.
+ if (part_box.height() > seed_box.height() ||
+ part_box.width() > seed_box.width()) {
+ return false;
+ }
+
+ // Check overlap and distance.
+ if ((!part_box.major_x_overlap(seed_box) ||
+ part_box.y_gap(seed_box) > kYGapTh) &&
+ (!part_box.major_y_overlap(seed_box) ||
+ part_box.x_gap(seed_box) > kXGapTh)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool EquationDetect::CheckSeedNeighborDensity(const ColPartition* part) const {
+ ASSERT_HOST(part);
+ if (part->boxes_count() < kSeedBlobsCountTh) {
+ // Too few blobs, skip the check.
+ return true;
+ }
+
+ // We check the math blobs density and the unclear blobs density.
+ if (part->SpecialBlobsDensity(BSTT_MATH) +
+ part->SpecialBlobsDensity(BSTT_DIGIT) > kMathDigitDensityTh1 ||
+ part->SpecialBlobsDensity(BSTT_UNCLEAR) > kUnclearDensityTh) {
+ return true;
+ }
+
+ return false;
+}
+
// Finds small text partitions that are "satellites" of equation blocks
// (e.g. a short trailing line of a displayed equation) and merges them
// into their neighboring PT_EQUATION partitions. Only text parts no taller
// than the median text height are considered.
void EquationDetect::ProcessMathBlockSatelliteParts() {
  // Iterate over part_grid_, and find all parts that are text type but not
  // equation type.
  ColPartition *part = nullptr;
  GenericVector<ColPartition*> text_parts;
  ColPartitionGridSearch gsearch(part_grid_);
  gsearch.StartFullSearch();
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) {
      text_parts.push_back(part);
    }
  }
  if (text_parts.empty()) {
    return;
  }

  // Compute the medium height of the text_parts.
  text_parts.sort(&SortCPByHeight);
  const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box();
  int med_height = text_box.height();
  if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
    // Even count: average the two middle heights.
    const TBOX& text_box =
        text_parts[text_parts.size() / 2 - 1]->bounding_box();
    med_height = static_cast<int>(roundf(
        0.5 * (text_box.height() + med_height)));
  }

  // Iterate every text_parts and check if it is a math block satellite.
  for (int i = 0; i < text_parts.size(); ++i) {
    const TBOX& text_box(text_parts[i]->bounding_box());
    if (text_box.height() > med_height) {
      continue;
    }
    GenericVector<ColPartition*> math_blocks;
    if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
      continue;
    }

    // Found. merge text_parts[i] with math_blocks.
    part_grid_->RemoveBBox(text_parts[i]);
    text_parts[i]->set_type(PT_EQUATION);
    for (int j = 0; j < math_blocks.size(); ++j) {
      part_grid_->RemoveBBox(math_blocks[j]);
      text_parts[i]->Absorb(math_blocks[j], nullptr);
    }
    // Re-insert the expanded partition with recomputed attributes.
    InsertPartAfterAbsorb(text_parts[i]);
  }
}
+
// Returns true if part is a satellite of one or two equation blocks: its
// nearest vertical neighbors are found above and below, part must lie
// within their horizontal extent, and at least the closer neighbor must be
// a near equation (IsNearMathNeighbor). The qualifying equation neighbors
// are returned in math_blocks.
bool EquationDetect::IsMathBlockSatellite(
    ColPartition* part, GenericVector<ColPartition*>* math_blocks) {
  ASSERT_HOST(part != nullptr && math_blocks != nullptr);
  math_blocks->clear();
  const TBOX& part_box(part->bounding_box());
  // Find the top/bottom nearest neighbor of part.
  ColPartition *neighbors[2];
  int y_gaps[2] = {std::numeric_limits<int>::max(), std::numeric_limits<int>::max()};
  // The horizontal boundary of the neighbors.
  int neighbors_left = std::numeric_limits<int>::max(), neighbors_right = 0;
  for (int i = 0; i < 2; ++i) {
    // i == 0 searches the top neighbor, i == 1 the bottom one.
    neighbors[i] = SearchNNVertical(i != 0, part);
    if (neighbors[i]) {
      const TBOX& neighbor_box = neighbors[i]->bounding_box();
      y_gaps[i] = neighbor_box.y_gap(part_box);
      if (neighbor_box.left() < neighbors_left) {
        neighbors_left = neighbor_box.left();
      }
      if (neighbor_box.right() > neighbors_right) {
        neighbors_right = neighbor_box.right();
      }
    }
  }
  if (neighbors[0] == neighbors[1]) {
    // This happens when part is inside neighbor.
    neighbors[1] = nullptr;
    y_gaps[1] = std::numeric_limits<int>::max();
  }

  // Check if part is within [neighbors_left, neighbors_right].
  if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {
    return false;
  }

  // Get the index of the near one in neighbors.
  int index = y_gaps[0] < y_gaps[1] ? 0 : 1;

  // Check the near one.
  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
    math_blocks->push_back(neighbors[index]);
  } else {
    // If the near one failed the check, then we skip checking the far one.
    return false;
  }

  // Check the far one.
  index = 1 - index;
  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
    math_blocks->push_back(neighbors[index]);
  }

  return true;
}
+
// Returns the nearest text/equation partition above (search_bottom false)
// or below (search_bottom true) part that has major x overlap with it and
// lies within 0.5 inch vertically, or nullptr if none is found.
ColPartition* EquationDetect::SearchNNVertical(
    const bool search_bottom, const ColPartition* part) {
  ASSERT_HOST(part);
  ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr;
  // Search cut-off: 0.5 inch at the image resolution.
  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));

  ColPartitionGridSearch search(part_grid_);
  search.SetUniqueMode(true);
  const TBOX& part_box(part->bounding_box());
  int y = search_bottom ? part_box.bottom() : part_box.top();
  search.StartVerticalSearch(part_box.left(), part_box.right(), y);
  int min_y_gap = std::numeric_limits<int>::max();
  while ((neighbor = search.NextVerticalSearch(search_bottom)) != nullptr) {
    if (neighbor == part || !IsTextOrEquationType(neighbor->type())) {
      continue;
    }
    const TBOX& neighbor_box(neighbor->bounding_box());
    int y_gap = neighbor_box.y_gap(part_box);
    if (y_gap > kYGapTh) {  // Out of scope.
      break;
    }
    // Require major x overlap and that neighbor lies strictly on the
    // searched side of part.
    if (!neighbor_box.major_x_overlap(part_box) ||
        (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||
        (!search_bottom && neighbor_box.top() < part_box.top())) {
      continue;
    }
    if (y_gap < min_y_gap) {
      // Keep the closest qualifying neighbor seen so far.
      min_y_gap = y_gap;
      nearest_neighbor = neighbor;
    }
  }

  return nearest_neighbor;
}
+
+bool EquationDetect::IsNearMathNeighbor(
+ const int y_gap, const ColPartition *neighbor) const {
+ if (!neighbor) {
+ return false;
+ }
+ const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
+ return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
+}
+
// Builds an output image name of the form
// "<imagebasename><4-digit page number><name>.tif" into image_name.
void EquationDetect::GetOutputTiffName(const char* name,
                                       STRING* image_name) const {
  ASSERT_HOST(image_name && name);
  char page[50];
  snprintf(page, sizeof(page), "%04d", page_count_);
  *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
}
+
// Debug output: renders the special text type of every blob in every
// partition onto a 32-bit copy of the binary image and writes it to
// outfile as TIFF.
void EquationDetect::PaintSpecialTexts(const STRING& outfile) const {
  Pix *pix = nullptr, *pixBi = lang_tesseract_->pix_binary();
  pix = pixConvertTo32(pixBi);
  ColPartitionGridSearch gsearch(part_grid_);
  ColPartition* part = nullptr;
  gsearch.StartFullSearch();
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    BLOBNBOX_C_IT blob_it(part->boxes());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      RenderSpecialText(pix, blob_it.data());
    }
  }

  pixWrite(outfile.c_str(), pix, IFF_TIFF_LZW);
  pixDestroy(&pix);
}
+
// Debug output: draws each partition's bounding box onto a 32-bit copy of
// the best image (red = equation, green = inline equation, blue = other)
// and writes the result to outfile as TIFF.
void EquationDetect::PaintColParts(const STRING& outfile) const {
  Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
  ColPartitionGridSearch gsearch(part_grid_);
  gsearch.StartFullSearch();
  ColPartition* part = nullptr;
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    const TBOX& tbox = part->bounding_box();
    // Flip y for leptonica's top-left origin.
    Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(),
                         tbox.width(), tbox.height());
    if (part->type() == PT_EQUATION) {
      pixRenderBoxArb(pix, box, 5, 255, 0, 0);
    } else if (part->type() == PT_INLINE_EQUATION) {
      pixRenderBoxArb(pix, box, 5, 0, 255, 0);
    } else {
      pixRenderBoxArb(pix, box, 5, 0, 0, 255);
    }
    boxDestroy(&box);
  }

  pixWrite(outfile.c_str(), pix, IFF_TIFF_LZW);
  pixDestroy(&pix);
}
+
+void EquationDetect::PrintSpecialBlobsDensity(const ColPartition* part) const {
+ ASSERT_HOST(part);
+ TBOX box(part->bounding_box());
+ int h = pixGetHeight(lang_tesseract_->BestPix());
+ tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ",
+ h - box.top(), h - box.bottom());
+ box.print();
+ tprintf("blobs count = %d, density = ", part->boxes_count());
+ for (int i = 0; i < BSTT_COUNT; ++i) {
+ auto type = static_cast<BlobSpecialTextType>(i);
+ tprintf("%d:%f ", i, part->SpecialBlobsDensity(type));
+ }
+ tprintf("\n");
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/equationdetect.h b/tesseract/src/ccmain/equationdetect.h
new file mode 100644
index 00000000..ffa418fe
--- /dev/null
+++ b/tesseract/src/ccmain/equationdetect.h
@@ -0,0 +1,273 @@
+///////////////////////////////////////////////////////////////////////
+// File: equationdetect.h
+// Description: The equation detection class that inherits equationdetectbase.
+// Author: Zongyi (Joe) Liu (joeliu@google.com)
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_
+#define TESSERACT_CCMAIN_EQUATIONDETECT_H_
+
+#include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText...
+#include "equationdetectbase.h" // for EquationDetectBase
+#include "genericvector.h" // for GenericVector
+#include "tesseractclass.h" // for Tesseract
+#include <tesseract/unichar.h> // for UNICHAR_ID
+
+class TBOX;
+class UNICHARSET;
+
+namespace tesseract {
+
+class Tesseract;
+class ColPartition;
+class ColPartitionGrid;
+class ColPartitionSet;
+
+class TESS_API EquationDetect : public EquationDetectBase {
+ public:
+  EquationDetect(const char* equ_datapath,
+                 const char* equ_language);
+  ~EquationDetect() override;
+
+  // Classification of a partition's horizontal indentation relative to the
+  // surrounding text column.
+  enum IndentType {
+    NO_INDENT,
+    LEFT_INDENT,
+    RIGHT_INDENT,
+    BOTH_INDENT,
+    INDENT_TYPE_COUNT
+  };
+
+  // Reset the lang_tesseract_ pointer. This function should be called before we
+  // do any detector work.
+  void SetLangTesseract(Tesseract* lang_tesseract);
+
+  // Iterate over the blobs inside to_block, and set the blobs that we want to
+  // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function
+  // returns 0 upon success.
+  int LabelSpecialText(TO_BLOCK* to_block) override;
+
+  // Find possible equation partitions from part_grid. Should be called
+  // after the special_text_type of blobs are set.
+  // It returns 0 upon success.
+  int FindEquationParts(ColPartitionGrid* part_grid,
+                        ColPartitionSet** best_columns) override;
+
+  // Reset the resolution of the processing image. TEST only function.
+  void SetResolution(const int resolution);
+
+ protected:
+  // Identify the special text type for one blob, and update its field. When
+  // height_th is set (> 0), we will label the blob as BSTT_NONE if its height
+  // is less than height_th.
+  void IdentifySpecialText(BLOBNBOX *blob, const int height_th);
+
+  // Estimate the type for one unichar.
+  BlobSpecialTextType EstimateTypeForUnichar(
+      const UNICHARSET& unicharset, const UNICHAR_ID id) const;
+
+  // Compute special text type for each blobs in part_grid_.
+  void IdentifySpecialText();
+
+  // Identify blobs that we want to skip during special blob type
+  // classification.
+  void IdentifyBlobsToSkip(ColPartition* part);
+
+  // The ColPartitions in part_grid_ may be over-segmented, particularly in the
+  // block equation regions. So we like to identify these partitions and merge
+  // them before we do the searching.
+  void MergePartsByLocation();
+
+  // Starting from the seed center, we do radius search. And for partitions that
+  // have large overlaps with seed, we remove them from part_grid_ and add into
+  // parts_overlap. Note: this function may update the part_grid_, so if the
+  // caller is also running ColPartitionGridSearch, use the RepositionIterator
+  // to continue.
+  void SearchByOverlap(ColPartition* seed,
+                       GenericVector<ColPartition*>* parts_overlap);
+
+  // Insert part back into part_grid_, after it absorbs some other parts.
+  void InsertPartAfterAbsorb(ColPartition* part);
+
+  // Identify the colpartitions in part_grid_, label them as PT_EQUATION, and
+  // save them into cp_seeds_.
+  void IdentifySeedParts();
+
+  // Check the blobs count for a seed region candidate.
+  bool CheckSeedBlobsCount(ColPartition* part);
+
+  // Compute the foreground pixel density for a tbox area.
+  float ComputeForegroundDensity(const TBOX& tbox);
+
+  // Check if part qualifies for the seed2 label: low math density and left
+  // indented. We are using two checks:
+  // 1. If its left is aligned with any coordinates in indented_texts_left,
+  // which we assume have been sorted.
+  // 2. If its foreground density is over foreground_density_th.
+  bool CheckForSeed2(
+      const GenericVector<int>& indented_texts_left,
+      const float foreground_density_th,
+      ColPartition* part);
+
+  // Count the number of values in sorted_vec that is close to val, used to
+  // check if a partition is aligned with text partitions.
+  int CountAlignment(
+      const GenericVector<int>& sorted_vec, const int val) const;
+
+  // Check for a seed candidate using the foreground pixel density. And we
+  // return true if the density is below a certain threshold, because characters
+  // in equation regions usually are apart with more white spaces.
+  bool CheckSeedFgDensity(const float density_th, ColPartition* part);
+
+  // A light version of SplitCPHor: instead of really doing the part split, we
+  // simply compute the union bounding box of each split part.
+  void SplitCPHorLite(ColPartition* part, GenericVector<TBOX>* splitted_boxes);
+
+  // Split the part (horizontally), and save the split result into
+  // parts_splitted. Note that it is caller's responsibility to release the
+  // memory owned by parts_splitted. On the other hand, the part is unchanged
+  // during this process and still owns the blobs, so do NOT call DeleteBoxes
+  // when freeing the colpartitions in parts_splitted.
+  void SplitCPHor(ColPartition* part,
+                  GenericVector<ColPartition*>* parts_splitted);
+
+  // Check the density for a seed candidate (part) using its math density and
+  // italic density, returns true if the check passed.
+  bool CheckSeedDensity(const float math_density_high,
+                        const float math_density_low,
+                        const ColPartition* part) const;
+
+  // Check if part is indented.
+  IndentType IsIndented(ColPartition* part);
+
+  // Identify inline partitions from cp_seeds_, and re-label them.
+  void IdentifyInlineParts();
+
+  // Compute the super bounding box for all colpartitions inside part_grid_.
+  void ComputeCPsSuperBBox();
+
+  // Identify inline partitions from cp_seeds_ using the horizontal search.
+  void IdentifyInlinePartsHorizontal();
+
+  // Estimate the line spacing between two text partitions. Returns -1 if not
+  // enough data.
+  int EstimateTextPartLineSpacing();
+
+  // Identify inline partitions from cp_seeds_ using vertical search.
+  void IdentifyInlinePartsVertical(const bool top_to_bottom,
+                                   const int textPartsLineSpacing);
+
+  // Check if part is an inline equation zone. This should be called after we
+  // identified the seed regions.
+  bool IsInline(const bool search_bottom,
+                const int textPartsLineSpacing,
+                ColPartition* part);
+
+  // For a given seed partition, we search the part_grid_ and see if there is
+  // any partition can be merged with it. It returns true if the seed has been
+  // expanded.
+  bool ExpandSeed(ColPartition* seed);
+
+  // Starting from the seed position, we search the part_grid_
+  // horizontally/vertically, find all partitions that can be
+  // merged with seed, remove them from part_grid_, and put them into
+  // parts_to_merge.
+  void ExpandSeedHorizontal(const bool search_left,
+                            ColPartition* seed,
+                            GenericVector<ColPartition*>* parts_to_merge);
+  void ExpandSeedVertical(const bool search_bottom,
+                          ColPartition* seed,
+                          GenericVector<ColPartition*>* parts_to_merge);
+
+  // Check if a part_box is the small neighbor of seed_box.
+  bool IsNearSmallNeighbor(const TBOX& seed_box,
+                           const TBOX& part_box) const;
+
+  // Perform the density check for part, which we assume is nearing a seed
+  // partition. It returns true if the check passed.
+  bool CheckSeedNeighborDensity(const ColPartition* part) const;
+
+  // After identifying the math blocks, we do one more scanning on all text
+  // partitions, and check if any of them is the satellite of:
+  // math blocks: here a p is the satellite of q if:
+  // 1. q is the nearest vertical neighbor of p, and
+  // 2. y_gap(p, q) is less than a threshold, and
+  // 3. x_overlap(p, q) is over a threshold.
+  // Note that p can be the satellites of two blocks: its top neighbor and
+  // bottom neighbor.
+  void ProcessMathBlockSatelliteParts();
+
+  // Check if part is the satellite of one/two math blocks. If it is, we return
+  // true, and save the blocks into math_blocks.
+  bool IsMathBlockSatellite(
+      ColPartition* part, GenericVector<ColPartition*>* math_blocks);
+
+  // Search the nearest neighbor of part in one vertical direction as defined in
+  // search_bottom. It returns the neighbor found that major x overlap with it,
+  // or nullptr when not found.
+  ColPartition* SearchNNVertical(const bool search_bottom,
+                                 const ColPartition* part);
+
+  // Check if the neighbor with vertical distance of y_gap is a near and math
+  // block partition.
+  bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
+
+  // Generate the tiff file name for output/debug file.
+  void GetOutputTiffName(const char* name, STRING* image_name) const;
+
+  // Debugger function that renders ColPartitions on the input image, where:
+  // parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
+  // will be painted in green, and other parts will be painted in blue.
+  void PaintColParts(const STRING& outfile) const;
+
+  // Debugger function that renders the blobs in part_grid_ over the input
+  // image.
+  void PaintSpecialTexts(const STRING& outfile) const;
+
+  // Debugger function that print the math blobs density values for a
+  // ColPartition object.
+  void PrintSpecialBlobsDensity(const ColPartition* part) const;
+
+  // The tesseract engine initialized from equation training data.
+  Tesseract equ_tesseract_;
+
+  // The tesseract engine used for OCR. This pointer is passed in by the caller,
+  // so do NOT destroy it in this class.
+  Tesseract* lang_tesseract_;
+
+  // The ColPartitionGrid that we are processing. This pointer is passed in from
+  // the caller, so do NOT destroy it in the class.
+  ColPartitionGrid* part_grid_ = nullptr;
+
+  // A simple array of pointers to the best assigned column division at
+  // each grid y coordinate. This pointer is passed in from the caller, so do
+  // NOT destroy it in the class.
+  ColPartitionSet** best_columns_ = nullptr;
+
+  // The super bounding box of all cps in the part_grid_.
+  // NOTE(review): no in-class initializer, unlike part_grid_/best_columns_ —
+  // presumably set in the constructor; confirm before relying on it.
+  TBOX* cps_super_bbox_;
+
+  // The seed ColPartition for equation region.
+  GenericVector<ColPartition*> cp_seeds_;
+
+  // The resolution (dpi) of the processing image.
+  // NOTE(review): not initialized in-class — presumably set in the
+  // constructor or via SetResolution(); confirm.
+  int resolution_;
+
+  // The number of pages we have processed.
+  // NOTE(review): not initialized in-class — presumably set in the
+  // constructor; confirm.
+  int page_count_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H_
diff --git a/tesseract/src/ccmain/fixspace.cpp b/tesseract/src/ccmain/fixspace.cpp
new file mode 100644
index 00000000..c15e99d3
--- /dev/null
+++ b/tesseract/src/ccmain/fixspace.cpp
@@ -0,0 +1,885 @@
+/******************************************************************
+ * File: fixspace.cpp (Formerly fixspace.c)
+ * Description: Implements a pass over the page res, exploring the alternative
+ * spacing possibilities, trying to use context to improve the
+ * word spacing
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "fixspace.h"
+
+#include "blobs.h" // for TWERD, TBLOB, TESSLINE
+#include "boxword.h" // for BoxWord
+#include "errcode.h" // for ASSERT_HOST
+#include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
+#include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
+#include "params.h" // for IntParam, StringParam, BoolParam, Doub...
+#include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
+#include "rect.h" // for TBOX
+#include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
+#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
+#include "tessvars.h" // for debug_fp
+#include "tprintf.h" // for tprintf
+#include "unicharset.h" // for UNICHARSET
+#include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
+
+#include <tesseract/ocrclass.h> // for ETEXT_DESC
+#include "strngs.h" // for STRING
+#include <tesseract/unichar.h> // for UNICHAR_ID
+
+#include <cstdint> // for INT16_MAX, int16_t, int32_t
+
+namespace tesseract {
+
+class BLOCK;
+class ROW;
+
+#define PERFECT_WERDS 999
+
+/**********************************************************************
+ * c_blob_comparator()
+ *
+ * Blob comparator used to sort a blob list so that blobs are in increasing
+ * order of left edge.
+ **********************************************************************/
+
+static int c_blob_comparator( // sort blobs
+    const void *blob1p,       // ptr to ptr to blob1
+    const void *blob2p        // ptr to ptr to blob2
+    ) {
+  // Order C_BLOBs by ascending left edge of their bounding boxes.
+  const C_BLOB *lhs = *reinterpret_cast<const C_BLOB* const*>(blob1p);
+  const C_BLOB *rhs = *reinterpret_cast<const C_BLOB* const*>(blob2p);
+  return lhs->bounding_box().left() - rhs->bounding_box().left();
+}
+
+/**
+ * @name fix_fuzzy_spaces()
+ * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
+ * them as a sublist, process the sublist to find the optimal arrangement of
+ * spaces then replace the sublist in the ROW_RES.
+ *
+ * @param monitor progress monitor
+ * @param word_count count of words in doc
+ * @param[out] page_res
+ */
+void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
+                                 int32_t word_count,
+                                 PAGE_RES *page_res) {
+  BLOCK_RES_IT block_res_it;
+  ROW_RES_IT row_res_it;
+  WERD_RES_IT word_res_it_from;
+  WERD_RES_IT word_res_it_to;
+  WERD_RES *word_res;
+  WERD_RES_LIST fuzzy_space_words;
+  int16_t new_length;
+  bool prevent_null_wd_fixsp;  // DON'T process blobless wds
+  int32_t word_index;          // current word
+
+  block_res_it.set_to_list(&page_res->block_res_list);
+  word_index = 0;
+  // Walk every row of every block on the page.
+  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
+       block_res_it.forward()) {
+    row_res_it.set_to_list(&block_res_it.data()->row_res_list);
+    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
+         row_res_it.forward()) {
+      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
+      while (!word_res_it_from.at_last()) {
+        word_res = word_res_it_from.data();
+        // Skip forward over words NOT followed by a fuzzy space/non-space,
+        // applying the noise-blob split fix to each word as we pass it.
+        while (!word_res_it_from.at_last() &&
+               !(word_res->combination ||
+                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
+                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
+          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
+                         block_res_it.data()->block);
+          word_res = word_res_it_from.forward();
+          word_index++;
+          // Keep the progress monitor alive and honor deadline/cancellation.
+          if (monitor != nullptr) {
+            monitor->ocr_alive = true;
+            monitor->progress = 90 + 5 * word_index / word_count;
+            if (monitor->deadline_exceeded() ||
+                (monitor->cancel != nullptr &&
+                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
+              return;
+          }
+        }
+
+        if (!word_res_it_from.at_last()) {
+          // word_res_it_from now sits on the first word of a fuzzy-joined
+          // sequence; advance word_res_it_to to cover the whole sequence.
+          word_res_it_to = word_res_it_from;
+          prevent_null_wd_fixsp =
+              word_res->word->cblob_list()->empty();
+          if (check_debug_pt(word_res, 60))
+            debug_fix_space_level.set_value(10);
+          word_res_it_to.forward();
+          word_index++;
+          if (monitor != nullptr) {
+            monitor->ocr_alive = true;
+            monitor->progress = 90 + 5 * word_index / word_count;
+            if (monitor->deadline_exceeded() ||
+                (monitor->cancel != nullptr &&
+                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
+              return;
+          }
+          while (!word_res_it_to.at_last () &&
+                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
+                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
+            if (check_debug_pt(word_res, 60))
+              debug_fix_space_level.set_value(10);
+            // A blobless word anywhere in the sequence disables respacing.
+            if (word_res->word->cblob_list()->empty())
+              prevent_null_wd_fixsp = true;
+            word_res = word_res_it_to.forward();
+          }
+          if (check_debug_pt(word_res, 60))
+            debug_fix_space_level.set_value(10);
+          if (word_res->word->cblob_list()->empty())
+            prevent_null_wd_fixsp = true;
+          if (prevent_null_wd_fixsp) {
+            word_res_it_from = word_res_it_to;
+          } else {
+            // Extract the fuzzy sequence into a sublist, find its best
+            // spacing, then splice the (possibly re-arranged) words back in
+            // and step the iterator past them.
+            fuzzy_space_words.assign_to_sublist(&word_res_it_from,
+                                                &word_res_it_to);
+            fix_fuzzy_space_list(fuzzy_space_words,
+                                 row_res_it.data()->row,
+                                 block_res_it.data()->block);
+            new_length = fuzzy_space_words.length();
+            word_res_it_from.add_list_before(&fuzzy_space_words);
+            for (;
+                 !word_res_it_from.at_last() && new_length > 0;
+                 new_length--) {
+              word_res_it_from.forward();
+            }
+          }
+          if (test_pt)
+            debug_fix_space_level.set_value(0);
+        }
+        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
+                       block_res_it.data()->block);
+        // Last word in row
+      }
+    }
+  }
+}
+
+void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
+                                     ROW *row,
+                                     BLOCK* block) {
+  // Hill-climb over the space permutations of best_perm: repeatedly
+  // re-classify the current permutation, score its word spacing, keep the
+  // best-scoring arrangement seen so far, and advance to the next
+  // permutation until the score is perfect or no permutations remain.
+  WERD_RES_LIST current_perm;
+  bool improved = false;
+
+  int16_t best_score = eval_word_spacing(best_perm);  // default score
+  dump_words(best_perm, best_score, 1, improved);
+
+  if (best_score != PERFECT_WERDS)
+    initialise_search(best_perm, current_perm);
+
+  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
+    match_current_words(current_perm, row, block);
+    const int16_t trial_score = eval_word_spacing(current_perm);
+    dump_words(current_perm, trial_score, 2, improved);
+    if (trial_score > best_score) {
+      best_score = trial_score;
+      improved = true;
+      best_perm.clear();
+      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
+    }
+    if (trial_score < PERFECT_WERDS)
+      transform_to_next_perm(current_perm);
+  }
+  dump_words(best_perm, best_score, 3, improved);
+}
+
+void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
+  // Seed new_list with deep copies of every non-combination word in
+  // src_list; the copies are plain words (combination and part_of_combo
+  // both cleared).
+  WERD_RES_IT src_it(&src_list);
+  WERD_RES_IT new_it(&new_list);
+
+  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
+    WERD_RES *src_wd = src_it.data();
+    if (src_wd->combination)
+      continue;  // combinations are not copied
+    WERD_RES *copied_wd = WERD_RES::deep_copy(src_wd);
+    copied_wd->combination = false;
+    copied_wd->part_of_combo = false;
+    new_it.add_after_then_move(copied_wd);
+  }
+}
+
+void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
+                                    BLOCK* block) {
+  // Run pass-2 classification on every word in the list that still needs
+  // it: members of a combination are skipped, and a non-null box_word marks
+  // a word as already classified.
+  WERD_RES_IT word_it(&words);
+  WERD_RES *word;
+  // Since we are not using PAGE_RES to iterate over words, we need to update
+  // prev_word_best_choice_ before calling classify_word_pass2().
+  prev_word_best_choice_ = nullptr;
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    word = word_it.data();
+    if ((!word->part_of_combo) && (word->box_word == nullptr)) {
+      WordData word_data(block, row, word);
+      SetupWordPassN(2, &word_data);
+      classify_word_and_language(2, nullptr, &word_data);
+    }
+    prev_word_best_choice_ = word->best_choice;
+  }
+}
+
+/**
+ * @name eval_word_spacing()
+ * The basic measure is the number of characters in contextually confirmed
+ * words. (I.e the word is done)
+ * If all words are contextually confirmed the evaluation is deemed perfect.
+ *
+ * Some fiddles are done to handle "1"s as these are VERY frequent causes of
+ * fuzzy spaces. The problem with the basic measure is that "561 63" would score
+ * the same as "56163", though given our knowledge that the space is fuzzy, and
+ * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
+ * is preferred.
+ *
+ * The solution is to NOT COUNT the score of any word which has a digit at one
+ * end and a "1Il" as the character the other side of the space.
+ *
+ * Conversely, any character next to a "1" within a word is counted as a positive
+ * score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of
+ * the "1" joined). "56163" would score 7 - all chars in a numeric word + 2
+ * sides of a "1" joined.
+ *
+ * The joined 1 rule is applied to any word REGARDLESS of contextual
+ * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually
+ * confirmed). The only score is from the joined 1. "PS7a713/7a" scores 2.
+ *
+ */
+int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
+  WERD_RES_IT word_res_it(&word_res_list);
+  int16_t total_score = 0;
+  int16_t word_count = 0;
+  int16_t done_word_count = 0;
+  int16_t word_len;
+  int16_t i;
+  int16_t offset;
+  WERD_RES *word;                 // current word
+  int16_t prev_word_score = 0;
+  bool prev_word_done = false;
+  bool prev_char_1 = false;       // prev ch a "1/I/l"?
+  bool prev_char_digit = false;   // prev ch 2..9 or 0
+  bool current_char_1 = false;
+  bool current_word_ok_so_far;
+  STRING punct_chars = "!\"`',.:;";
+  bool prev_char_punct = false;
+  bool current_char_punct = false;
+  bool word_done = false;
+
+  // The list is circular: iterate until we wrap back to the first word,
+  // skipping members of combinations. A word's score is only banked when
+  // the NEXT word is examined (or at the end), so the digit/"1" adjacency
+  // rules below can veto it.
+  do {
+    word = word_res_it.data();
+    word_done = fixspace_thinks_word_done(word);
+    word_count++;
+    if (word->tess_failed) {
+      // A failed word breaks the chain: bank the previous word's score and
+      // reset the digit/1 context.
+      total_score += prev_word_score;
+      if (prev_word_done)
+        done_word_count++;
+      prev_word_score = 0;
+      prev_char_1 = false;
+      prev_char_digit = false;
+      prev_word_done = false;
+    } else {
+      /*
+        Can we add the prev word score and potentially count this word?
+        Yes IF it didn't end in a 1 when the first char of this word is a digit
+        AND it didn't end in a digit when the first char of this word is a 1
+      */
+      word_len = word->reject_map.length();
+      current_word_ok_so_far = false;
+      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
+            (prev_char_digit && (
+                (word_done &&
+                 word->best_choice->unichar_lengths().c_str()[0] == 1 &&
+                 word->best_choice->unichar_string()[0] == '1') ||
+                (!word_done && STRING(conflict_set_I_l_1).contains(
+                    word->best_choice->unichar_string()[0])))))) {
+        total_score += prev_word_score;
+        if (prev_word_done)
+          done_word_count++;
+        current_word_ok_so_far = word_done;
+      }
+
+      if (current_word_ok_so_far) {
+        prev_word_done = true;
+        prev_word_score = word_len;
+      } else {
+        prev_word_done = false;
+        prev_word_score = 0;
+      }
+
+      /* Add 1 to total score for every joined 1 regardless of context and
+         rejection */
+      for (i = 0, prev_char_1 = false; i < word_len; i++) {
+        current_char_1 = word->best_choice->unichar_string()[i] == '1';
+        if (prev_char_1 || (current_char_1 && (i > 0)))
+          total_score++;
+        prev_char_1 = current_char_1;
+      }
+
+      /* Add 1 to total score for every joined punctuation regardless of
+         context and rejection */
+      if (tessedit_prefer_joined_punct) {
+        for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
+             offset += word->best_choice->unichar_lengths()[i++]) {
+          current_char_punct =
+              punct_chars.contains(word->best_choice->unichar_string()[offset]);
+          if (prev_char_punct || (current_char_punct && i > 0))
+            total_score++;
+          prev_char_punct = current_char_punct;
+        }
+      }
+      // Remember whether this word ENDS in a digit / "1"-like char, for the
+      // adjacency veto applied to the next word.
+      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
+      // Empty-bodied loop: advance offset to the byte offset of the last
+      // unichar in the (UTF-8) string.
+      for (i = 0, offset = 0; i < word_len - 1;
+           offset += word->best_choice->unichar_lengths()[i++]);
+      prev_char_1 =
+          ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
+           || (!word_done && STRING(conflict_set_I_l_1).contains(
+                   word->best_choice->unichar_string()[offset])));
+    }
+    /* Find next word */
+    do {
+      word_res_it.forward();
+    } while (word_res_it.data()->part_of_combo);
+  } while (!word_res_it.at_first());
+  // Bank the final word's score.
+  total_score += prev_word_score;
+  if (prev_word_done)
+    done_word_count++;
+  if (done_word_count == word_count)
+    return PERFECT_WERDS;
+  else
+    return total_score;
+}
+
+bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
+  // Returns true if the unichar at char_position is a digit, or — for words
+  // the permuter classified as numeric (NUMBER_PERM) — one of the
+  // numeric_punctuation characters.
+  int i;
+  int offset;
+
+  // Empty-bodied loop: advance offset to the byte offset (in the UTF-8
+  // string) of unichar number char_position.
+  for (i = 0, offset = 0; i < char_position;
+       offset += word->best_choice->unichar_lengths()[i++]);
+  return (
+      word->uch_set->get_isdigit(
+          word->best_choice->unichar_string().c_str() + offset,
+          word->best_choice->unichar_lengths()[i]) ||
+      (word->best_choice->permuter() == NUMBER_PERM &&
+       STRING(numeric_punctuation).contains(
+           word->best_choice->unichar_string().c_str()[offset])));
+}
+
+/**
+ * @name transform_to_next_perm()
+ * Examines the current word list to find the smallest word gap size. Then walks
+ * the word list closing any gaps of this size by either inserted new
+ * combination words, or extending existing ones.
+ *
+ * The routine COULD be limited to stop it building words longer than N blobs.
+ *
+ * If there are no more gaps then it DELETES the entire list and returns the
+ * empty list to cause termination.
+ */
+void transform_to_next_perm(WERD_RES_LIST &words) {
+ WERD_RES_IT word_it(&words);
+ WERD_RES_IT prev_word_it(&words);
+ WERD_RES *word;
+ WERD_RES *prev_word;
+ WERD_RES *combo;
+ WERD *copy_word;
+ int16_t prev_right = -INT16_MAX;
+ TBOX box;
+ int16_t gap;
+ int16_t min_gap = INT16_MAX;
+
+ for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+ word = word_it.data();
+ if (!word->part_of_combo) {
+ box = word->word->bounding_box();
+ if (prev_right > -INT16_MAX) {
+ gap = box.left() - prev_right;
+ if (gap < min_gap)
+ min_gap = gap;
+ }
+ prev_right = box.right();
+ }
+ }
+ if (min_gap < INT16_MAX) {
+ prev_right = -INT16_MAX; // back to start
+ word_it.set_to_list(&words);
+ // Note: we can't use cycle_pt due to inserted combos at start of list.
+ for (; (prev_right == -INT16_MAX) || !word_it.at_first();
+ word_it.forward()) {
+ word = word_it.data();
+ if (!word->part_of_combo) {
+ box = word->word->bounding_box();
+ if (prev_right > -INT16_MAX) {
+ gap = box.left() - prev_right;
+ if (gap <= min_gap) {
+ prev_word = prev_word_it.data();
+ if (prev_word->combination) {
+ combo = prev_word;
+ } else {
+ /* Make a new combination and insert before
+ * the first word being joined. */
+ copy_word = new WERD;
+ *copy_word = *(prev_word->word);
+ // deep copy
+ combo = new WERD_RES(copy_word);
+ combo->combination = true;
+ combo->x_height = prev_word->x_height;
+ prev_word->part_of_combo = true;
+ prev_word_it.add_before_then_move(combo);
+ }
+ combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
+ if (word->combination) {
+ combo->word->join_on(word->word);
+ // Move blobs to combo
+ // old combo no longer needed
+ delete word_it.extract();
+ } else {
+ // Copy current wd to combo
+ combo->copy_on(word);
+ word->part_of_combo = true;
+ }
+ combo->done = false;
+ combo->ClearResults();
+ } else {
+ prev_word_it = word_it; // catch up
+ }
+ }
+ prev_right = box.right();
+ }
+ }
+ } else {
+ words.clear(); // signal termination
+ }
+}
+
+void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
+                           int16_t mode, bool improved) {
+  // Debug dump of a word permutation, gated on debug_fix_space_level.
+  // mode 1 = extracted (also caches the original text in
+  // stats_.dump_words_str), 2 = tested, 3 = returned.
+  if (debug_fix_space_level <= 0) return;
+
+  // Print "text/permuter " for every top-level (non-combo-member) word,
+  // then the closing quote. Extracted from the two identical loops the
+  // original duplicated.
+  auto print_perm = [&perm]() {
+    WERD_RES_IT it(&perm);
+    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+      if (!it.data()->part_of_combo) {
+        tprintf("%s/%1d ",
+                it.data()->best_choice->unichar_string().c_str(),
+                static_cast<int>(it.data()->best_choice->permuter()));
+      }
+    }
+    tprintf("\"\n");
+  };
+
+  if (mode == 1) {
+    // Remember the pre-fix text so an improvement can be reported later.
+    stats_.dump_words_str = "";
+    WERD_RES_IT it(&perm);
+    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+      if (!it.data()->part_of_combo) {
+        stats_.dump_words_str += it.data()->best_choice->unichar_string();
+        stats_.dump_words_str += ' ';
+      }
+    }
+  }
+
+  if (debug_fix_space_level > 1) {
+    switch (mode) {
+      case 1:
+        tprintf("EXTRACTED (%d): \"", score);
+        break;
+      case 2:
+        tprintf("TESTED (%d): \"", score);
+        break;
+      case 3:
+        tprintf("RETURNED (%d): \"", score);
+        break;
+    }
+    print_perm();
+  } else if (improved) {
+    tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
+    print_perm();
+  }
+}
+
+bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
+  // A word is "done" for spacing purposes if Tesseract already accepted it,
+  // or the fixsp_done_mode heuristics below pass: the classification was
+  // accepted (or rejections are permitted by the mode), the text contains
+  // no space, and the permuter is one of the dictionary/number permuters.
+  if (word->done)
+    return true;
+
+  /*
+    Use all the standard pass 2 conditions for mode 5 in set_done() in
+    reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
+    CARE WHETHER WE HAVE of/at on/an etc.
+  */
+  // Idiom fix: return the condition directly instead of if/else true/false.
+  return fixsp_done_mode > 0 &&
+         (word->tess_accepted ||
+          (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
+          fixsp_done_mode == 3) &&
+         strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr &&
+         (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+          word->best_choice->permuter() == FREQ_DAWG_PERM ||
+          word->best_choice->permuter() == USER_DAWG_PERM ||
+          word->best_choice->permuter() == NUMBER_PERM);
+}
+
+
+/**
+ * @name fix_sp_fp_word()
+ * Test the current word to see if it can be split by deleting noise blobs. If
+ * so, do the business.
+ * Return with the iterator pointing to the same place if the word is unchanged,
+ * or the last of the replacement words.
+ */
+void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
+                               BLOCK* block) {
+  WERD_RES *word_res;
+  WERD_RES_LIST sub_word_list;
+  WERD_RES_IT sub_word_list_it(&sub_word_list);
+  int16_t blob_index;
+  int16_t new_length;
+  float junk;  // noise score of the worst blob — unused here
+
+  word_res = word_res_it.data();
+  // Only plain W_DONT_CHOP words are candidates; combinations, combo
+  // members and repeated-character words are left alone.
+  if (word_res->word->flag(W_REP_CHAR) ||
+      word_res->combination ||
+      word_res->part_of_combo ||
+      !word_res->word->flag(W_DONT_CHOP))
+    return;
+
+  // Nothing to do if no blob in the word looks like noise.
+  blob_index = worst_noise_blob(word_res, &junk);
+  if (blob_index < 0)
+    return;
+
+  if (debug_fix_space_level > 1) {
+    tprintf("FP fixspace working on \"%s\"\n",
+            word_res->best_choice->unichar_string().c_str());
+  }
+  // Keep rejected blobs ordered by left edge so they can later be
+  // redistributed between the split words.
+  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
+  // Move the word into a private sublist, split it there, then splice the
+  // replacement word(s) back in, leaving the iterator on the last of them.
+  sub_word_list_it.add_after_stay_put(word_res_it.extract());
+  fix_noisy_space_list(sub_word_list, row, block);
+  new_length = sub_word_list.length();
+  word_res_it.add_list_before(&sub_word_list);
+  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
+    word_res_it.forward();
+  }
+}
+
+void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
+                                     BLOCK* block) {
+  // Hill-climb like fix_fuzzy_space_list(), except permutations are
+  // generated by repeatedly deleting the noisiest blob and splitting the
+  // word there (break_noisiest_blob_word), scored with
+  // fp_eval_word_spacing().
+  int16_t best_score;
+  WERD_RES_IT best_perm_it(&best_perm);
+  WERD_RES_LIST current_perm;
+  WERD_RES_IT current_perm_it(&current_perm);
+  WERD_RES *old_word_res;
+  int16_t current_score;
+  bool improved = false;
+
+  best_score = fp_eval_word_spacing(best_perm);  // default score
+
+  dump_words(best_perm, best_score, 1, improved);
+
+  old_word_res = best_perm_it.data();
+  // Even deep_copy doesn't copy the underlying WERD unless its combination
+  // flag is true!
+  old_word_res->combination = true;   // Kludge to force deep copy
+  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
+  old_word_res->combination = false;  // Undo kludge
+
+  break_noisiest_blob_word(current_perm);
+
+  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
+    match_current_words(current_perm, row, block);
+    current_score = fp_eval_word_spacing(current_perm);
+    dump_words(current_perm, current_score, 2, improved);
+    if (current_score > best_score) {
+      best_perm.clear();
+      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
+      best_score = current_score;
+      improved = true;
+    }
+    if (current_score < PERFECT_WERDS) {
+      break_noisiest_blob_word(current_perm);
+    }
+  }
+  dump_words(best_perm, best_score, 3, improved);
+}
+
+
+/**
+ * break_noisiest_blob_word()
+ * Find the word with the blob which looks like the worst noise.
+ * Break the word into two, deleting the noise blob.
+ */
+void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
+  WERD_RES_IT word_it(&words);
+  WERD_RES_IT worst_word_it;
+  float worst_noise_score = 9999;
+  int worst_blob_index = -1;  // Noisiest blob of noisiest wd
+  int blob_index;             // of wds noisiest blob
+  float noise_score;          // of wds noisiest blob
+  WERD_RES *word_res;
+  C_BLOB_IT blob_it;
+  C_BLOB_IT rej_cblob_it;
+  C_BLOB_LIST new_blob_list;
+  C_BLOB_IT new_blob_it;
+  C_BLOB_IT new_rej_cblob_it;
+  WERD *new_word;
+  int16_t start_of_noise_blob;
+  int16_t i;
+
+  // Find the word containing the globally noisiest blob.
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    blob_index = worst_noise_blob(word_it.data(), &noise_score);
+    if (blob_index > -1 && worst_noise_score > noise_score) {
+      worst_noise_score = noise_score;
+      worst_blob_index = blob_index;
+      worst_word_it = word_it;
+    }
+  }
+  if (worst_blob_index < 0) {
+    words.clear();  // signal termination
+    return;
+  }
+
+  /* Now split the worst_word_it */
+
+  word_res = worst_word_it.data();
+
+  /* Move blobs before noise blob to a new bloblist */
+
+  new_blob_it.set_to_list(&new_blob_list);
+  blob_it.set_to_list(word_res->word->cblob_list());
+  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
+    new_blob_it.add_after_then_move(blob_it.extract());
+  }
+  start_of_noise_blob = blob_it.data()->bounding_box().left();
+  delete blob_it.extract();  // throw out noise blob
+
+  // The blobs left of the noise become a new word; the original word keeps
+  // the blobs to the right of the deleted noise blob.
+  new_word = new WERD(&new_blob_list, word_res->word);
+  new_word->set_flag(W_EOL, false);
+  word_res->word->set_flag(W_BOL, false);
+  word_res->word->set_blanks(1);  // After break
+
+  // Redistribute rejected blobs: those left of the deleted noise blob move
+  // to the new (left) word. Relies on rej_cblob_list being left-sorted.
+  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
+  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
+  for (;
+       (!rej_cblob_it.empty() &&
+        (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
+       rej_cblob_it.forward()) {
+    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
+  }
+
+  auto* new_word_res = new WERD_RES(new_word);
+  new_word_res->combination = true;
+  worst_word_it.add_before_then_move(new_word_res);
+
+  // The shortened right-hand word must be re-classified.
+  word_res->ClearResults();
+}
+
+int16_t Tesseract::worst_noise_blob(WERD_RES *word_res,
+ float *worst_noise_score) {
+ float noise_score[512];
+ int i;
+ int min_noise_blob; // 1st contender
+ int max_noise_blob; // last contender
+ int non_noise_count;
+ int worst_noise_blob; // Worst blob
+ float small_limit = kBlnXHeight * fixsp_small_outlines_size;
+ float non_noise_limit = kBlnXHeight * 0.8;
+
+ if (word_res->rebuild_word == nullptr)
+ return -1; // Can't handle cube words.
+
+ // Normalised.
+ int blob_count = word_res->box_word->length();
+ ASSERT_HOST(blob_count <= 512);
+ if (blob_count < 5)
+ return -1; // too short to split
+
+ /* Get the noise scores for all blobs */
+
+ #ifndef SECURE_NAMES
+ if (debug_fix_space_level > 5)
+ tprintf("FP fixspace Noise metrics for \"%s\": ",
+ word_res->best_choice->unichar_string().c_str());
+ #endif
+
+ for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
+ TBLOB* blob = word_res->rebuild_word->blobs[i];
+ if (word_res->reject_map[i].accepted())
+ noise_score[i] = non_noise_limit;
+ else
+ noise_score[i] = blob_noise_score(blob);
+
+ if (debug_fix_space_level > 5)
+ tprintf("%1.1f ", noise_score[i]);
+ }
+ if (debug_fix_space_level > 5)
+ tprintf("\n");
+
+ /* Now find the worst one which is far enough away from the end of the word */
+
+ non_noise_count = 0;
+ for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
+ if (noise_score[i] >= non_noise_limit) {
+ non_noise_count++;
+ }
+ }
+ if (non_noise_count < fixsp_non_noise_limit)
+ return -1;
+
+ min_noise_blob = i;
+
+ non_noise_count = 0;
+ for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
+ i--) {
+ if (noise_score[i] >= non_noise_limit) {
+ non_noise_count++;
+ }
+ }
+ if (non_noise_count < fixsp_non_noise_limit)
+ return -1;
+
+ max_noise_blob = i;
+
+ if (min_noise_blob > max_noise_blob)
+ return -1;
+
+ *worst_noise_score = small_limit;
+ worst_noise_blob = -1;
+ for (i = min_noise_blob; i <= max_noise_blob; i++) {
+ if (noise_score[i] < *worst_noise_score) {
+ worst_noise_blob = i;
+ *worst_noise_score = noise_score[i];
+ }
+ }
+ return worst_noise_blob;
+}
+
+float Tesseract::blob_noise_score(TBLOB *blob) {
+ TBOX box; // BB of outline
+ int16_t outline_count = 0;
+ int16_t max_dimension;
+ int16_t largest_outline_dimension = 0;
+
+ for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
+ outline_count++;
+ box = ol->bounding_box();
+ if (box.height() > box.width()) {
+ max_dimension = box.height();
+ } else {
+ max_dimension = box.width();
+ }
+
+ if (largest_outline_dimension < max_dimension)
+ largest_outline_dimension = max_dimension;
+ }
+
+ if (outline_count > 5) {
+ // penalise LOTS of blobs
+ largest_outline_dimension *= 2;
+ }
+
+ box = blob->bounding_box();
+ if (box.bottom() > kBlnBaselineOffset * 4 ||
+ box.top() < kBlnBaselineOffset / 2) {
+ // Be lax if the blob is unusually high or low.
+ largest_outline_dimension /= 2;
+ }
+
+ return largest_outline_dimension;
+}
+
+void fixspace_dbg(WERD_RES *word) {
+ TBOX box = word->word->bounding_box();
+ const bool show_map_detail = false;
+ int16_t i;
+
+ box.print();
+ tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
+ tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
+ word->word->cblob_list()->length(),
+ word->rebuild_word->NumBlobs(),
+ word->box_word->length());
+ word->reject_map.print(debug_fp);
+ tprintf("\n");
+ if (show_map_detail) {
+ tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
+ for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
+ tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
+ word->reject_map[i].full_print(debug_fp);
+ }
+ }
+
+ tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
+ tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
+}
+
+
+/**
+ * fp_eval_word_spacing()
+ * Evaluation function for fixed pitch word lists.
+ *
+ * Basically, count the number of "nice" characters - those which are in tess
+ * acceptable words or in dict words and are not rejected.
+ * Penalise any potential noise chars
+ */
+int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
+ WERD_RES_IT word_it(&word_res_list);
+ WERD_RES *word;
+ int16_t score = 0;
+ int16_t i;
+ float small_limit = kBlnXHeight * fixsp_small_outlines_size;
+
+ for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+ word = word_it.data();
+ if (word->rebuild_word == nullptr)
+ continue; // Can't handle cube words.
+ if (word->done ||
+ word->tess_accepted ||
+ word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+ word->best_choice->permuter() == FREQ_DAWG_PERM ||
+ word->best_choice->permuter() == USER_DAWG_PERM ||
+ safe_dict_word(word) > 0) {
+ int num_blobs = word->rebuild_word->NumBlobs();
+ UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
+ for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
+ TBLOB* blob = word->rebuild_word->blobs[i];
+ if (word->best_choice->unichar_id(i) == space ||
+ blob_noise_score(blob) < small_limit) {
+ score -= 1; // penalise possibly erroneous non-space
+ } else if (word->reject_map[i].accepted()) {
+ score++;
+ }
+ }
+ }
+ }
+ if (score < 0)
+ score = 0;
+ return score;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/fixspace.h b/tesseract/src/ccmain/fixspace.h
new file mode 100644
index 00000000..fd49bf29
--- /dev/null
+++ b/tesseract/src/ccmain/fixspace.h
@@ -0,0 +1,36 @@
+/******************************************************************
+ * File: fixspace.h (Formerly fixspace.h)
+ * Description: Implements a pass over the page res, exploring the alternative
+ * spacing possibilities, trying to use context to improve the
+ * word spacing
+ * Author: Phil Cheatle
+ * Created: Thu Oct 21 11:38:43 BST 1993
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef FIXSPACE_H
+#define FIXSPACE_H
+
+namespace tesseract {
+
+class WERD_RES;
+class WERD_RES_LIST;
+
+void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);
+void transform_to_next_perm(WERD_RES_LIST &words);
+void fixspace_dbg(WERD_RES *word);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/ccmain/fixxht.cpp b/tesseract/src/ccmain/fixxht.cpp
new file mode 100644
index 00000000..716ac385
--- /dev/null
+++ b/tesseract/src/ccmain/fixxht.cpp
@@ -0,0 +1,216 @@
+/**********************************************************************
+ * File: fixxht.cpp (Formerly fixxht.c)
+ * Description: Improve x_ht and look out for case inconsistencies
+ * Author: Phil Cheatle
+ * Created: Thu Aug 5 14:11:08 BST 1993
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "params.h"
+#include "float2int.h"
+#include "tesseractclass.h"
+
+#include <algorithm>
+#include <cstring>
+#include <cctype>
+
+namespace tesseract {
+
+// Fixxht overview.
+// Premise: Initial estimate of x-height is adequate most of the time, but
+// occasionally it is incorrect. Most notable causes of failure are:
+// 1. Small caps, where the top of the caps is the same as the body text
+// xheight. For small caps words the xheight needs to be reduced to correctly
+// recognize the caps in the small caps word.
+// 2. All xheight lines, such as summer. Here the initial estimate will have
+// guessed that the blob tops are caps and will have placed the xheight too low.
+// 3. Noise/logos beside words, or changes in font size on a line. Such
+// things can blow the statistics and cause an incorrect estimate.
+// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
+// In this case the x-height is often still correct.
+//
+// Algorithm.
+// Compare the vertical position (top only) of alphanumerics in a word with
+// the range of positions in training data (in the unicharset).
+// See CountMisfitTops. If any characters disagree sufficiently with the
+// initial xheight estimate, then recalculate the xheight, re-run OCR on
+// the word, and if the number of vertical misfits goes down, along with
+// either the word rating or certainty, then keep the new xheight.
+// The new xheight is calculated as follows (see ComputeCompatibleXheight):
+// For each alphanumeric character that has a vertically misplaced top
+// (a misfit), yet its bottom is within the acceptable range (ie it is not
+// likely a sub-or super-script) calculate the range of acceptable xheight
+// positions from its range of tops, and give each value in the range a
+// number of votes equal to the distance of its top from its acceptance range.
+// The x-height position with the median of the votes becomes the new
+// x-height. This assumes that most characters will be correctly recognized
+// even if the x-height is incorrect. This is not a terrible assumption, but
+// it is not great. An improvement would be to use a classifier that does
+// not care about vertical position or scaling at all.
+// Separately collect stats on shifted baselines and apply the same logic to
+// computing a best-fit shift to fix the error. If the baseline needs to be
+// shifted, but the x-height is OK, returns the original x-height along with
+// the baseline shift to indicate that recognition needs to re-run.
+
+// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
+// then the char top cannot be used to judge misfits or suggest a new top.
+const int kMaxCharTopRange = 48;
+
+// Returns the number of misfit blob tops in this word.
+int Tesseract::CountMisfitTops(WERD_RES *word_res) {
+ int bad_blobs = 0;
+ int num_blobs = word_res->rebuild_word->NumBlobs();
+ for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
+ TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
+ UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
+ if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
+ int top = blob->bounding_box().top();
+ if (top >= INT_FEAT_RANGE)
+ top = INT_FEAT_RANGE - 1;
+ int min_bottom, max_bottom, min_top, max_top;
+ unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
+ &min_top, &max_top);
+ if (max_top - min_top > kMaxCharTopRange)
+ continue;
+ bool bad = top < min_top - x_ht_acceptance_tolerance ||
+ top > max_top + x_ht_acceptance_tolerance;
+ if (bad)
+ ++bad_blobs;
+ if (debug_x_ht_level >= 1) {
+ tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
+ unicharset.id_to_unichar(class_id),
+ bad ? "Misfit" : "OK", top, min_top, max_top,
+ static_cast<int>(x_ht_acceptance_tolerance));
+ }
+ }
+ }
+ return bad_blobs;
+}
+
+// Returns a new x-height maximally compatible with the result in word_res.
+// See comment above for overall algorithm.
+float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
+ float* baseline_shift) {
+ STATS top_stats(0, UINT8_MAX);
+ STATS shift_stats(-UINT8_MAX, UINT8_MAX);
+ int bottom_shift = 0;
+ int num_blobs = word_res->rebuild_word->NumBlobs();
+ do {
+ top_stats.clear();
+ shift_stats.clear();
+ for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
+ TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
+ UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
+ if (unicharset.get_isalpha(class_id) ||
+ unicharset.get_isdigit(class_id)) {
+ int top = blob->bounding_box().top() + bottom_shift;
+ // Clip the top to the limit of normalized feature space.
+ if (top >= INT_FEAT_RANGE)
+ top = INT_FEAT_RANGE - 1;
+ int bottom = blob->bounding_box().bottom() + bottom_shift;
+ int min_bottom, max_bottom, min_top, max_top;
+ unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
+ &min_top, &max_top);
+ // Chars with a wild top range would mess up the result so ignore them.
+ if (max_top - min_top > kMaxCharTopRange)
+ continue;
+ int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
+ top - (max_top + x_ht_acceptance_tolerance));
+ int height = top - kBlnBaselineOffset;
+ if (debug_x_ht_level >= 2) {
+ tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
+ unicharset.id_to_unichar(class_id),
+ height, min_bottom, max_bottom, min_top, max_top,
+ bottom, top);
+ }
+ // Use only chars that fit in the expected bottom range, and where
+ // the range of tops is sensibly near the xheight.
+ if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
+ bottom - x_ht_acceptance_tolerance <= max_bottom &&
+ min_top > kBlnBaselineOffset &&
+ max_top - kBlnBaselineOffset >= kBlnXHeight &&
+ misfit_dist > 0) {
+ // Compute the x-height position using proportionality between the
+ // actual height and expected height.
+ int min_xht = DivRounded(height * kBlnXHeight,
+ max_top - kBlnBaselineOffset);
+ int max_xht = DivRounded(height * kBlnXHeight,
+ min_top - kBlnBaselineOffset);
+ if (debug_x_ht_level >= 2) {
+ tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
+ }
+ // The range of expected heights gets a vote equal to the distance
+ // of the actual top from the expected top.
+ for (int y = min_xht; y <= max_xht; ++y)
+ top_stats.add(y, misfit_dist);
+ } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
+ bottom - x_ht_acceptance_tolerance > max_bottom) &&
+ bottom_shift == 0) {
+ // Get the range of required bottom shift.
+ int min_shift = min_bottom - bottom;
+ int max_shift = max_bottom - bottom;
+ if (debug_x_ht_level >= 2) {
+ tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
+ }
+ // The range of expected shifts gets a vote equal to the min distance
+ // of the actual bottom from the expected bottom, spread over the
+ // range of its acceptance.
+ int misfit_weight = abs(min_shift);
+ if (max_shift > min_shift)
+ misfit_weight /= max_shift - min_shift;
+ for (int y = min_shift; y <= max_shift; ++y)
+ shift_stats.add(y, misfit_weight);
+ } else {
+ if (bottom_shift == 0) {
+ // Things with bottoms that are already ok need to say so, on the
+ // 1st iteration only.
+ shift_stats.add(0, kBlnBaselineOffset);
+ }
+ if (debug_x_ht_level >= 2) {
+ tprintf(" already OK\n");
+ }
+ }
+ }
+ }
+ if (shift_stats.get_total() > top_stats.get_total()) {
+ bottom_shift = IntCastRounded(shift_stats.median());
+ if (debug_x_ht_level >= 2) {
+ tprintf("Applying bottom shift=%d\n", bottom_shift);
+ }
+ }
+ } while (bottom_shift != 0 &&
+ top_stats.get_total() < shift_stats.get_total());
+ // Baseline shift is opposite sign to the bottom shift.
+ *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
+ if (debug_x_ht_level >= 2) {
+ tprintf("baseline shift=%g\n", *baseline_shift);
+ }
+ if (top_stats.get_total() == 0)
+ return bottom_shift != 0 ? word_res->x_height : 0.0f;
+ // The new xheight is just the median vote, which is then scaled out
+ // of BLN space back to pixel space to get the x-height in pixel space.
+ float new_xht = top_stats.median();
+ if (debug_x_ht_level >= 2) {
+ tprintf("Median xht=%f\n", new_xht);
+ tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
+ new_xht, new_xht / word_res->denorm.y_scale());
+ }
+ // The xheight must change by at least x_ht_min_change to be used.
+ if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
+ return new_xht / word_res->denorm.y_scale();
+ else
+ return bottom_shift != 0 ? word_res->x_height : 0.0f;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/linerec.cpp b/tesseract/src/ccmain/linerec.cpp
new file mode 100644
index 00000000..4db50e03
--- /dev/null
+++ b/tesseract/src/ccmain/linerec.cpp
@@ -0,0 +1,307 @@
+///////////////////////////////////////////////////////////////////////
+// File: linerec.cpp
+// Description: Top-level line-based recognition module for Tesseract.
+// Author: Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include "allheaders.h"
+#include "boxread.h"
+#include "imagedata.h"
+#include "lstmrecognizer.h"
+#include "recodebeam.h"
+#include "pageres.h"
+#include "tprintf.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
+// Scale factor to make certainty more comparable to Tesseract.
+const float kCertaintyScale = 7.0f;
+// Worst acceptable certainty for a dictionary word.
+const float kWorstDictCertainty = -25.0f;
+
+// Generates training data for training a line recognizer, eg LSTM.
+// Breaks the page into lines, according to the boxes, and writes them to a
+// serialized DocumentData based on output_basename.
+// Return true if successful, false if an error occurred.
+bool Tesseract::TrainLineRecognizer(const char* input_imagename,
+ const STRING& output_basename,
+ BLOCK_LIST *block_list) {
+ STRING lstmf_name = output_basename + ".lstmf";
+ DocumentData images(lstmf_name);
+ if (applybox_page > 0) {
+ // Load existing document for the previous pages.
+ if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
+ tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
+ return false;
+ }
+ }
+ std::vector<TBOX> boxes;
+ std::vector<STRING> texts;
+ // Get the boxes for this page, if there are any.
+ if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
+ nullptr) ||
+ boxes.empty()) {
+ tprintf("Failed to read boxes from %s\n", input_imagename);
+ return false;
+ }
+ TrainFromBoxes(boxes, texts, block_list, &images);
+ if (images.PagesSize() == 0) {
+ tprintf("Failed to read pages from %s\n", input_imagename);
+ return false;
+ }
+ images.Shuffle();
+ if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
+ tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
+ return false;
+ }
+ return true;
+}
+
+// Generates training data for training a line recognizer, eg LSTM.
+// Breaks the boxes into lines, normalizes them, converts to ImageData and
+// appends them to the given training_data.
+void Tesseract::TrainFromBoxes(const std::vector<TBOX>& boxes,
+ const std::vector<STRING>& texts,
+ BLOCK_LIST *block_list,
+ DocumentData* training_data) {
+ int box_count = boxes.size();
+ // Process all the text lines in this page, as defined by the boxes.
+ int end_box = 0;
+ // Don't let \t, which marks newlines in the box file, get into the line
+ // content, as that makes the line unusable in training.
+ while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
+ for (int start_box = end_box; start_box < box_count; start_box = end_box) {
+ // Find the textline of boxes starting at start and their bounding box.
+ TBOX line_box = boxes[start_box];
+ STRING line_str = texts[start_box];
+ for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
+ ++end_box) {
+ line_box += boxes[end_box];
+ line_str += texts[end_box];
+ }
+ // Find the most overlapping block.
+ BLOCK* best_block = nullptr;
+ int best_overlap = 0;
+ BLOCK_IT b_it(block_list);
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ BLOCK* block = b_it.data();
+ if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
+ continue; // Not a text block.
+ TBOX block_box = block->pdblk.bounding_box();
+ block_box.rotate(block->re_rotation());
+ if (block_box.major_overlap(line_box)) {
+ TBOX overlap_box = line_box.intersection(block_box);
+ if (overlap_box.area() > best_overlap) {
+ best_overlap = overlap_box.area();
+ best_block = block;
+ }
+ }
+ }
+ ImageData* imagedata = nullptr;
+ if (best_block == nullptr) {
+ tprintf("No block overlapping textline: %s\n", line_str.c_str());
+ } else {
+ imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
+ *best_block);
+ }
+ if (imagedata != nullptr)
+ training_data->AddPageToDocument(imagedata);
+ // Don't let \t, which marks newlines in the box file, get into the line
+ // content, as that makes the line unusable in training.
+ while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
+ }
+}
+
+// Returns an Imagedata containing the image of the given box,
+// and ground truth boxes/truth text if available in the input.
+// The image is not normalized in any way.
+ImageData* Tesseract::GetLineData(const TBOX& line_box,
+ const std::vector<TBOX>& boxes,
+ const std::vector<STRING>& texts,
+ int start_box, int end_box,
+ const BLOCK& block) {
+ TBOX revised_box;
+ ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
+ &revised_box);
+ if (image_data == nullptr) return nullptr;
+ image_data->set_page_number(applybox_page);
+ // Copy the boxes and shift them so they are relative to the image.
+ FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
+ ICOORD shift = -revised_box.botleft();
+ std::vector<TBOX> line_boxes;
+ std::vector<STRING> line_texts;
+ for (int b = start_box; b < end_box; ++b) {
+ TBOX box = boxes[b];
+ box.rotate(block_rotation);
+ box.move(shift);
+ line_boxes.push_back(box);
+ line_texts.push_back(texts[b]);
+ }
+ std::vector<int> page_numbers;
+ page_numbers.resize(line_boxes.size(), applybox_page);
+ image_data->AddBoxes(line_boxes, line_texts, page_numbers);
+ return image_data;
+}
+
+// Helper gets the image of a rectangle, using the block.re_rotation() if
+// needed to get to the image, and rotating the result back to horizontal
+// layout. (CJK characters will be on their left sides) The vertical text flag
+// is set in the returned ImageData if the text was originally vertical, which
+// can be used to invoke a different CJK recognition engine. The revised_box
+// is also returned to enable calculation of output bounding boxes.
+ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
+ int padding, TBOX* revised_box) const {
+ TBOX wbox = box;
+ wbox.pad(padding, padding);
+ *revised_box = wbox;
+ // Number of clockwise 90 degree rotations needed to get back to tesseract
+ // coords from the clipped image.
+ int num_rotations = 0;
+ if (block.re_rotation().y() > 0.0f)
+ num_rotations = 1;
+ else if (block.re_rotation().x() < 0.0f)
+ num_rotations = 2;
+ else if (block.re_rotation().y() < 0.0f)
+ num_rotations = 3;
+ // Handle two cases automatically: (1) the box came from the block, (2) the
+ // box came from a box file and refers to the image, which the block may not.
+ if (block.pdblk.bounding_box().major_overlap(*revised_box))
+ revised_box->rotate(block.re_rotation());
+ // Now revised_box always refers to the image.
+ // BestPix is never colormapped, but may be of any depth.
+ Pix* pix = BestPix();
+ int width = pixGetWidth(pix);
+ int height = pixGetHeight(pix);
+ TBOX image_box(0, 0, width, height);
+ // Clip to image bounds.
+ *revised_box &= image_box;
+ if (revised_box->null_box()) return nullptr;
+ Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
+ revised_box->width(), revised_box->height());
+ Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
+ boxDestroy(&clip_box);
+ if (box_pix == nullptr) return nullptr;
+ if (num_rotations > 0) {
+ Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
+ pixDestroy(&box_pix);
+ box_pix = rot_pix;
+ }
+ // Convert sub-8-bit images to 8 bit.
+ int depth = pixGetDepth(box_pix);
+ if (depth < 8) {
+ Pix* grey;
+ grey = pixConvertTo8(box_pix, false);
+ pixDestroy(&box_pix);
+ box_pix = grey;
+ }
+ bool vertical_text = false;
+ if (num_rotations > 0) {
+ // Rotate the clipped revised box back to internal coordinates.
+ FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
+ revised_box->rotate(rotation);
+ if (num_rotations != 2)
+ vertical_text = true;
+ }
+ return new ImageData(vertical_text, box_pix);
+}
+
+// Recognizes a word or group of words, converting to WERD_RES in *words.
+// Analogous to classify_word_pass1, but can handle a group of words as well.
+void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
+ PointerVector<WERD_RES>* words) {
+ TBOX word_box = word->word->bounding_box();
+ // Get the word image - no frills.
+ if (tessedit_pageseg_mode == PSM_SINGLE_WORD ||
+ tessedit_pageseg_mode == PSM_RAW_LINE) {
+ // In single word mode, use the whole image without any other row/word
+ // interpretation.
+ word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
+ } else {
+ float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
+ if (baseline + row->descenders() < word_box.bottom())
+ word_box.set_bottom(baseline + row->descenders());
+ if (baseline + row->x_height() + row->ascenders() > word_box.top())
+ word_box.set_top(baseline + row->x_height() + row->ascenders());
+ }
+ ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
+ if (im_data == nullptr) return;
+
+ bool do_invert = tessedit_do_invert;
+ lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
+ kWorstDictCertainty / kCertaintyScale,
+ word_box, words, lstm_choice_mode,
+ lstm_choice_iterations);
+ delete im_data;
+ SearchWords(words);
+}
+
+// Apply segmentation search to the given set of words, within the constraints
+// of the existing ratings matrix. If there is already a best_choice on a word
+// leaves it untouched and just sets the done/accepted etc flags.
+void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
+ // Run the segmentation search on the network outputs and make a BoxWord
+ // for each of the output words.
+ // If we drop a word as junk, then there is always a space in front of the
+ // next.
+ const Dict* stopper_dict = lstm_recognizer_->GetDict();
+ if (stopper_dict == nullptr) stopper_dict = &getDict();
+ bool any_nonspace_delimited = false;
+ for (int w = 0; w < words->size(); ++w) {
+ WERD_RES* word = (*words)[w];
+ if (word->best_choice != nullptr &&
+ word->best_choice->ContainsAnyNonSpaceDelimited()) {
+ any_nonspace_delimited = true;
+ break;
+ }
+ }
+ for (int w = 0; w < words->size(); ++w) {
+ WERD_RES* word = (*words)[w];
+ if (word->best_choice == nullptr) {
+ // It is a dud.
+ word->SetupFake(lstm_recognizer_->GetUnicharset());
+ } else {
+ // Set the best state.
+ for (int i = 0; i < word->best_choice->length(); ++i) {
+ int length = word->best_choice->state(i);
+ word->best_state.push_back(length);
+ }
+ word->reject_map.initialise(word->best_choice->length());
+ word->tess_failed = false;
+ word->tess_accepted = true;
+ word->tess_would_adapt = false;
+ word->done = true;
+ word->tesseract = this;
+ float word_certainty = std::min(word->space_certainty,
+ word->best_choice->certainty());
+ word_certainty *= kCertaintyScale;
+ if (getDict().stopper_debug_level >= 1) {
+ tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
+ word->best_choice->certainty(), word->space_certainty,
+ std::min(word->space_certainty, word->best_choice->certainty()) *
+ kCertaintyScale,
+ word_certainty);
+ word->best_choice->print();
+ }
+ word->best_choice->set_certainty(word_certainty);
+
+ word->tess_accepted = stopper_dict->AcceptableResult(word);
+ }
+ }
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/ltrresultiterator.cpp b/tesseract/src/ccmain/ltrresultiterator.cpp
new file mode 100644
index 00000000..5b6cfaf5
--- /dev/null
+++ b/tesseract/src/ccmain/ltrresultiterator.cpp
@@ -0,0 +1,492 @@
+///////////////////////////////////////////////////////////////////////
+// File: ltrresultiterator.cpp
+// Description: Iterator for tesseract results in strict left-to-right
+// order that avoids using tesseract internal data structures.
+// Author: Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/ltrresultiterator.h>
+
+#include "pageres.h"
+#include "tesseractclass.h"
+
+#include "allheaders.h"
+
+#include "strngs.h"
+
+namespace tesseract {
+
+// Constructor: forwards all geometry/scale arguments to the PageIterator
+// base and initializes the line and paragraph separators to "\n".
+LTRResultIterator::LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
+ int scale, int scaled_yres, int rect_left,
+ int rect_top, int rect_width,
+ int rect_height)
+ : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top,
+ rect_width, rect_height),
+ line_separator_("\n"),
+ paragraph_separator_("\n") {}
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+LTRResultIterator::~LTRResultIterator() = default;
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// object at the given level. Use delete [] to free after use.
+// For RIL_SYMBOL/RIL_WORD the text comes straight from the best choice;
+// for higher levels the nested loops concatenate words with spaces,
+// lines with line_separator_ and paragraphs with paragraph_separator_.
+char* LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+ if (it_->word() == nullptr)
+ return nullptr; // Already at the end!
+ STRING text;
+ PAGE_RES_IT res_it(*it_);
+ WERD_CHOICE* best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ if (level == RIL_SYMBOL) {
+ text = res_it.word()->BestUTF8(blob_index_, false);
+ } else if (level == RIL_WORD) {
+ text = best_choice->unichar_string();
+ } else {
+ bool eol = false; // end of line?
+ bool eop = false; // end of paragraph?
+ do { // for each paragraph in a block
+ do { // for each text line in a paragraph
+ do { // for each word in a text line
+ best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ text += best_choice->unichar_string();
+ text += " ";
+ res_it.forward();
+ eol = res_it.row() != res_it.prev_row();
+ } while (!eol);
+ // Drop the trailing space added after the last word of the line.
+ text.truncate_at(text.length() - 1);
+ text += line_separator_;
+ eop = res_it.block() != res_it.prev_block() ||
+ res_it.row()->row->para() != res_it.prev_row()->row->para();
+ } while (level != RIL_TEXTLINE && !eop);
+ if (eop)
+ text += paragraph_separator_;
+ } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());
+ }
+ // Copy into a caller-owned buffer (length includes the terminating NUL).
+ int length = text.length() + 1;
+ char* result = new char[length];
+ strncpy(result, text.c_str(), length);
+ return result;
+}
+
+// Set the string inserted at the end of each text line. "\n" by default.
+// The string is copied into the iterator, so the argument need not outlive it.
+void LTRResultIterator::SetLineSeparator(const char* new_line) {
+ line_separator_ = new_line;
+}
+
+// Set the string inserted at the end of each paragraph. "\n" by default.
+// The string is copied into the iterator, so the argument need not outlive it.
+void LTRResultIterator::SetParagraphSeparator(const char* new_para) {
+ paragraph_separator_ = new_para;
+}
+
+// Returns the mean confidence of the current object at the given level.
+// The number should be interpreted as a percent probability. (0.0f-100.0f)
+// Internally the per-word certainties (negative log-ish scores) are averaged
+// and mapped to a percentage via 100 + 5 * certainty, clipped to [0, 100].
+float LTRResultIterator::Confidence(PageIteratorLevel level) const {
+ if (it_->word() == nullptr)
+ return 0.0f; // Already at the end!
+ float mean_certainty = 0.0f;
+ int certainty_count = 0;
+ PAGE_RES_IT res_it(*it_);
+ WERD_CHOICE* best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ switch (level) {
+ case RIL_BLOCK:
+ // Average over every word remaining in the current block.
+ do {
+ best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ mean_certainty += best_choice->certainty();
+ ++certainty_count;
+ res_it.forward();
+ } while (res_it.block() == res_it.prev_block());
+ break;
+ case RIL_PARA:
+ // Average over every word in the current paragraph.
+ do {
+ best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ mean_certainty += best_choice->certainty();
+ ++certainty_count;
+ res_it.forward();
+ } while (res_it.block() == res_it.prev_block() &&
+ res_it.row()->row->para() == res_it.prev_row()->row->para());
+ break;
+ case RIL_TEXTLINE:
+ // Average over every word in the current text line.
+ do {
+ best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ mean_certainty += best_choice->certainty();
+ ++certainty_count;
+ res_it.forward();
+ } while (res_it.row() == res_it.prev_row());
+ break;
+ case RIL_WORD:
+ mean_certainty += best_choice->certainty();
+ ++certainty_count;
+ break;
+ case RIL_SYMBOL:
+ // Certainty of the single blob under the iterator.
+ mean_certainty += best_choice->certainty(blob_index_);
+ ++certainty_count;
+ }
+ if (certainty_count > 0) {
+ mean_certainty /= certainty_count;
+ return ClipToRange(100 + 5 * mean_certainty, 0.0f, 100.0f);
+ }
+ return 0.0f;
+}
+
+// Returns metrics (in pixels) of the current row: total height
+// (x-height + ascender rise - descender drop; descenders() is negative),
+// plus the raw descender and ascender values.
+void LTRResultIterator::RowAttributes(float* row_height, float* descenders,
+ float* ascenders) const {
+ *row_height = it_->row()->row->x_height() + it_->row()->row->ascenders() -
+ it_->row()->row->descenders();
+ *descenders = it_->row()->row->descenders();
+ *ascenders = it_->row()->row->ascenders();
+}
+
+// Returns the font attributes of the current word. If iterating at a higher
+// level object than words, eg textlines, then this will return the
+// attributes of the first word in that textline.
+// The actual return value is a string representing a font name. It points
+// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+// the iterator itself, ie rendered invalid by various members of
+// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+// Pointsize is returned in printers points (1/72 inch.)
+// Returns nullptr (and zeroes/falses all outputs) when no font information
+// is available, e.g. at the end of the page or with the legacy engine
+// disabled.
+const char* LTRResultIterator::WordFontAttributes(
+ bool* is_bold, bool* is_italic, bool* is_underlined, bool* is_monospace,
+ bool* is_serif, bool* is_smallcaps, int* pointsize, int* font_id) const {
+ const char* result = nullptr;
+
+ if (it_->word() == nullptr) {
+ // Already at the end!
+ *pointsize = 0;
+ } else {
+ // Row height in pixels (descenders() is negative, hence the subtraction).
+ float row_height = it_->row()->row->x_height() +
+ it_->row()->row->ascenders() -
+ it_->row()->row->descenders();
+ // Convert from pixels to printers points.
+ *pointsize =
+ scaled_yres_ > 0
+ ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
+ : 0;
+
+ #ifndef DISABLED_LEGACY_ENGINE
+ const FontInfo* font_info = it_->word()->fontinfo;
+ if (font_info) {
+ // Font information available.
+ *font_id = font_info->universal_id;
+ *is_bold = font_info->is_bold();
+ *is_italic = font_info->is_italic();
+ *is_underlined = false; // TODO(rays) fix this!
+ *is_monospace = font_info->is_fixed_pitch();
+ *is_serif = font_info->is_serif();
+ result = font_info->name;
+ }
+ #endif // ndef DISABLED_LEGACY_ENGINE
+
+ *is_smallcaps = it_->word()->small_caps;
+ }
+
+ // No font info found: reset all outputs to defined values.
+ if (!result) {
+ *is_bold = false;
+ *is_italic = false;
+ *is_underlined = false;
+ *is_monospace = false;
+ *is_serif = false;
+ *is_smallcaps = false;
+ *font_id = -1;
+ }
+
+ return result;
+}
+
+// Returns the name of the language used to recognize this word.
+// The pointer belongs to the Tesseract instance that recognized the word;
+// returns nullptr at end of page or if no recognizer is attached.
+const char* LTRResultIterator::WordRecognitionLanguage() const {
+ if (it_->word() == nullptr || it_->word()->tesseract == nullptr)
+ return nullptr;
+ return it_->word()->tesseract->lang.c_str();
+}
+
+// Return the overall directionality of this word.
+// Classified from the presence of RTL vs LTR characters:
+// only RTL -> DIR_RIGHT_TO_LEFT, only LTR -> DIR_LEFT_TO_RIGHT,
+// neither -> DIR_NEUTRAL, both -> DIR_MIX.
+StrongScriptDirection LTRResultIterator::WordDirection() const {
+ if (it_->word() == nullptr)
+ return DIR_NEUTRAL;
+ bool has_rtl = it_->word()->AnyRtlCharsInWord();
+ bool has_ltr = it_->word()->AnyLtrCharsInWord();
+ if (has_rtl && !has_ltr)
+ return DIR_RIGHT_TO_LEFT;
+ if (has_ltr && !has_rtl)
+ return DIR_LEFT_TO_RIGHT;
+ if (!has_ltr && !has_rtl)
+ return DIR_NEUTRAL;
+ return DIR_MIX;
+}
+
+// Returns true if the current word was found in a dictionary.
+// Checks the permuter of the best choice against the three dictionary
+// permuter types (system, frequent-word and user dawgs).
+bool LTRResultIterator::WordIsFromDictionary() const {
+ if (it_->word() == nullptr)
+ return false; // Already at the end!
+ int permuter = it_->word()->best_choice->permuter();
+ return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
+ permuter == USER_DAWG_PERM;
+}
+
+// Returns the number of blanks before the current word.
+// At the end of the page, reports a single blank.
+int LTRResultIterator::BlanksBeforeWord() const {
+ if (it_->word() == nullptr)
+ return 1;
+ return it_->word()->word->space();
+}
+
+// Returns true if the current word is numeric.
+// True iff the best choice was produced by the number permuter.
+bool LTRResultIterator::WordIsNumeric() const {
+ if (it_->word() == nullptr)
+ return false; // Already at the end!
+ int permuter = it_->word()->best_choice->permuter();
+ return permuter == NUMBER_PERM;
+}
+
+// Returns true if the word contains blamer information.
+bool LTRResultIterator::HasBlamerInfo() const {
+ return it_->word() != nullptr && it_->word()->blamer_bundle != nullptr &&
+ it_->word()->blamer_bundle->HasDebugInfo();
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
+// of the current word.
+// Returned as void* so callers need not see the blamer headers; nullptr if
+// there is no word or no blamer bundle.
+const void* LTRResultIterator::GetParamsTrainingBundle() const {
+ return (it_->word() != nullptr && it_->word()->blamer_bundle != nullptr)
+ ? &(it_->word()->blamer_bundle->params_training_bundle())
+ : nullptr;
+}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// Returns the pointer to the string with blamer information for this word.
+// Assumes that the word's blamer_bundle is not nullptr.
+const char* LTRResultIterator::GetBlamerDebug() const {
+ return it_->word()->blamer_bundle->debug().c_str();
+}
+
+// Returns the pointer to the string with misadaption information for this word.
+// Assumes that the word's blamer_bundle is not nullptr.
+const char* LTRResultIterator::GetBlamerMisadaptionDebug() const {
+ return it_->word()->blamer_bundle->misadaption_debug().c_str();
+}
+
+// Returns true if a truth string was recorded for the current word.
+bool LTRResultIterator::HasTruthString() const {
+ if (it_->word() == nullptr)
+ return false; // Already at the end!
+ if (it_->word()->blamer_bundle == nullptr ||
+ it_->word()->blamer_bundle->NoTruth()) {
+ return false; // no truth information for this word
+ }
+ return true;
+}
+
+// Returns true if the given string is equivalent to the truth string for
+// the current word.
+// Builds a WERD_CHOICE from str using the word's unicharset and delegates
+// the comparison to the blamer bundle.
+bool LTRResultIterator::EquivalentToTruth(const char* str) const {
+ if (!HasTruthString())
+ return false;
+ ASSERT_HOST(it_->word()->uch_set != nullptr);
+ WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
+ return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
+}
+
+// Returns the null terminated UTF-8 encoded truth string for the current word.
+// Use delete [] to free after use.
+char* LTRResultIterator::WordTruthUTF8Text() const {
+ if (!HasTruthString())
+ return nullptr;
+ STRING truth_text = it_->word()->blamer_bundle->TruthString();
+ // Copy into a caller-owned buffer (length includes the terminating NUL).
+ int length = truth_text.length() + 1;
+ char* result = new char[length];
+ strncpy(result, truth_text.c_str(), length);
+ return result;
+}
+
+// Returns the null terminated UTF-8 encoded normalized OCR string for the
+// current word. Use delete [] to free after use.
+// "Normalized" here means each unichar is rendered via the unicharset's
+// get_normed_unichar rather than the raw recognized form.
+char* LTRResultIterator::WordNormedUTF8Text() const {
+ if (it_->word() == nullptr)
+ return nullptr; // Already at the end!
+ STRING ocr_text;
+ WERD_CHOICE* best_choice = it_->word()->best_choice;
+ const UNICHARSET* unicharset = it_->word()->uch_set;
+ ASSERT_HOST(best_choice != nullptr);
+ for (int i = 0; i < best_choice->length(); ++i) {
+ ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
+ }
+ // Copy into a caller-owned buffer (length includes the terminating NUL).
+ int length = ocr_text.length() + 1;
+ char* result = new char[length];
+ strncpy(result, ocr_text.c_str(), length);
+ return result;
+}
+
+// Returns a pointer to serialized choice lattice.
+// Fills lattice_size with the number of bytes in lattice data.
+// Returns nullptr (leaving lattice_size untouched) if there is no word or
+// no blamer bundle. The returned pointer is owned by the blamer bundle.
+const char* LTRResultIterator::WordLattice(int* lattice_size) const {
+ if (it_->word() == nullptr)
+ return nullptr; // Already at the end!
+ if (it_->word()->blamer_bundle == nullptr)
+ return nullptr;
+ *lattice_size = it_->word()->blamer_bundle->lattice_size();
+ return it_->word()->blamer_bundle->lattice_data();
+}
+
+// Returns true if the current symbol is a superscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+// Only meaningful when iterating recognition results (cblob_it_ == nullptr);
+// returns false when walking raw blobs.
+bool LTRResultIterator::SymbolIsSuperscript() const {
+ if (cblob_it_ == nullptr && it_->word() != nullptr)
+ return it_->word()->best_choice->BlobPosition(blob_index_) ==
+ SP_SUPERSCRIPT;
+ return false;
+}
+
+// Returns true if the current symbol is a subscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsSubscript() const {
+ if (cblob_it_ == nullptr && it_->word() != nullptr)
+ return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
+ return false;
+}
+
+// Returns true if the current symbol is a dropcap.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsDropcap() const {
+ if (cblob_it_ == nullptr && it_->word() != nullptr)
+ return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
+ return false;
+}
+
+// Constructs a ChoiceIterator over the alternative choices for the symbol
+// the given result iterator currently points at. Depending on which engine
+// produced the word, it either walks the LSTM CTC symbol choices
+// (LSTM_choices_) or the legacy BLOB_CHOICE list (choice_it_).
+ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
+ ASSERT_HOST(result_it.it_->word() != nullptr);
+ word_res_ = result_it.it_->word();
+ oemLSTM_ = word_res_->tesseract->AnyLSTMLang();
+ // Is there legacy engine related trained data?
+ bool oemLegacy = word_res_->tesseract->AnyTessLang();
+ // Is lstm_choice_mode activated?
+ bool lstm_choice_mode = word_res_->tesseract->lstm_choice_mode;
+ rating_coefficient_ = word_res_->tesseract->lstm_rating_coefficient;
+ blanks_before_word_ = result_it.BlanksBeforeWord();
+ BLOB_CHOICE_LIST* choices = nullptr;
+ tstep_index_ = &result_it.blob_index_;
+ if (oemLSTM_ && !word_res_->CTC_symbol_choices.empty()) {
+ // If the first CTC choice is not a space, no leading blanks precede
+ // this word in the timestep sequence.
+ if (!word_res_->CTC_symbol_choices[0].empty() &&
+ strcmp(word_res_->CTC_symbol_choices[0][0].first, " ")) {
+ blanks_before_word_ = 0;
+ }
+ auto index = *tstep_index_;
+ index += blanks_before_word_;
+ if (index < word_res_->CTC_symbol_choices.size()) {
+ LSTM_choices_ = &word_res_->CTC_symbol_choices[index];
+ filterSpaces();
+ }
+ }
+ if ((oemLegacy || !lstm_choice_mode) && word_res_->ratings != nullptr)
+ choices = word_res_->GetBlobChoices(result_it.blob_index_);
+ if (choices != nullptr && !choices->empty()) {
+ choice_it_ = new BLOB_CHOICE_IT(choices);
+ choice_it_->mark_cycle_pt();
+ } else {
+ choice_it_ = nullptr;
+ }
+ // NOTE(review): LSTM_choices_ is read here even on paths that never
+ // assigned it — presumably it is null-initialized in the class
+ // declaration; confirm against the header.
+ if (LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+ LSTM_choice_it_ = LSTM_choices_->begin();
+ }
+}
+// Destructor: choice_it_ is owned by this iterator; LSTM_choices_ points
+// into the WERD_RES and is not owned.
+ChoiceIterator::~ChoiceIterator() {
+ delete choice_it_;
+}
+
+// Moves to the next choice for the symbol and returns false if there
+// are none left.
+// Uses the LSTM choice list when available, otherwise the legacy
+// BLOB_CHOICE cyclic list.
+bool ChoiceIterator::Next() {
+ if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+ // Stop before advancing past the last element.
+ if (LSTM_choice_it_ != LSTM_choices_->end() &&
+ next(LSTM_choice_it_) == LSTM_choices_->end()) {
+ return false;
+ } else {
+ ++LSTM_choice_it_;
+ return true;
+ }
+ } else {
+ if (choice_it_ == nullptr)
+ return false;
+ choice_it_->forward();
+ return !choice_it_->cycled_list();
+ }
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// choice. Do NOT use delete [] to free after use.
+// The pointer is owned by the LSTM choice list or the word's unicharset.
+const char* ChoiceIterator::GetUTF8Text() const {
+ if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+ std::pair<const char*, float> choice = *LSTM_choice_it_;
+ return choice.first;
+ } else {
+ if (choice_it_ == nullptr)
+ return nullptr;
+ UNICHAR_ID id = choice_it_->data()->unichar_id();
+ return word_res_->uch_set->id_to_unichar_ext(id);
+ }
+}
+
+// Returns the confidence of the current choice depending on the used language
+// data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
+// choices for one symbol should roughly add up to 1.0f.
+// If only traineddata of the legacy engine is used, the number should be
+// interpreted as a percent probability. (0.0f-100.0f) In this case
+// probabilities won't add up to 100. Each one stands on its own.
+float ChoiceIterator::Confidence() const {
+ float confidence;
+ if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+ std::pair<const char*, float> choice = *LSTM_choice_it_;
+ // LSTM ratings are converted via the tunable lstm_rating_coefficient.
+ confidence = 100 - rating_coefficient_ * choice.second;
+ } else {
+ if (choice_it_ == nullptr)
+ return 0.0f;
+ // Legacy certainty -> percent mapping, same scale as
+ // LTRResultIterator::Confidence.
+ confidence = 100 + 5 * choice_it_->data()->certainty();
+ }
+ return ClipToRange(confidence, 0.0f, 100.0f);
+}
+
+// Returns the set of timesteps which belong to the current symbol
+// (LSTM engine only). Returns nullptr if the offset is out of range or the
+// LSTM engine is not in use. The returned pointer is owned by the WERD_RES.
+std::vector<std::vector<std::pair<const char*, float>>>*
+ChoiceIterator::Timesteps() const {
+ int offset = *tstep_index_ + blanks_before_word_;
+ // NOTE(review): signed offset compared against an unsigned size().
+ if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {
+ return nullptr;
+ }
+ return &word_res_->segmented_timesteps[offset];
+}
+
+// Removes all pure-space entries from the current LSTM choice list in
+// place, so that space never appears as an alternative for a visible symbol.
+void ChoiceIterator::filterSpaces() {
+ if (LSTM_choices_->empty())
+ return;
+ std::vector<std::pair<const char*, float>>::iterator it;
+ for (it = LSTM_choices_->begin(); it != LSTM_choices_->end();) {
+ if (!strcmp(it->first, " ")) {
+ // erase() returns the iterator to the next element.
+ it = LSTM_choices_->erase(it);
+ } else {
+ ++it;
+ }
+ }
+}
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/mutableiterator.cpp b/tesseract/src/ccmain/mutableiterator.cpp
new file mode 100644
index 00000000..a472df18
--- /dev/null
+++ b/tesseract/src/ccmain/mutableiterator.cpp
@@ -0,0 +1,24 @@
+///////////////////////////////////////////////////////////////////////
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "mutableiterator.h"
+
+namespace tesseract {
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+// The class itself is declared in mutableiterator.h.
+MutableIterator::~MutableIterator() = default;
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/mutableiterator.h b/tesseract/src/ccmain/mutableiterator.h
new file mode 100644
index 00000000..de3a3612
--- /dev/null
+++ b/tesseract/src/ccmain/mutableiterator.h
@@ -0,0 +1,63 @@
+///////////////////////////////////////////////////////////////////////
+// File: mutableiterator.h
+// Description: Iterator for tesseract results providing access to
+// both high-level API and Tesseract internal data structures.
+// Author: David Eger
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H_
+#define TESSERACT_CCMAIN_MUTABLEITERATOR_H_
+
+#include <tesseract/resultiterator.h>
+
+class BLOB_CHOICE_IT;
+
+namespace tesseract {
+
+class Tesseract;
+
+// Class to iterate over tesseract results, providing access to all levels
+// of the page hierarchy, without including any tesseract headers or having
+// to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See tesseract/publictypes.h for the definition of PageIteratorLevel.
+// See also base class PageIterator, which contains the bulk of the interface.
+// ResultIterator adds text-specific methods for access to OCR output.
+// MutableIterator adds access to internal data structures.
+
+class TESS_API MutableIterator : public ResultIterator {
+ public:
+ // See argument descriptions in ResultIterator()
+ MutableIterator(PAGE_RES* page_res, Tesseract* tesseract,
+ int scale, int scaled_yres,
+ int rect_left, int rect_top,
+ int rect_width, int rect_height)
+ : ResultIterator(
+ LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,
+ rect_top, rect_width, rect_height)) {}
+ ~MutableIterator() override;
+
+ // See PageIterator and ResultIterator for most calls.
+
+ // Return access to Tesseract internals.
+ // The returned PAGE_RES_IT is owned by the iterator and shares its
+ // lifetime constraints (see the class comment above).
+ const PAGE_RES_IT *PageResIt() const { return it_; }
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H_
diff --git a/tesseract/src/ccmain/osdetect.cpp b/tesseract/src/ccmain/osdetect.cpp
new file mode 100644
index 00000000..99a5362c
--- /dev/null
+++ b/tesseract/src/ccmain/osdetect.cpp
@@ -0,0 +1,579 @@
+///////////////////////////////////////////////////////////////////////
+// File: osdetect.cpp
+// Description: Orientation and script detection.
+// Author: Samuel Charron
+// Ranjith Unnikrishnan
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/osdetect.h>
+
+#include "blobbox.h"
+#include "blread.h"
+#include "colfind.h"
+#include "fontinfo.h"
+#include "imagefind.h"
+#include "linefind.h"
+#include "oldlist.h"
+#include "qrsequence.h"
+#include "ratngs.h"
+#include "tabvector.h"
+#include "tesseractclass.h"
+#include "textord.h"
+
+#include "strngs.h"
+
+#include <algorithm>
+#include <cmath> // for std::fabs
+#include <memory>
+
+namespace tesseract {
+
+const float kSizeRatioToReject = 2.0;
+const int kMinAcceptableBlobHeight = 10;
+
+const float kScriptAcceptRatio = 1.3;
+
+const float kHanRatioInKorean = 0.7;
+const float kHanRatioInJapanese = 0.3;
+
+const float kNonAmbiguousMargin = 1.0;
+
+// General scripts
+static const char* han_script = "Han";
+static const char* latin_script = "Latin";
+static const char* katakana_script = "Katakana";
+static const char* hiragana_script = "Hiragana";
+static const char* hangul_script = "Hangul";
+
+// Pseudo-scripts Name
+const char* ScriptDetector::korean_script_ = "Korean";
+const char* ScriptDetector::japanese_script_ = "Japanese";
+const char* ScriptDetector::fraktur_script_ = "Fraktur";
+
+// Recomputes best_result from the four orientation scores: picks the
+// highest-scoring orientation id and records the margin over the runner-up
+// as the orientation confidence.
+void OSResults::update_best_orientation() {
+ float first = orientations[0];
+ float second = orientations[1];
+ best_result.orientation_id = 0;
+ if (orientations[0] < orientations[1]) {
+ first = orientations[1];
+ second = orientations[0];
+ best_result.orientation_id = 1;
+ }
+ // Scan the remaining orientations, tracking the top two scores.
+ for (int i = 2; i < 4; ++i) {
+ if (orientations[i] > first) {
+ second = first;
+ first = orientations[i];
+ best_result.orientation_id = i;
+ } else if (orientations[i] > second) {
+ second = orientations[i];
+ }
+ }
+ // Store difference of top two orientation scores.
+ best_result.oconfidence = first - second;
+}
+
+// Forces the best orientation to the given id with zero confidence,
+// bypassing the score-based selection.
+void OSResults::set_best_orientation(int orientation_id) {
+ best_result.orientation_id = orientation_id;
+ best_result.oconfidence = 0;
+}
+
+// Recomputes best_result.script_id/sconfidence for the given orientation
+// by finding the top two script scores; confidence is the normalized lead
+// of the winner over the runner-up (2.0 when the runner-up scored zero).
+void OSResults::update_best_script(int orientation) {
+ // We skip index 0 to ignore the "Common" script.
+ float first = scripts_na[orientation][1];
+ float second = scripts_na[orientation][2];
+ best_result.script_id = 1;
+ if (scripts_na[orientation][1] < scripts_na[orientation][2]) {
+ first = scripts_na[orientation][2];
+ second = scripts_na[orientation][1];
+ best_result.script_id = 2;
+ }
+ for (int i = 3; i < kMaxNumberOfScripts; ++i) {
+ if (scripts_na[orientation][i] > first) {
+ best_result.script_id = i;
+ second = first;
+ first = scripts_na[orientation][i];
+ } else if (scripts_na[orientation][i] > second) {
+ second = scripts_na[orientation][i];
+ }
+ }
+ // Normalize the winner's lead by the acceptance-ratio margin.
+ best_result.sconfidence = (second == 0.0f) ? 2.0f :
+ (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
+}
+
+// Returns the script id with the highest score for the given orientation,
+// ignoring the "Common" and "NULL" pseudo-scripts, or -1 if none qualify.
+int OSResults::get_best_script(int orientation_id) const {
+ int max_id = -1;
+ for (int j = 0; j < kMaxNumberOfScripts; ++j) {
+ const char *script = unicharset->get_script_from_script_id(j);
+ if (strcmp(script, "Common") && strcmp(script, "NULL")) {
+ if (max_id == -1 ||
+ scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id])
+ max_id = j;
+ }
+ }
+ return max_id;
+}
+
+// Print the script scores for all possible orientations.
+void OSResults::print_scores(void) const {
+ for (int i = 0; i < 4; ++i) {
+ // NOTE(review): no "\n" after the header, so the first script line
+ // continues on the same row — confirm this formatting is intended.
+ tprintf("Orientation id #%d", i);
+ print_scores(i);
+ }
+}
+
+// Print the script scores for the given candidate orientation.
+// Scripts with a zero score are skipped.
+void OSResults::print_scores(int orientation_id) const {
+ for (int j = 0; j < kMaxNumberOfScripts; ++j) {
+ if (scripts_na[orientation_id][j]) {
+ tprintf("%12s\t: %f\n", unicharset->get_script_from_script_id(j),
+ scripts_na[orientation_id][j]);
+ }
+ }
+}
+
+// Accumulate scores with given OSResults instance and update the best script.
+// Adds the other instance's orientation and per-script scores element-wise,
+// adopts its unicharset pointer, then refreshes best_result.
+void OSResults::accumulate(const OSResults& osr) {
+ for (int i = 0; i < 4; ++i) {
+ orientations[i] += osr.orientations[i];
+ for (int j = 0; j < kMaxNumberOfScripts; ++j)
+ scripts_na[i][j] += osr.scripts_na[i][j];
+ }
+ unicharset = osr.unicharset;
+ update_best_orientation();
+ update_best_script(best_result.orientation_id);
+}
+
+// Detect and erase horizontal/vertical lines and picture regions from the
+// image, so that non-text blobs are removed from consideration.
+// Modifies tess->pix_binary() in place and fills to_blocks via
+// find_components.
+static void remove_nontext_regions(tesseract::Tesseract *tess,
+ BLOCK_LIST *blocks,
+ TO_BLOCK_LIST *to_blocks) {
+ Pix *pix = tess->pix_binary();
+ ASSERT_HOST(pix != nullptr);
+ int vertical_x = 0;
+ int vertical_y = 1;
+ tesseract::TabVector_LIST v_lines;
+ tesseract::TabVector_LIST h_lines;
+ int resolution;
+ // Fall back to a minimum credible resolution if the image metadata is bad.
+ if (kMinCredibleResolution > pixGetXRes(pix)) {
+ resolution = kMinCredibleResolution;
+ tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n",
+ pixGetXRes(pix), resolution);
+ } else {
+ resolution = pixGetXRes(pix);
+ }
+
+ tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix,
+ &vertical_x, &vertical_y,
+ nullptr, &v_lines, &h_lines);
+ // Subtract detected image regions from the binary page before
+ // connected-component analysis.
+ Pix* im_pix = tesseract::ImageFind::FindImages(pix, nullptr);
+ if (im_pix != nullptr) {
+ pixSubtract(pix, pix, im_pix);
+ pixDestroy(&im_pix);
+ }
+ tess->mutable_textord()->find_components(tess->pix_binary(),
+ blocks, to_blocks);
+}
+
+// Find connected components in the page and process a subset until finished or
+// a stopping criterion is met.
+// Returns the number of blobs used in making the estimate. 0 implies failure.
+int orientation_and_script_detection(const char* filename,
+ OSResults* osr,
+ tesseract::Tesseract* tess) {
+ std::string name = filename; //truncated name
+
+ // Strip the extension so the matching .uzn block file can be looked up.
+ // NOTE(review): writing '\0' into a std::string embeds a NUL rather than
+ // shortening the string's length — confirm read_unlv_file treats name as a
+ // C string.
+ const char* lastdot = strrchr(name.c_str(), '.');
+ if (lastdot != nullptr)
+ name[lastdot-name.c_str()] = '\0';
+
+ ASSERT_HOST(tess->pix_binary() != nullptr);
+ int width = pixGetWidth(tess->pix_binary());
+ int height = pixGetHeight(tess->pix_binary());
+
+ // Use zone (UNLV) blocks when available, else one block covering the page.
+ BLOCK_LIST blocks;
+ if (!read_unlv_file(name, width, height, &blocks))
+ FullPageBlock(width, height, &blocks);
+
+ // Try to remove non-text regions from consideration.
+ TO_BLOCK_LIST land_blocks, port_blocks;
+ remove_nontext_regions(tess, &blocks, &port_blocks);
+
+ if (port_blocks.empty()) {
+ // page segmentation did not succeed, so we need to find_components first.
+ tess->mutable_textord()->find_components(tess->pix_binary(),
+ &blocks, &port_blocks);
+ } else {
+ TBOX page_box(0, 0, width, height);
+ // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
+ tess->mutable_textord()->filter_blobs(page_box.topright(),
+ &port_blocks, true);
+ }
+
+ return os_detect(&port_blocks, osr, tess);
+}
+
+// Filter and sample the blobs.
+// Returns a non-zero number of blobs if the page was successfully processed, or
+// zero if the page had too few characters to be reliable
+// Blobs from non-text blocks, blobs with zero width, blobs with an extreme
+// aspect ratio, and blobs below the minimum height are all excluded before
+// the list is handed to os_detect_blobs.
+int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
+ tesseract::Tesseract* tess) {
+ int blobs_total = 0;
+ TO_BLOCK_IT block_it;
+ block_it.set_to_list(port_blocks);
+
+ BLOBNBOX_CLIST filtered_list;
+ BLOBNBOX_C_IT filtered_it(&filtered_list);
+
+ for (block_it.mark_cycle_pt(); !block_it.cycled_list();
+ block_it.forward ()) {
+ TO_BLOCK* to_block = block_it.data();
+ // Skip blocks explicitly marked as non-text.
+ if (to_block->block->pdblk.poly_block() &&
+ !to_block->block->pdblk.poly_block()->IsText()) continue;
+ BLOBNBOX_IT bbox_it;
+ bbox_it.set_to_list(&to_block->blobs);
+ for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list ();
+ bbox_it.forward ()) {
+ BLOBNBOX* bbox = bbox_it.data();
+ C_BLOB* blob = bbox->cblob();
+ TBOX box = blob->bounding_box();
+ ++blobs_total;
+
+ // Catch illegal value of box width and avoid division by zero.
+ if (box.width() == 0) continue;
+ // TODO: Can height and width be negative? If not, remove fabs.
+ float y_x = std::fabs((box.height() * 1.0f) / box.width());
+ float x_y = 1.0f / y_x;
+ // Select a >= 1.0 ratio
+ float ratio = x_y > y_x ? x_y : y_x;
+ // Blob is ambiguous
+ if (ratio > kSizeRatioToReject) continue;
+ if (box.height() < kMinAcceptableBlobHeight) continue;
+ filtered_it.add_to_end(bbox);
+ }
+ }
+ return os_detect_blobs(nullptr, &filtered_list, osr, tess);
+}
+
+// Detect orientation and script from a list of blobs.
+// Returns a non-zero number of blobs if the list was successfully processed, or
+// zero if the list had too few characters to be reliable.
+// If allowed_scripts is non-null and non-empty, it is a list of scripts that
+// constrains both orientation and script detection to consider only scripts
+// from the list.
+// Blobs are sampled in quasi-random (QR-sequence) order so that evaluation
+// spreads across the page rather than scanning top-to-bottom.
+int os_detect_blobs(const std::vector<int>* allowed_scripts,
+ BLOBNBOX_CLIST* blob_list, OSResults* osr,
+ tesseract::Tesseract* tess) {
+ OSResults osr_;
+ int minCharactersToTry = tess->min_characters_to_try;
+ int maxCharactersToTry = 5 * minCharactersToTry;
+ // Use a local results object if the caller did not supply one.
+ if (osr == nullptr)
+ osr = &osr_;
+
+ osr->unicharset = &tess->unicharset;
+ OrientationDetector o(allowed_scripts, osr);
+ ScriptDetector s(allowed_scripts, osr, tess);
+
+ BLOBNBOX_C_IT filtered_it(blob_list);
+ int real_max = std::min(filtered_it.length(), maxCharactersToTry);
+ // tprintf("Total blobs found = %d\n", blobs_total);
+ // tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
+ // tprintf("Number of blobs to try = %d\n", real_max);
+
+ // If there are too few characters, skip this page entirely.
+ if (real_max < minCharactersToTry / 2) {
+ tprintf("Too few characters. Skipping this page\n");
+ return 0;
+ }
+
+ // Flatten the cyclic list into an array for random-access sampling.
+ auto** blobs = new BLOBNBOX*[filtered_it.length()];
+ int number_of_blobs = 0;
+ for (filtered_it.mark_cycle_pt (); !filtered_it.cycled_list ();
+ filtered_it.forward ()) {
+ blobs[number_of_blobs++] = filtered_it.data();
+ }
+ QRSequenceGenerator sequence(number_of_blobs);
+ int num_blobs_evaluated = 0;
+ for (int i = 0; i < real_max; ++i) {
+ // Stop early once the detector is confident, but only after the
+ // minimum number of characters has been evaluated.
+ if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
+ && i > minCharactersToTry) {
+ break;
+ }
+ ++num_blobs_evaluated;
+ }
+ delete [] blobs;
+
+ // Make sure the best_result is up-to-date
+ int orientation = o.get_orientation();
+ osr->update_best_script(orientation);
+ return num_blobs_evaluated;
+}
+
+// Processes a single blob to estimate script and orientation.
+// Return true if estimate of orientation and script satisfies stopping
+// criteria.
+bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
+ ScriptDetector* s, OSResults* osr,
+ tesseract::Tesseract* tess) {
+ tess->tess_cn_matching.set_value(true); // turn it on
+ tess->tess_bn_matching.set_value(false);
+ C_BLOB* blob = bbox->cblob();
+ TBLOB* tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob);
+ TBOX box = tblob->bounding_box();
+ FCOORD current_rotation(1.0f, 0.0f);
+ FCOORD rotation90(0.0f, 1.0f);
+ BLOB_CHOICE_LIST ratings[4];
+ // Test the 4 orientations
+ for (int i = 0; i < 4; ++i) {
+ // Normalize the blob. Set the origin to the place we want to be the
+ // bottom-middle after rotation.
+ // Scaling is to make the rotated height the x-height.
+ float scaling = static_cast<float>(kBlnXHeight) / box.height();
+ float x_origin = (box.left() + box.right()) / 2.0f;
+ float y_origin = (box.bottom() + box.top()) / 2.0f;
+ if (i == 0 || i == 2) {
+ // Rotation is 0 or 180.
+ y_origin = i == 0 ? box.bottom() : box.top();
+ } else {
+ // Rotation is 90 or 270.
+ scaling = static_cast<float>(kBlnXHeight) / box.width();
+ x_origin = i == 1 ? box.left() : box.right();
+ }
+ std::unique_ptr<TBLOB> rotated_blob(new TBLOB(*tblob));
+ rotated_blob->Normalize(nullptr, &current_rotation, nullptr,
+ x_origin, y_origin, scaling, scaling,
+ 0.0f, static_cast<float>(kBlnBaselineOffset),
+ false, nullptr);
+ tess->AdaptiveClassifier(rotated_blob.get(), ratings + i);
+ current_rotation.rotate(rotation90);
+ }
+ delete tblob;
+
+ bool stop = o->detect_blob(ratings);
+ s->detect_blob(ratings);
+ int orientation = o->get_orientation();
+ stop = s->must_stop(orientation) && stop;
+ return stop;
+}
+
+
+OrientationDetector::OrientationDetector(
+ const std::vector<int>* allowed_scripts, OSResults* osr) {
+ osr_ = osr;
+ allowed_scripts_ = allowed_scripts;
+}
+
+// Score the given blob and return true if it is now sure of the orientation
+// after adding this block.
+bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
+ float blob_o_score[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float total_blob_o_score = 0.0f;
+
+ for (int i = 0; i < 4; ++i) {
+ BLOB_CHOICE_IT choice_it(scores + i);
+ if (!choice_it.empty()) {
+ BLOB_CHOICE* choice = nullptr;
+ if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
+ // Find the top choice in an allowed script.
+ for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
+ choice == nullptr; choice_it.forward()) {
+ int choice_script = choice_it.data()->script_id();
+ int s = 0;
+ for (s = 0; s < allowed_scripts_->size(); ++s) {
+ if ((*allowed_scripts_)[s] == choice_script) {
+ choice = choice_it.data();
+ break;
+ }
+ }
+ }
+ } else {
+ choice = choice_it.data();
+ }
+ if (choice != nullptr) {
+ // The certainty score ranges between [-20,0]. This is converted here to
+ // [0,1], with 1 indicating best match.
+ blob_o_score[i] = 1 + 0.05 * choice->certainty();
+ total_blob_o_score += blob_o_score[i];
+ }
+ }
+ }
+ if (total_blob_o_score == 0.0) return false;
+ // Fill in any blanks with the worst score of the others. This is better than
+ // picking an arbitrary probability for it and way better than -inf.
+ float worst_score = 0.0f;
+ int num_good_scores = 0;
+ for (float f : blob_o_score) {
+ if (f > 0.0f) {
+ ++num_good_scores;
+ if (worst_score == 0.0f || f < worst_score)
+ worst_score = f;
+ }
+ }
+ if (num_good_scores == 1) {
+ // Lower worst if there is only one.
+ worst_score /= 2.0f;
+ }
+ for (float& f : blob_o_score) {
+ if (f == 0.0f) {
+ f = worst_score;
+ total_blob_o_score += worst_score;
+ }
+ }
+ // Normalize the orientation scores for the blob and use them to
+ // update the aggregated orientation score.
+ for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {
+ osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
+ }
+
+ // TODO(ranjith) Add an early exit test, based on min_orientation_margin,
+ // as used in pagesegmain.cpp.
+ return false;
+}
+
// Recomputes the best orientation from the accumulated scores and returns
// its id (0..3; convert to degrees with OrientationIdToValue).
int OrientationDetector::get_orientation() {
  osr_->update_best_orientation();
  return osr_->best_result.orientation_id;
}
+
+
// Builds a script detector that accumulates per-orientation script scores
// into *osr, optionally restricted to allowed_scripts.
// The ids of the scripts needing special handling (the Fraktur workaround
// and the Japanese/Korean pseudo-scripts) are resolved once here via the
// unicharset, so detect_blob can compare plain ints.
ScriptDetector::ScriptDetector(const std::vector<int>* allowed_scripts,
                               OSResults* osr, tesseract::Tesseract* tess) {
  osr_ = osr;
  tess_ = tess;
  allowed_scripts_ = allowed_scripts;
  katakana_id_ = tess_->unicharset.add_script(katakana_script);
  hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
  han_id_ = tess_->unicharset.add_script(han_script);
  hangul_id_ = tess_->unicharset.add_script(hangul_script);
  japanese_id_ = tess_->unicharset.add_script(japanese_script_);
  korean_id_ = tess_->unicharset.add_script(korean_script_);
  latin_id_ = tess_->unicharset.add_script(latin_script);
  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
}
+
+
// Score the given blob and return true if it is now sure of the script after
// adding this blob.
// For each of the 4 orientations, finds the best classifier choice per
// script; if exactly one script lies within kNonAmbiguousMargin of the best
// certainty, that script's score for the orientation is incremented.
void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
  for (int i = 0; i < 4; ++i) {
    // Marks scripts already seen in this orientation, so only the best
    // (first-encountered) choice per script is considered.
    bool done[kMaxNumberOfScripts] = { false };

    BLOB_CHOICE_IT choice_it;
    choice_it.set_to_list(scores + i);

    float prev_score = -1;      // Negated certainty of the first accepted match.
    int script_count = 0;       // Scripts within kNonAmbiguousMargin of it.
    int prev_id = -1;           // Script id of the first accepted match.
    int prev_fontinfo_id = -1;  // Font of the first accepted match.
    const char* prev_unichar = "";
    const char* unichar = "";

    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      int id = choice->script_id();
      if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
        // Check that the choice is in an allowed script.
        int s = 0;
        for (s = 0; s < allowed_scripts_->size(); ++s) {
          if ((*allowed_scripts_)[s] == id) break;
        }
        if (s == allowed_scripts_->size()) continue;  // Not found in list.
      }
      // Script already processed before.
      if (done[id]) continue;
      done[id] = true;

      unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
      // Save data from the first match
      if (prev_score < 0) {
        prev_score = -choice->certainty();
        script_count = 1;
        prev_id = id;
        prev_unichar = unichar;
        prev_fontinfo_id = choice->fontinfo_id();
      } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
        ++script_count;
      }

      // NOTE(review): once the top match is a single character and a digit
      // turns up, scoring of this orientation stops early — presumably
      // because digits are shared across scripts; confirm.
      if (strlen(prev_unichar) == 1)
        if (unichar[0] >= '0' && unichar[0] <= '9')
          break;

      // if script_count is >= 2, character is ambiguous, skip other matches
      // since they are useless.
      if (script_count >= 2)
        break;
    }
    // Character is non ambiguous
    if (script_count == 1) {
      // Update the score of the winning script
      osr_->scripts_na[i][prev_id] += 1.0;

      // Workaround for Fraktur
      if (prev_id == latin_id_) {
        if (prev_fontinfo_id >= 0) {
          const tesseract::FontInfo &fi =
              tess_->get_fontinfo_table().get(prev_fontinfo_id);
          //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
          //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
          //       fi.is_serif(), fi.is_fraktur(),
          //       prev_unichar);
          if (fi.is_fraktur()) {
            // Re-attribute the hit from Latin to the Fraktur pseudo-script.
            osr_->scripts_na[i][prev_id] -= 1.0;
            osr_->scripts_na[i][fraktur_id_] += 1.0;
          }
        }
      }

      // Update Japanese / Korean pseudo-scripts
      if (prev_id == katakana_id_)
        osr_->scripts_na[i][japanese_id_] += 1.0;
      if (prev_id == hiragana_id_)
        osr_->scripts_na[i][japanese_id_] += 1.0;
      if (prev_id == hangul_id_)
        osr_->scripts_na[i][korean_id_] += 1.0;
      if (prev_id == han_id_) {
        // Han characters also count fractionally toward Japanese/Korean.
        osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
        osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
      }
    }
  }  // iterate over each orientation
}
+
// Returns true once the script confidence for the given orientation is high
// enough (> 1) that evaluating further blobs is unnecessary.
bool ScriptDetector::must_stop(int orientation) {
  osr_->update_best_script(orientation);
  return osr_->best_result.sconfidence > 1;
}
+
// Helper method to convert an orientation index to its value in degrees.
// The value represents the amount of clockwise rotation in degrees that must
// be applied for the text to be upright (readable).
int OrientationIdToValue(const int& id) {
  // Ids 0..3 map to clockwise rotations of 0, 270, 180 and 90 degrees;
  // any other id is unknown and yields -1.
  static const int kRotationDegrees[4] = {0, 270, 180, 90};
  return (id >= 0 && id < 4) ? kRotationDegrees[id] : -1;
}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/output.cpp b/tesseract/src/ccmain/output.cpp
new file mode 100644
index 00000000..d043e03a
--- /dev/null
+++ b/tesseract/src/ccmain/output.cpp
@@ -0,0 +1,418 @@
+/******************************************************************
+ * File: output.cpp (Formerly output.c)
+ * Description: Output pass
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "output.h"
+
+#include "control.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "docqual.h"
+#include "reject.h"
+#endif
+
+#include "helpers.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+
+#define CTRL_NEWLINE '\012' //newline
+#define CTRL_HARDLINE '\015' //cr
+
+namespace tesseract {
// Final output pass: walks every recognized word on the page and hands it to
// write_results together with the newline type that should follow it.
// If target_word_box is non-null, only words whose bounding-box center lies
// inside the box are output.
void Tesseract::output_pass(  //Tess output pass //send to api
    PAGE_RES_IT &page_res_it,
    const TBOX *target_word_box) {
  BLOCK_RES *block_of_last_word;
  bool force_eol;               //During output
  BLOCK *nextblock;             //block of next word
  WERD *nextword;               //next word

  page_res_it.restart_page ();
  block_of_last_word = nullptr;
  while (page_res_it.word () != nullptr) {
    check_debug_pt (page_res_it.word (), 120);

    // Skip words outside the target box (when one is given).
    if (target_word_box) {
      TBOX current_word_box = page_res_it.word()->word->bounding_box();
      FCOORD center_pt(
          (current_word_box.right() + current_word_box.left()) / 2,
          (current_word_box.bottom() + current_word_box.top()) / 2);
      if (!target_word_box->contains(center_pt)) {
        page_res_it.forward();
        continue;
      }
    }
    if (tessedit_write_block_separators &&
        block_of_last_word != page_res_it.block ()) {
      block_of_last_word = page_res_it.block ();
    }

    // Force an end-of-line at block boundaries and after the last word.
    force_eol = (tessedit_write_block_separators &&
                 (page_res_it.block () != page_res_it.next_block ())) ||
                (page_res_it.next_word () == nullptr);

    if (page_res_it.next_word () != nullptr)
      nextword = page_res_it.next_word ()->word;
    else
      nextword = nullptr;
    if (page_res_it.next_block () != nullptr)
      nextblock = page_res_it.next_block ()->block;
    else
      nextblock = nullptr;
    //regardless of tilde crunching
    write_results(page_res_it,
                  determine_newline_type(page_res_it.word()->word,
                                         page_res_it.block()->block,
                                         nextword, nextblock), force_eol);
    page_res_it.forward();
  }
}
+
+
/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file - giving the final best choices with NO highlighting
 *   .raw file - giving the tesseract top choice output for each word
 *   .map file - showing how the .txt file has been rejected in the .ep file
 *   epchoice list - a list of one element per word, containing the text for
 *                   the epaper. Reject strings are inserted.
 *   inset list - a list of bounding boxes of reject insets - indexed by the
 *                reject strings in the epchoice text.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT& page_res_it,
                              char newline_type,  // type of newline
                              bool force_eol) {   // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  int i;
  bool need_reject = false;
  UNICHAR_ID space = uchset.unichar_to_id(" ");

  // TILDE-CRUNCHED WORDS: crunched (or empty) words are not output normally;
  // only the stats_ flags are maintained, so a single reject character per
  // crunched run is represented.
  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)))) {
      if (!word->word->flag (W_BOL) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)) {
        // A real (unfuzzy) space ends the current tilde run.
        stats_.last_char_was_tilde = false;
      }
      need_reject = true;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = true;
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      // Reset the run-tracking flags at every emitted line end.
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
       words have been removed */
    word->MergeAdjacentBlobs(0);
  }
  if (newline_type ||
      (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length () > 0) {
      // NOTE(review): a trailing space sets the tilde flag — presumably
      // spaces count as tilde-like for the adjacency rule above; confirm.
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    }
    else if (word->word->space () > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt (word, 120);
  if (tessedit_rejection_debug) {
    tprintf ("Dict word: \"%s\": %d\n",
             word->best_choice->debug_string().c_str(),
             dict_word(*(word->best_choice)));
  }
  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}
+
+/**********************************************************************
+ * determine_newline_type
+ *
+ * Find whether we have a wrapping or hard newline.
+ * Return false if not at end of line.
+ **********************************************************************/
+
+char determine_newline_type( //test line ends
+ WERD *word, //word to do
+ BLOCK *block, //current block
+ WERD *next_word, //next word
+ BLOCK *next_block //block of next word
+ ) {
+ int16_t end_gap; //to right edge
+ int16_t width; //of next word
+ TBOX word_box; //bounding
+ TBOX next_box; //next word
+ TBOX block_box; //block bounding
+
+ if (!word->flag (W_EOL))
+ return false; //not end of line
+ if (next_word == nullptr || next_block == nullptr || block != next_block)
+ return CTRL_NEWLINE;
+ if (next_word->space () > 0)
+ return CTRL_HARDLINE; //it is tabbed
+ word_box = word->bounding_box ();
+ next_box = next_word->bounding_box ();
+ block_box = block->pdblk.bounding_box ();
+ //gap to eol
+ end_gap = block_box.right () - word_box.right ();
+ end_gap -= static_cast<int32_t>(block->space ());
+ width = next_box.right () - next_box.left ();
+ // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
+ // block_box.right(),word_box.right(),end_gap,
+ // next_box.right(),next_box.left(),width,
+ // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
+ return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
+}
+
+/*************************************************************************
+ * get_rep_char()
+ * Return the first accepted character from the repetition string. This is the
+ * character which is repeated - as determined earlier by fix_rep_char()
+ *************************************************************************/
+UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
+ int i;
+ for (i = 0; ((i < word->reject_map.length()) &&
+ (word->reject_map[i].rejected())); ++i);
+
+ if (i < word->reject_map.length()) {
+ return word->best_choice->unichar_id(i);
+ } else {
+ return word->uch_set->unichar_to_id(unrecognised_char.c_str());
+ }
+}
+
/*************************************************************************
 * SUSPECT LEVELS
 *
 * 0 - don't reject ANYTHING
 * 1,2 - partial rejection
 * 3 - BEST
 *
 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
 * tessedit_minimal_rejection.
 *************************************************************************/
void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
  int len = word_res->reject_map.length();
  const WERD_CHOICE &word = *(word_res->best_choice);
  const UNICHARSET &uchset = *word.unicharset();
  int i;
  float rating_per_ch;

  if (suspect_level == 0) {
    // Level 0: accept every previously rejected character.
    for (i = 0; i < len; i++) {
      if (word_res->reject_map[i].rejected())
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
    return;
  }

  if (suspect_level >= 3)
    return;  //Use defaults

  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

  if (safe_dict_word(word_res) &&
      (count_alphas(word) > suspect_short_words)) {
    /* Unreject alphas in dictionary words */
    for (i = 0; i < len; ++i) {
      if (word_res->reject_map[i].rejected() &&
          uchset.get_isalpha(word.unichar_id(i)))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  // NOTE(review): len == 0 would divide by zero here (float -> +inf);
  // presumably callers never pass an empty reject map — confirm.
  rating_per_ch = word.rating() / word_res->reject_map.length();

  if (rating_per_ch >= suspect_rating_per_ch)
    return;  // Don't touch bad ratings

  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
    for (i = 0; i < len; ++i) {
      if (word_res->reject_map[i].rejected() &&
          (!uchset.eq(word.unichar_id(i), " ")))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  // Document-, block- and row-level rejections are unrejected at levels 1-2.
  for (i = 0; i < len; i++) {
    if (word_res->reject_map[i].rejected()) {
      if (word_res->reject_map[i].flag(R_DOC_REJ))
        word_res->reject_map[i].setrej_minimal_rej_accept();
      if (word_res->reject_map[i].flag(R_BLOCK_REJ))
        word_res->reject_map[i].setrej_minimal_rej_accept();
      if (word_res->reject_map[i].flag(R_ROW_REJ))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  if (suspect_level == 2)
    return;

  /* Everything below applies to level 1 only. */

  if (!suspect_constrain_1Il ||
      (word_res->reject_map.length() <= suspect_short_words)) {
    // Unreject 1/I/l-confusion and post-NN rejections on short words (or
    // everywhere when the constraint is disabled).
    for (i = 0; i < len; i++) {
      if (word_res->reject_map[i].rejected()) {
        if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
             word_res->reject_map[i].flag(R_POSTNN_1IL)))
          word_res->reject_map[i].setrej_minimal_rej_accept();

        if (!suspect_constrain_1Il &&
            word_res->reject_map[i].flag(R_MM_REJECT))
          word_res->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }

  if (acceptable_word_string(*word_res->uch_set,
                             word.unichar_string().c_str(),
                             word.unichar_lengths().c_str()) !=
          AC_UNACCEPTABLE ||
      acceptable_number_string(word.unichar_string().c_str(),
                               word.unichar_lengths().c_str())) {
    if (word_res->reject_map.length() > suspect_short_words) {
      // Long acceptable word/number: unreject everything except permanent
      // rejections (other than the specific flags listed).
      for (i = 0; i < len; i++) {
        if (word_res->reject_map[i].rejected() &&
            (!word_res->reject_map[i].perm_rejected() ||
             word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
             word_res->reject_map[i].flag (R_POSTNN_1IL) ||
             word_res->reject_map[i].flag (R_MM_REJECT))) {
          word_res->reject_map[i].setrej_minimal_rej_accept();
        }
      }
    }
  }
}
+
+int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
+ int count = 0;
+ for (int i = 0; i < word.length(); ++i) {
+ if (word.unicharset()->get_isalpha(word.unichar_id(i)))
+ count++;
+ }
+ return count;
+}
+
+
+int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
+ int count = 0;
+ for (int i = 0; i < word.length(); ++i) {
+ if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
+ word.unicharset()->get_isdigit(word.unichar_id(i)))
+ count++;
+ }
+ return count;
+}
+
+
// Returns true if the text looks like an acceptable "number": digits possibly
// separated by single '.', ',' or '-' characters, optionally prefixed by a
// single '(', '$', '.', '+' or '-', and optionally terminated by '%', ')' or
// "%)" directly after a digit.
// s is the text and lengths gives the per-character stride used to advance
// through s (as produced by WERD_CHOICE::unichar_lengths() at the call site).
bool Tesseract::acceptable_number_string(const char* s,
                                         const char* lengths) {
  bool prev_digit = false;

  // Skip an optional opening parenthesis.
  if (*lengths == 1 && *s == '(')
    s++;

  // Skip an optional sign/currency/decimal prefix.
  if (*lengths == 1 &&
      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
    s++;

  // Walk the remaining characters; s advances by each character's length.
  for (; *s != '\0'; s += *(lengths++)) {
    if (unicharset.get_isdigit(s, *lengths))
      prev_digit = true;
    // A single separator is allowed only directly after a digit.
    else if (prev_digit &&
             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
      prev_digit = false;
    // '%' or ')' is accepted only as the very last character.
    else if (prev_digit && *lengths == 1 &&
             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
      return true;
    // "%)" is accepted as the final two characters.
    else if (prev_digit &&
             *lengths == 1 && (*s == '%') &&
             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
             (*(s + *lengths + *(lengths + 1)) == '\0'))
      return true;
    else
      return false;
  }
  return true;
}
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/output.h b/tesseract/src/ccmain/output.h
new file mode 100644
index 00000000..00f59466
--- /dev/null
+++ b/tesseract/src/ccmain/output.h
@@ -0,0 +1,37 @@
+/******************************************************************
+ * File: output.h (Formerly output.h)
+ * Description: Output pass
+ * Author: Phil Cheatle
+ * Created: Thu Aug 4 10:56:08 BST 1994
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef OUTPUT_H
+#define OUTPUT_H
+
+namespace tesseract {
+
+class BLOCK;
+class WERD;
+
+/** test line ends */
+char determine_newline_type(WERD *word, ///< word to do
+ BLOCK *block, ///< current block
+ WERD *next_word, ///< next word
+ BLOCK *next_block ///< block of next word
+ );
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/ccmain/pageiterator.cpp b/tesseract/src/ccmain/pageiterator.cpp
new file mode 100644
index 00000000..75dd9b40
--- /dev/null
+++ b/tesseract/src/ccmain/pageiterator.cpp
@@ -0,0 +1,635 @@
+///////////////////////////////////////////////////////////////////////
+// File: pageiterator.cpp
+// Description: Iterator for tesseract page structure that avoids using
+// tesseract internal data structures.
+// Author: Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/pageiterator.h>
+#include "allheaders.h"
+#include "helpers.h"
+#include "pageres.h"
+#include "tesseractclass.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
// Constructs an iterator over page_res, recording the scale and rectangle
// parameters needed to map tesseract-internal coordinates back to the
// original image. Takes no ownership of page_res or tesseract.
PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
                           int scaled_yres, int rect_left, int rect_top,
                           int rect_width, int rect_height)
    : page_res_(page_res),
      tesseract_(tesseract),
      word_(nullptr),
      word_length_(0),
      blob_index_(0),
      cblob_it_(nullptr),
      include_upper_dots_(false),
      include_lower_dots_(false),
      scale_(scale),
      scaled_yres_(scaled_yres),
      rect_left_(rect_left),
      rect_top_(rect_top),
      rect_width_(rect_width),
      rect_height_(rect_height) {
  it_ = new PAGE_RES_IT(page_res);
  // Explicit qualification: avoid dispatching to a derived-class override
  // while still inside the base-class constructor.
  PageIterator::Begin();
}
+
// Releases the owned low-level iterators (deleting nullptr is a no-op, so
// cblob_it_ may legitimately still be unset).
PageIterator::~PageIterator() {
  delete it_;
  delete cblob_it_;
}
+
/**
 * PageIterators may be copied! This makes it possible to iterate over
 * all the objects at a lower level, while maintaining an iterator to
 * objects at a higher level.
 */
PageIterator::PageIterator(const PageIterator& src)
    : page_res_(src.page_res_),
      tesseract_(src.tesseract_),
      word_(nullptr),
      word_length_(src.word_length_),
      blob_index_(src.blob_index_),
      cblob_it_(nullptr),
      include_upper_dots_(src.include_upper_dots_),
      include_lower_dots_(src.include_lower_dots_),
      scale_(src.scale_),
      scaled_yres_(src.scaled_yres_),
      rect_left_(src.rect_left_),
      rect_top_(src.rect_top_),
      rect_width_(src.rect_width_),
      rect_height_(src.rect_height_) {
  it_ = new PAGE_RES_IT(*src.it_);
  // word_ and cblob_it_ are deliberately not copied: BeginWord re-derives
  // the word-level state for the copied position.
  BeginWord(src.blob_index_);
}
+
+const PageIterator& PageIterator::operator=(const PageIterator& src) {
+ page_res_ = src.page_res_;
+ tesseract_ = src.tesseract_;
+ include_upper_dots_ = src.include_upper_dots_;
+ include_lower_dots_ = src.include_lower_dots_;
+ scale_ = src.scale_;
+ scaled_yres_ = src.scaled_yres_;
+ rect_left_ = src.rect_left_;
+ rect_top_ = src.rect_top_;
+ rect_width_ = src.rect_width_;
+ rect_height_ = src.rect_height_;
+ delete it_;
+ it_ = new PAGE_RES_IT(*src.it_);
+ BeginWord(src.blob_index_);
+ return *this;
+}
+
// Returns true if this iterator refers to the same word position as *other.
// Two null iterators compare equal; otherwise both must be non-null and the
// underlying PAGE_RES_ITs must compare equal.
bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT* other) const {
  return (it_ == nullptr && it_ == other) ||
         ((other != nullptr) && (it_ != nullptr) && (*it_ == *other));
}
+
+// ============= Moving around within the page ============.
+
/** Resets the iterator to point to the start of the page. */
void PageIterator::Begin() {
  // "with_empties" so that non-text (image) blocks are also visited.
  it_->restart_page_with_empties();
  BeginWord(0);
}
+
// Moves the iterator back to the first word of the paragraph containing the
// current position, by scanning paragraph starts from the top of the page.
void PageIterator::RestartParagraph() {
  if (it_->block() == nullptr) return;  // At end of the document.
  PAGE_RES_IT para(page_res_);
  PAGE_RES_IT next_para(para);
  next_para.forward_paragraph();
  // Advance para while the following paragraph still starts at or before the
  // current position.
  while (next_para.cmp(*it_) <= 0) {
    para = next_para;
    next_para.forward_paragraph();
  }
  *it_ = para;
  BeginWord(0);
}
+
// Returns true if the current position lies on the first text line of its
// paragraph, determined by rewinding a copy to the paragraph start and
// comparing rows.
bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
  PageIterator p_start(*this);
  p_start.RestartParagraph();
  return p_start.it_->row() == it_->row();
}
+
// Moves the iterator back to the first word of the current text line.
void PageIterator::RestartRow() {
  it_->restart_row();
  BeginWord(0);
}
+
/**
 * Moves to the start of the next object at the given level in the
 * page hierarchy, and returns false if the end of the page was reached.
 * NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
 * non-text block at least once.
 * Think of non text blocks as containing a single para, with at least one
 * line, with a single imaginary word, containing a single symbol.
 * The bounding boxes mark out any polygonal nature of the block, and
 * PTIsTextType(BLockType()) is false for non-text blocks.
 * Calls to Next with different levels may be freely intermixed.
 * This function iterates words in right-to-left scripts correctly, if
 * the appropriate language has been loaded into Tesseract.
 */
bool PageIterator::Next(PageIteratorLevel level) {
  if (it_->block() == nullptr) return false;  // Already at the end!
  if (it_->word() == nullptr)
    level = RIL_BLOCK;  // Non-text block: only whole-block steps make sense.

  switch (level) {
    case RIL_BLOCK:
      it_->forward_block();
      break;
    case RIL_PARA:
      it_->forward_paragraph();
      break;
    case RIL_TEXTLINE:
      // Step word by word until the row changes.
      for (it_->forward_with_empties(); it_->row() == it_->prev_row();
           it_->forward_with_empties());
      break;
    case RIL_WORD:
      it_->forward_with_empties();
      break;
    case RIL_SYMBOL:
      if (cblob_it_ != nullptr)
        cblob_it_->forward();
      ++blob_index_;
      if (blob_index_ >= word_length_)
        it_->forward_with_empties();  // Ran off the end of the word.
      else
        return true;  // Still in the same word: no BeginWord needed.
      break;
  }
  BeginWord(0);
  return it_->block() != nullptr;
}
+
/**
 * Returns true if the iterator is at the start of an object at the given
 * level. Possible uses include determining if a call to Next(RIL_WORD)
 * moved to the start of a RIL_PARA.
 * A position is at the beginning of a level iff it sits on the first symbol
 * (blob_index_ == 0) of a word whose parent at that level differs from the
 * previous word's parent.
 */
bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->block() == nullptr) return false;  // Already at the end!
  if (it_->word() == nullptr) return true;    // In an image block.
  switch (level) {
    case RIL_BLOCK:
      return blob_index_ == 0 && it_->block() != it_->prev_block();
    case RIL_PARA:
      return blob_index_ == 0 &&
             (it_->block() != it_->prev_block() ||
              it_->row()->row->para() != it_->prev_row()->row->para());
    case RIL_TEXTLINE:
      return blob_index_ == 0 && it_->row() != it_->prev_row();
    case RIL_WORD:
      return blob_index_ == 0;
    case RIL_SYMBOL:
      return true;
  }
  return false;
}
+
/**
 * Returns whether the iterator is positioned at the last element in a
 * given level. (e.g. the last word in a line, the last line in a block)
 */
bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
                                    PageIteratorLevel element) const {
  if (Empty(element)) return true;  // Already at the end!
  // The result is true if we step forward by element and find we are
  // at the the end of the page or at beginning of *all* levels in:
  // [level, element).
  // When there is more than one level difference between element and level,
  // we could for instance move forward one symbol and still be at the first
  // word on a line, so we also have to be at the first symbol in a word.
  PageIterator next(*this);
  next.Next(element);
  if (next.Empty(element)) return true;  // Reached the end of the page.
  // Walk down from element towards level, requiring "at beginning" at every
  // intermediate granularity.
  while (element > level) {
    element = static_cast<PageIteratorLevel>(element - 1);
    if (!next.IsAtBeginningOf(element))
      return false;
  }
  return true;
}
+
+/**
+ * Returns whether this iterator is positioned
+ * before other: -1
+ * equal to other: 0
+ * after other: 1
+ */
+int PageIterator::Cmp(const PageIterator &other) const {
+ int word_cmp = it_->cmp(*other.it_);
+ if (word_cmp != 0)
+ return word_cmp;
+ if (blob_index_ < other.blob_index_)
+ return -1;
+ if (blob_index_ == other.blob_index_)
+ return 0;
+ return 1;
+}
+
+// ============= Accessing data ==============.
+// Coordinate system:
+// Integer coordinates are at the cracks between the pixels.
+// The top-left corner of the top-left pixel in the image is at (0,0).
+// The bottom-right corner of the bottom-right pixel in the image is at
+// (width, height).
+// Every bounding box goes from the top-left of the top-left contained
+// pixel to the bottom-right of the bottom-right contained pixel, so
+// the bounding box of the single top-left pixel in the image is:
+// (0,0)->(1,1).
+// If an image rectangle has been set in the API, then returned coordinates
+// relate to the original (full) image, rather than the rectangle.
+
+/**
+ * Returns the bounding rectangle of the current object at the given level in
+ * the coordinates of the working image that is pix_binary().
+ * See comment on coordinate system above.
+ * Returns false if there is no such object at the current position.
+ */
+bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
+                                       int* left, int* top,
+                                       int* right, int* bottom) const {
+  if (Empty(level))
+    return false;
+  TBOX box;
+  PARA *para = nullptr;
+  switch (level) {
+    case RIL_BLOCK:
+      box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
+                                                         include_lower_dots_);
+      break;
+    case RIL_PARA:
+      // A paragraph has no box of its own: remember which PARA we are in and
+      // start from the current line's box; the union over all of the
+      // paragraph's lines is accumulated below.
+      para = it_->row()->row->para();
+      // Fall through.
+    case RIL_TEXTLINE:
+      box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
+                                                     include_lower_dots_);
+      break;
+    case RIL_WORD:
+      box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
+                                                       include_lower_dots_);
+      break;
+    case RIL_SYMBOL:
+      // After recognition symbols come from box_word; before recognition a
+      // "symbol" is a cblob (see BeginWord).
+      if (cblob_it_ == nullptr)
+        box = it_->word()->box_word->BlobBox(blob_index_);
+      else
+        box = cblob_it_->data()->bounding_box();
+  }
+  if (level == RIL_PARA) {
+    // Scan the whole page for other textlines in the same block that belong
+    // to the same PARA, and union their boxes in.
+    PageIterator other = *this;
+    other.Begin();
+    do {
+      if (other.it_->block() &&
+          other.it_->block()->block == it_->block()->block &&
+          other.it_->row() && other.it_->row()->row &&
+          other.it_->row()->row->para() == para) {
+        box = box.bounding_union(other.it_->row()->row->bounding_box());
+      }
+    } while (other.Next(RIL_TEXTLINE));
+  }
+  // box_word symbol boxes skip the rotation here (presumably already in
+  // image coordinates -- TODO confirm); everything else still needs the
+  // block's re_rotation applied.
+  if (level != RIL_SYMBOL || cblob_it_ != nullptr)
+    box.rotate(it_->block()->block->re_rotation());
+  // Now we have a box in tesseract coordinates relative to the image rectangle,
+  // we have to convert the coords to a top-down system.
+  const int pix_height = pixGetHeight(tesseract_->pix_binary());
+  const int pix_width = pixGetWidth(tesseract_->pix_binary());
+  *left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
+  *top = ClipToRange(pix_height - box.top(), 0, pix_height);
+  *right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
+  *bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
+  return true;
+}
+
+/**
+ * Returns the bounding rectangle of the current object at the given level in
+ * coordinates of the original image.
+ * See comment on coordinate system above.
+ * Returns false if there is no such object at the current position.
+ */
+bool PageIterator::BoundingBox(PageIteratorLevel level,
+                               int* left, int* top,
+                               int* right, int* bottom) const {
+  // Delegate to the padded overload with zero padding.
+  const int kNoPadding = 0;
+  return BoundingBox(level, kNoPadding, left, top, right, bottom);
+}
+
+// As BoundingBox above, but expands the result by the given padding on all
+// sides, clipped to the image rectangle, in original-image coordinates.
+bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,
+                               int* left, int* top,
+                               int* right, int* bottom) const {
+  if (!BoundingBoxInternal(level, left, top, right, bottom))
+    return false;
+  // Convert to the coordinate system of the original image.
+  // left/top round down; right/bottom round up ((x + scale_ - 1) / scale_)
+  // so the downscaled box still covers the whole object.
+  *left = ClipToRange(*left / scale_ + rect_left_ - padding,
+                      rect_left_, rect_left_ + rect_width_);
+  *top = ClipToRange(*top / scale_ + rect_top_ - padding,
+                     rect_top_, rect_top_ + rect_height_);
+  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,
+                       *left, rect_left_ + rect_width_);
+  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,
+                        *top, rect_top_ + rect_height_);
+  return true;
+}
+
+/** Return that there is no such object at a given level. */
+bool PageIterator::Empty(PageIteratorLevel level) const {
+  if (it_->block() == nullptr) return true;  // Past the last block.
+  // An image block has no words, so every sub-block level is empty there.
+  if (level != RIL_BLOCK && it_->word() == nullptr) return true;
+  // A zero-length word, or a blob index past the end, has no symbol.
+  return level == RIL_SYMBOL && blob_index_ >= word_length_;
+}
+
+/** Returns the type of the current block.
+ * See tesseract/publictypes.h for PolyBlockType. */
+PolyBlockType PageIterator::BlockType() const {
+  if (it_->block() == nullptr || it_->block()->block == nullptr) {
+    return PT_UNKNOWN;  // Iterator has run off the end of the page.
+  }
+  auto* poly = it_->block()->block->pdblk.poly_block();
+  // Without layout analysis there is no polygon; treat as flowing text.
+  return poly == nullptr ? PT_FLOWING_TEXT : poly->isA();
+}
+
+/** Returns the polygon outline of the current block. The returned Pta must
+ * be ptaDestroy-ed after use. Returns nullptr at the end of the page or when
+ * no layout analysis was performed (no polygon available). */
+Pta* PageIterator::BlockPolygon() const {
+  if (it_->block() == nullptr || it_->block()->block == nullptr)
+    return nullptr;  // Already at the end!
+  if (it_->block()->block->pdblk.poly_block() == nullptr)
+    return nullptr;  // No layout analysis used - no polygon.
+  // Copy polygon, so we can unrotate it to image coordinates.
+  POLY_BLOCK* internal_poly = it_->block()->block->pdblk.poly_block();
+  ICOORDELT_LIST vertices;
+  vertices.deep_copy(internal_poly->points(), ICOORDELT::deep_copy);
+  POLY_BLOCK poly(&vertices, internal_poly->isA());
+  poly.rotate(it_->block()->block->re_rotation());
+  ICOORDELT_IT it(poly.points());
+  Pta* pta = ptaCreate(it.length());
+  // (Removed dead local num_pts: it was incremented every iteration but
+  // never read.)
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    ICOORD* pt = it.data();
+    // Convert to top-down coords within the input image.
+    int x = static_cast<float>(pt->x()) / scale_ + rect_left_;
+    int y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) / scale_;
+    x = ClipToRange(x, rect_left_, rect_left_ + rect_width_);
+    y = ClipToRange(y, rect_top_, rect_top_ + rect_height_);
+    ptaAddPt(pta, x, y);
+  }
+  return pta;
+}
+
+/**
+ * Returns a binary image of the current object at the given level.
+ * The position and size match the return from BoundingBoxInternal, and so this
+ * could be upscaled with respect to the original input image.
+ * Use pixDestroy to delete the image after use.
+ * The following methods are used to generate the images:
+ * RIL_BLOCK: mask the page image with the block polygon.
+ * RIL_TEXTLINE: Clip the rectangle of the line box from the page image.
+ * TODO(rays) fix this to generate and use a line polygon.
+ * RIL_WORD: Clip the rectangle of the word box from the page image.
+ * RIL_SYMBOL: Render the symbol outline to an image for cblobs (prior
+ * to recognition) or the bounding box otherwise.
+ * A reconstruction of the original image (using xor to check for double
+ * representation) should be reasonably accurate,
+ * apart from removed noise, at the block level. Below the block level, the
+ * reconstruction will be missing images and line separators.
+ * At the symbol level, kerned characters will invade the bounding box
+ * if rendered after recognition, making an xor reconstruction inaccurate, but
+ * an or-reconstruction better. Before recognition, symbol-level
+ * reconstruction should be good, even with xor, since the images come from
+ * the connected components.
+ */
+Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
+  int left, top, right, bottom;
+  if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
+    return nullptr;
+  // Pre-recognition symbols with a real outline are rendered directly from
+  // the cblob instead of being clipped from the page.
+  if (level == RIL_SYMBOL && cblob_it_ != nullptr &&
+      cblob_it_->data()->area() != 0)
+    return cblob_it_->data()->render();
+  Box* box = boxCreate(left, top, right - left, bottom - top);
+  Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, nullptr);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    // Registration offset of the mask relative to the clipped rectangle:
+    // mask_box.top() is bottom-up, so flip via the image height first.
+    int mask_x = left - mask_box.left();
+    int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
+    // AND the mask and pix, putting the result in pix.
+    pixRasterop(pix, std::max(0, -mask_x), std::max(0, -mask_y), pixGetWidth(pix),
+                pixGetHeight(pix), PIX_SRC & PIX_DST, mask, std::max(0, mask_x),
+                std::max(0, mask_y));
+    pixDestroy(&mask);
+  }
+  return pix;
+}
+
+/**
+ * Returns an image of the current object at the given level in greyscale
+ * if available in the input. To guarantee a binary image use BinaryImage.
+ * NOTE that in order to give the best possible image, the bounds are
+ * expanded slightly over the binary connected component, by the supplied
+ * padding, so the top-left position of the returned image is returned
+ * in (left,top). These will most likely not match the coordinates
+ * returned by BoundingBox.
+ * If you do not supply an original image, you will get a binary one.
+ * Use pixDestroy to delete the image after use.
+ */
+Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
+                            Pix* original_img,
+                            int* left, int* top) const {
+  int right, bottom;
+  if (!BoundingBox(level, left, top, &right, &bottom))
+    return nullptr;
+  if (original_img == nullptr)
+    return GetBinaryImage(level);
+
+  // Expand the box.
+  *left = std::max(*left - padding, 0);
+  *top = std::max(*top - padding, 0);
+  right = std::min(right + padding, rect_width_);
+  bottom = std::min(bottom + padding, rect_height_);
+  Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
+  Pix* grey_pix = pixClipRectangle(original_img, box, nullptr);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    // Copy the mask registered correctly into an image the size of grey_pix.
+    int mask_x = *left - mask_box.left();
+    int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
+    int width = pixGetWidth(grey_pix);
+    int height = pixGetHeight(grey_pix);
+    Pix* resized_mask = pixCreate(width, height, 1);
+    pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width, height,
+                PIX_SRC, mask, std::max(0, mask_x), std::max(0, mask_y));
+    pixDestroy(&mask);
+    // Grow the mask by the padding so the padded border survives, then
+    // invert it and white out everything outside the block region.
+    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
+                   2 * padding + 1);
+    pixInvert(resized_mask, resized_mask);
+    pixSetMasked(grey_pix, resized_mask, UINT32_MAX);
+    pixDestroy(&resized_mask);
+  }
+  return grey_pix;
+}
+
+/**
+ * Returns the baseline of the current object at the given level.
+ * The baseline is the line that passes through (x1, y1) and (x2, y2).
+ * WARNING: with vertical text, baselines may be vertical!
+ */
+bool PageIterator::Baseline(PageIteratorLevel level,
+                            int* x1, int* y1, int* x2, int* y2) const {
+  if (it_->word() == nullptr) return false;  // Already at the end!
+  ROW* row = it_->row()->row;
+  WERD* word = it_->word()->word;
+  // Span the word box for word/symbol levels, otherwise the whole row box.
+  TBOX box = (level == RIL_WORD || level == RIL_SYMBOL)
+                 ? word->bounding_box()
+                 : row->bounding_box();
+  int left = box.left();
+  // base_line(x) gives the baseline y at x; +0.5 rounds to nearest int.
+  ICOORD startpt(left, static_cast<int16_t>(row->base_line(left) + 0.5));
+  int right = box.right();
+  ICOORD endpt(right, static_cast<int16_t>(row->base_line(right) + 0.5));
+  // Rotate to image coordinates and convert to global image coords.
+  startpt.rotate(it_->block()->block->re_rotation());
+  endpt.rotate(it_->block()->block->re_rotation());
+  // Scale into the original image and flip y to the top-down system.
+  *x1 = startpt.x() / scale_ + rect_left_;
+  *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
+  *x2 = endpt.x() / scale_ + rect_left_;
+  *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
+  return true;
+}
+
+// Reports the orientation, writing direction, textline order and deskew
+// angle of the current block.
+void PageIterator::Orientation(tesseract::Orientation *orientation,
+                               tesseract::WritingDirection *writing_direction,
+                               tesseract::TextlineOrder *textline_order,
+                               float *deskew_angle) const {
+  BLOCK* block = it_->block()->block;
+
+  // Orientation: track where the "up" unit vector ends up after undoing the
+  // classify rotation and applying the block's re-rotation.
+  FCOORD up_in_image(0.0, 1.0);
+  up_in_image.unrotate(block->classify_rotation());
+  up_in_image.rotate(block->re_rotation());
+
+  if (up_in_image.x() == 0.0F) {
+    *orientation = up_in_image.y() > 0.0F ? ORIENTATION_PAGE_UP
+                                          : ORIENTATION_PAGE_DOWN;
+  } else if (up_in_image.x() > 0.0F) {
+    *orientation = ORIENTATION_PAGE_RIGHT;
+  } else {
+    *orientation = ORIENTATION_PAGE_LEFT;
+  }
+
+  // Writing direction: vertical script implies top-to-bottom; otherwise the
+  // block's right_to_left flag decides.
+  const bool is_vertical_text = (block->classify_rotation().x() == 0.0);
+  if (is_vertical_text) {
+    *writing_direction = WRITING_DIRECTION_TOP_TO_BOTTOM;
+  } else if (block->right_to_left()) {
+    *writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
+  } else {
+    *writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
+  }
+
+  // Textline Order.
+  const bool is_mongolian = false;  // TODO(eger): fix me
+  if (!is_vertical_text) {
+    *textline_order = TEXTLINE_ORDER_TOP_TO_BOTTOM;
+  } else if (is_mongolian) {
+    *textline_order = TEXTLINE_ORDER_LEFT_TO_RIGHT;
+  } else {
+    *textline_order = TEXTLINE_ORDER_RIGHT_TO_LEFT;
+  }
+
+  // Deskew angle: negated angle of the block's true-horizontal vector.
+  *deskew_angle = -block->skew().angle();
+}
+
+// Fills in metadata about the current row's paragraph.
+// *just is set to JUSTIFICATION_UNKNOWN when no paragraph model is
+// available, in which case the other outputs are left untouched.
+void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
+                                 bool *is_list_item,
+                                 bool *is_crown,
+                                 int *first_line_indent) const {
+  *just = tesseract::JUSTIFICATION_UNKNOWN;
+  if (!it_->row() || !it_->row()->row)
+    return;
+  PARA *para = it_->row()->row->para();
+  if (!para || !para->model)
+    return;
+  *is_list_item = para->is_list_item;
+  *is_crown = para->is_very_first_or_continuation;
+  *first_line_indent =
+      para->model->first_indent() - para->model->body_indent();
+  *just = para->model->justification();
+}
+
+/**
+ * Sets up the internal data for iterating the blobs of a new word, then
+ * moves the iterator to the given offset.
+ */
+void PageIterator::BeginWord(int offset) {
+  WERD_RES* word_res = it_->word();
+  if (word_res == nullptr) {
+    // This is a non-text block, so there is no word.
+    word_length_ = 0;
+    blob_index_ = 0;
+    word_ = nullptr;
+    return;
+  }
+  if (word_res->best_choice != nullptr) {
+    // Recognition has been done, so we are using the box_word, which
+    // is already baseline denormalized.
+    word_length_ = word_res->best_choice->length();
+    if (word_res->box_word != nullptr) {
+      if (word_res->box_word->length() != word_length_) {
+        // Print diagnostics before the assert below aborts on the
+        // box_word/best_choice length mismatch.
+        tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
+                word_length_, word_res->best_choice->unichar_string().c_str(),
+                word_res->box_word->length());
+        word_res->box_word->bounding_box().print();
+      }
+      ASSERT_HOST(word_res->box_word->length() == word_length_);
+    }
+    word_ = nullptr;
+    // We will be iterating the box_word.
+    delete cblob_it_;
+    cblob_it_ = nullptr;
+  } else {
+    // No recognition yet, so a "symbol" is a cblob.
+    word_ = word_res->word;
+    ASSERT_HOST(word_->cblob_list() != nullptr);
+    word_length_ = word_->cblob_list()->length();
+    if (cblob_it_ == nullptr) cblob_it_ = new C_BLOB_IT;
+    cblob_it_->set_to_list(word_->cblob_list());
+  }
+  // Advance to the requested blob, keeping the cblob iterator (when in the
+  // pre-recognition mode) in step with blob_index_.
+  for (blob_index_ = 0; blob_index_ < offset; ++blob_index_) {
+    if (cblob_it_ != nullptr)
+      cblob_it_->forward();
+  }
+}
+
+// Attaches the given blamer bundle to the current word.
+// Returns false when the iterator is not positioned on a word.
+bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
+  WERD_RES* word_res = it_->word();
+  if (word_res == nullptr)
+    return false;
+  word_res->blamer_bundle = blamer_bundle;
+  return true;
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/pagesegmain.cpp b/tesseract/src/ccmain/pagesegmain.cpp
new file mode 100644
index 00000000..d3a32fab
--- /dev/null
+++ b/tesseract/src/ccmain/pagesegmain.cpp
@@ -0,0 +1,420 @@
+/**********************************************************************
+ * File: pagesegmain.cpp
+ * Description: Top-level page segmenter for Tesseract.
+ * Author: Ray Smith
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifdef _WIN32
+#ifndef unlink
+#include <io.h>
+#endif
+#else
+#include <unistd.h>
+#endif // _WIN32
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "allheaders.h"
+#include "blobbox.h"
+#include "blread.h"
+#include "colfind.h"
+#include "debugpixa.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "equationdetect.h"
+#endif
+#include "imagefind.h"
+#include "linefind.h"
+#include "makerow.h"
+#include <tesseract/osdetect.h>
+#include "tabvector.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#include "textord.h"
+#include "tordmain.h"
+#include "wordseg.h"
+
+namespace tesseract {
+
+// Max erosions to perform in removing an enclosing circle.
+const int kMaxCircleErosions = 8;
+
+// Helper to remove an enclosing circle from an image.
+// If there isn't one, then the image will most likely get badly mangled.
+// The returned pix must be pixDestroyed after use. nullptr may be returned
+// if the image doesn't meet the trivial conditions that it uses to determine
+// success.
+static Pix* RemoveEnclosingCircle(Pix* pixs) {
+  // Seed-fill from the border of the inverted image to build a mask of the
+  // region connected to the border (i.e. outside the circle), then invert it
+  // so pixc covers the circle and everything inside it.
+  Pix* pixsi = pixInvert(nullptr, pixs);
+  Pix* pixc = pixCreateTemplate(pixs);
+  pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
+  pixSeedfillBinary(pixc, pixc, pixsi, 4);
+  pixInvert(pixc, pixc);
+  pixDestroy(&pixsi);
+  Pix* pixt = pixAnd(nullptr, pixs, pixc);
+  l_int32 max_count;
+  pixCountConnComp(pixt, 8, &max_count);
+  // The count has to go up before we start looking for the minimum.
+  l_int32 min_count = INT32_MAX;
+  Pix* pixout = nullptr;
+  // Repeatedly erode the mask and AND it with the source. The connected
+  // component count is expected to rise while the circle is being broken up,
+  // then fall; the erosion with the lowest count is kept as the best result,
+  // and the search stops once the count stops improving.
+  for (int i = 1; i < kMaxCircleErosions; i++) {
+    pixDestroy(&pixt);
+    pixErodeBrick(pixc, pixc, 3, 3);
+    pixt = pixAnd(nullptr, pixs, pixc);
+    l_int32 count;
+    pixCountConnComp(pixt, 8, &count);
+    if (i == 1 || count > max_count) {
+      max_count = count;
+      min_count = count;
+    } else if (count < min_count) {
+      min_count = count;
+      pixDestroy(&pixout);
+      pixout = pixCopy(nullptr, pixt);  // Save the best.
+    } else if (count >= min_count) {
+      break;  // We have passed by the best.
+    }
+  }
+  pixDestroy(&pixt);
+  pixDestroy(&pixc);
+  return pixout;
+}
+
+/**
+ * Segment the page according to the current value of tessedit_pageseg_mode.
+ * pix_binary_ is used as the source image and should not be nullptr.
+ * On return the blocks list owns all the constructed page layout.
+ * Returns a negative value on failure, otherwise the result of AutoPageSeg
+ * (or 0 when automatic segmentation was not used / the page is empty).
+ */
+int Tesseract::SegmentPage(const char* input_file, BLOCK_LIST* blocks,
+                           Tesseract* osd_tess, OSResults* osr) {
+  ASSERT_HOST(pix_binary_ != nullptr);
+  int width = pixGetWidth(pix_binary_);
+  int height = pixGetHeight(pix_binary_);
+  // Get page segmentation mode.
+  auto pageseg_mode = static_cast<PageSegMode>(
+      static_cast<int>(tessedit_pageseg_mode));
+  // If a UNLV zone file can be found, use that instead of segmentation.
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
+      input_file != nullptr && input_file[0] != '\0') {
+    std::string name = input_file;
+    const char* lastdot = strrchr(name.c_str(), '.');
+    if (lastdot != nullptr)
+      // Strip the extension by overwriting the dot with NUL.
+      // NOTE(review): writing '\0' into a std::string does not shorten it;
+      // this relies on downstream code reading the name via c_str().
+      // Confirm read_unlv_file tolerates the embedded NUL.
+      name[lastdot - name.c_str()] = '\0';
+    read_unlv_file(name, width, height, blocks);
+  }
+  if (blocks->empty()) {
+    // No UNLV file present. Work according to the PageSegMode.
+    // First make a single block covering the whole image.
+    BLOCK_IT block_it(blocks);
+    auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
+    block->set_right_to_left(right_to_left());
+    block_it.add_to_end(block);
+  } else {
+    // UNLV file present. Use PSM_SINGLE_BLOCK.
+    pageseg_mode = PSM_SINGLE_BLOCK;
+  }
+  // The diacritic_blobs holds noise blobs that may be diacritics. They
+  // are separated out on areas of the image that seem noisy and short-circuit
+  // the layout process, going straight from the initial partition creation
+  // right through to after word segmentation, where they are added to the
+  // rej_cblobs list of the most appropriate word. From there classification
+  // will determine whether they are used.
+  BLOBNBOX_LIST diacritic_blobs;
+  int auto_page_seg_ret_val = 0;
+  TO_BLOCK_LIST to_blocks;
+  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
+      PSM_SPARSE(pageseg_mode)) {
+    auto_page_seg_ret_val = AutoPageSeg(
+        pageseg_mode, blocks, &to_blocks,
+        enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
+    if (pageseg_mode == PSM_OSD_ONLY)
+      return auto_page_seg_ret_val;
+    // To create blobs from the image region bounds uncomment this line:
+    // to_blocks.clear();  // Uncomment to go back to the old mode.
+  } else {
+    // No automatic segmentation: identity deskew/reskew.
+    deskew_ = FCOORD(1.0f, 0.0f);
+    reskew_ = FCOORD(1.0f, 0.0f);
+    if (pageseg_mode == PSM_CIRCLE_WORD) {
+      // Replace the working image with the circle-stripped version if the
+      // removal succeeded.
+      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
+      if (pixcleaned != nullptr) {
+        pixDestroy(&pix_binary_);
+        pix_binary_ = pixcleaned;
+      }
+    }
+  }
+
+  if (auto_page_seg_ret_val < 0) {
+    return -1;
+  }
+
+  if (blocks->empty()) {
+    if (textord_debug_tabfind)
+      tprintf("Empty page\n");
+    return 0;  // AutoPageSeg found an empty page.
+  }
+  bool splitting =
+      pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
+  bool cjk_mode = textord_use_cjk_fp_model;
+
+  // Hand the blocks over to textord for textline/word finding.
+  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
+                       pix_thresholds_, pix_grey_, splitting || cjk_mode,
+                       &diacritic_blobs, blocks, &to_blocks);
+  return auto_page_seg_ret_val;
+}
+
+/**
+ * Auto page segmentation. Divide the page image into blocks of uniform
+ * text linespacing and images.
+ *
+ * Resolution (in ppi) is derived from the input image.
+ *
+ * The output goes in the blocks list with corresponding TO_BLOCKs in the
+ * to_blocks list.
+ *
+ * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
+ * the image into columns, but multiple blocks are still made if the text is
+ * of non-uniform linespacing.
+ *
+ * If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+ * confuse layout analysis by causing textline overlap, are placed there,
+ * with the expectation that they will be reassigned to words later and
+ * noise/diacriticness determined via classification.
+ *
+ * If osd (orientation and script detection) is true then that is performed
+ * as well. If only_osd is true, then only orientation and script detection is
+ * performed. If osd is desired, (osd or only_osd) then osr_tess must be
+ * another Tesseract that was initialized especially for osd, and the results
+ * will be output into osr (orientation and script result).
+ *
+ * Returns a negative value on failure, otherwise the FindBlocks result.
+ */
+int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+                           TO_BLOCK_LIST* to_blocks,
+                           BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
+                           OSResults* osr) {
+  Pix* photomask_pix = nullptr;
+  Pix* musicmask_pix = nullptr;
+  // The blocks made by the ColumnFinder. Moved to blocks before return.
+  BLOCK_LIST found_blocks;
+  TO_BLOCK_LIST temp_blocks;
+
+  // finder is nullptr when PSM_OSD_ONLY completed, or the page had no
+  // usable text (see SetupPageSegAndDetectOrientation).
+  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
+      pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
+      pageseg_apply_music_mask ? &musicmask_pix : nullptr);
+  int result = 0;
+  if (finder != nullptr) {
+    TO_BLOCK_IT to_block_it(&temp_blocks);
+    TO_BLOCK* to_block = to_block_it.data();
+    if (musicmask_pix != nullptr) {
+      // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
+      // blocks separately. For now combine with photomask_pix.
+      pixOr(photomask_pix, photomask_pix, musicmask_pix);
+    }
+    #ifndef DISABLED_LEGACY_ENGINE
+    if (equ_detect_) {
+      finder->SetEquationDetect(equ_detect_);
+    }
+    #endif  // ndef DISABLED_LEGACY_ENGINE
+    result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
+                                to_block, photomask_pix, pix_thresholds_,
+                                pix_grey_, &pixa_debug_, &found_blocks,
+                                diacritic_blobs, to_blocks);
+    if (result >= 0)
+      finder->GetDeskewVectors(&deskew_, &reskew_);
+    delete finder;
+  }
+  pixDestroy(&photomask_pix);
+  pixDestroy(&musicmask_pix);
+  if (result < 0) return result;
+
+  blocks->clear();
+  BLOCK_IT block_it(blocks);
+  // Move the found blocks to the input/output blocks.
+  block_it.add_list_after(&found_blocks);
+  return result;
+}
+
+// Helper adds all the scripts from sid_set, converted to the corresponding
+// script ids of osd_set, to allowed_ids. The null script is skipped.
+static void AddAllScriptsConverted(const UNICHARSET& sid_set,
+                                   const UNICHARSET& osd_set,
+                                   std::vector<int>* allowed_ids) {
+  const int num_scripts = sid_set.get_script_table_size();
+  for (int sid = 0; sid < num_scripts; ++sid) {
+    if (sid == sid_set.null_sid()) continue;
+    const char* script_name = sid_set.get_script_from_script_id(sid);
+    allowed_ids->push_back(osd_set.get_script_id_from_name(script_name));
+  }
+}
+
+/**
+ * Sets up auto page segmentation, determines the orientation, and corrects it.
+ * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
+ * facilitate testing.
+ * photo_mask_pix is a pointer to a nullptr pointer that will be filled on return
+ * with the leptonica photo mask, which must be pixDestroyed by the caller.
+ * to_blocks is an empty list that will be filled with (usually a single)
+ * block that is used during layout analysis. This ugly API is required
+ * because of the possibility of a unlv zone file.
+ * TODO(rays) clean this up.
+ * See AutoPageSeg for other arguments.
+ * The returned ColumnFinder must be deleted after use.
+ * Returns nullptr when pageseg_mode is PSM_OSD_ONLY (OSD finished), or when
+ * the estimated line size is too small for a ColumnFinder to be built.
+ */
+ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
+    PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
+    OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
+    Pix** music_mask_pix) {
+  int vertical_x = 0;
+  int vertical_y = 1;
+  TabVector_LIST v_lines;
+  TabVector_LIST h_lines;
+  ICOORD bleft(0, 0);
+
+  ASSERT_HOST(pix_binary_ != nullptr);
+  if (tessedit_dump_pageseg_images) {
+    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
+  }
+  // Leptonica is used to find the rule/separator lines in the input.
+  LineFinder::FindAndRemoveLines(source_resolution_,
+                                 textord_tabfind_show_vlines, pix_binary_,
+                                 &vertical_x, &vertical_y, music_mask_pix,
+                                 &v_lines, &h_lines);
+  if (tessedit_dump_pageseg_images) {
+    pixa_debug_.AddPix(pix_binary_, "NoLines");
+  }
+  // Leptonica is used to find a mask of the photo regions in the input.
+  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
+  if (tessedit_dump_pageseg_images) {
+    // Debug-only: dump the page with photo regions subtracted.
+    Pix* pix_no_image_ = nullptr;
+    if (*photo_mask_pix != nullptr) {
+      pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
+    } else {
+      pix_no_image_ = pixClone(pix_binary_);
+    }
+    pixa_debug_.AddPix(pix_no_image_, "NoImages");
+    pixDestroy(&pix_no_image_);
+  }
+  // Vertical tab vectors are only useful when column finding is on.
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
+
+  // The rest of the algorithm uses the usual connected components.
+  textord_.find_components(pix_binary_, blocks, to_blocks);
+
+  TO_BLOCK_IT to_block_it(to_blocks);
+  // There must be exactly one input block.
+  // TODO(rays) handle new textline finding with a UNLV zone file.
+  ASSERT_HOST(to_blocks->singleton());
+  TO_BLOCK* to_block = to_block_it.data();
+  TBOX blkbox = to_block->block->pdblk.bounding_box();
+  ColumnFinder* finder = nullptr;
+  int estimated_resolution = source_resolution_;
+  if (source_resolution_ == kMinCredibleResolution) {
+    // Try to estimate resolution from typical body text size.
+    int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
+    if (res > estimated_resolution && res < kMaxCredibleResolution) {
+      estimated_resolution = res;
+      tprintf("Estimating resolution as %d\n", estimated_resolution);
+    }
+  }
+
+  // A line size below 2 means there is no usable text; leave finder nullptr.
+  if (to_block->line_size >= 2) {
+    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
+                              blkbox.botleft(), blkbox.topright(),
+                              estimated_resolution, textord_use_cjk_fp_model,
+                              textord_tabfind_aligned_gap_fraction, &v_lines,
+                              &h_lines, vertical_x, vertical_y);
+
+    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+    if (equ_detect_) {
+      equ_detect_->LabelSpecialText(to_block);
+    }
+
+    BLOBNBOX_CLIST osd_blobs;
+    // osd_orientation is the number of 90 degree rotations to make the
+    // characters upright. (See tesseract/osdetect.h for precise definition.)
+    // We want the text lines horizontal, (vertical text indicates vertical
+    // textlines) which may conflict (eg vertically written CJK).
+    int osd_orientation = 0;
+    bool vertical_text = textord_tabfind_force_vertical_text ||
+                         pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
+    if (!vertical_text && textord_tabfind_vertical_text &&
+        PSM_ORIENTATION_ENABLED(pageseg_mode)) {
+      vertical_text =
+          finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
+                                          to_block, &osd_blobs);
+    }
+    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
+      std::vector<int> osd_scripts;
+      if (osd_tess != this) {
+        // We are running osd as part of layout analysis, so constrain the
+        // scripts to those allowed by *this.
+        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
+        for (int s = 0; s < sub_langs_.size(); ++s) {
+          AddAllScriptsConverted(sub_langs_[s]->unicharset,
+                                 osd_tess->unicharset, &osd_scripts);
+        }
+      }
+      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
+      if (pageseg_mode == PSM_OSD_ONLY) {
+        // OSD was all that was asked for; no ColumnFinder is returned.
+        delete finder;
+        return nullptr;
+      }
+      osd_orientation = osr->best_result.orientation_id;
+      double osd_score = osr->orientations[osd_orientation];
+      // osd_margin is the gap between the best orientation score and the
+      // closest competitor; a small margin means a weak decision.
+      double osd_margin = min_orientation_margin * 2;
+      for (int i = 0; i < 4; ++i) {
+        if (i != osd_orientation &&
+            osd_score - osr->orientations[i] < osd_margin) {
+          osd_margin = osd_score - osr->orientations[i];
+        }
+      }
+      int best_script_id = osr->best_result.script_id;
+      const char* best_script_str =
+          osd_tess->unicharset.get_script_from_script_id(best_script_id);
+      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
+                 best_script_id == osd_tess->unicharset.hiragana_sid() ||
+                 best_script_id == osd_tess->unicharset.katakana_sid() ||
+                 strcmp("Japanese", best_script_str) == 0 ||
+                 strcmp("Korean", best_script_str) == 0 ||
+                 strcmp("Hangul", best_script_str) == 0;
+      if (cjk) {
+        finder->set_cjk_script(true);
+      }
+      if (osd_margin < min_orientation_margin) {
+        // The margin is weak.
+        if (!cjk && !vertical_text && osd_orientation == 2) {
+          // upside down latin text is improbable with such a weak margin.
+          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
+                  "Don't rotate.\n", osd_margin);
+          osd_orientation = 0;
+        } else {
+          tprintf(
+              "OSD: Weak margin (%.2f) for %d blob text block, "
+              "but using orientation anyway: %d\n",
+              osd_margin, osd_blobs.length(), osd_orientation);
+        }
+      }
+    }
+    osd_blobs.shallow_clear();
+    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
+
+#endif  // ndef DISABLED_LEGACY_ENGINE
+  }
+
+  return finder;
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/pagewalk.cpp b/tesseract/src/ccmain/pagewalk.cpp
new file mode 100644
index 00000000..a02fe5f4
--- /dev/null
+++ b/tesseract/src/ccmain/pagewalk.cpp
@@ -0,0 +1,43 @@
+/**********************************************************************
+ * File: pagewalk.cpp (Formerly walkers.c)
+ * Description: Block list processors
+ * Author: Phil Cheatle
+ * Created: Thu Oct 10 16:25:24 BST 1991
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "pageres.h"
+#include "tesseractclass.h"
+
+namespace tesseract {
+/**
+ * @name process_selected_words()
+ *
+ * Walk the current block list applying the specified word processor function
+ * to each word that overlaps the selection_box. Stops early if the processor
+ * returns false.
+ */
+void Tesseract::process_selected_words(
+    PAGE_RES* page_res,  // blocks to check
+    TBOX& selection_box,
+    bool (tesseract::Tesseract::* word_processor)(PAGE_RES_IT* pr_it)) {
+  PAGE_RES_IT pr_it(page_res);
+  while (pr_it.word() != nullptr) {
+    WERD* word = pr_it.word()->word;
+    // A false return from the processor aborts the walk.
+    if (word->bounding_box().overlap(selection_box) &&
+        !(this->*word_processor)(&pr_it)) {
+      return;
+    }
+    pr_it.forward();
+  }
+}
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/par_control.cpp b/tesseract/src/ccmain/par_control.cpp
new file mode 100644
index 00000000..c1c17298
--- /dev/null
+++ b/tesseract/src/ccmain/par_control.cpp
@@ -0,0 +1,73 @@
+///////////////////////////////////////////////////////////////////////
+// File: par_control.cpp
+// Description: Control code for parallel implementation.
+// Author: Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif // _OPENMP
+
+namespace tesseract {
+
+// Bundles one blob with the classifier instance that should recognize it
+// and the ratings-matrix cell that will receive the resulting choice list.
+struct BlobData {
+  BlobData() = default;
+  // index: blob position within word.chopped_word; tess: classifier to use;
+  // word: source word whose diagonal ratings cell (index, index) will be
+  // filled in later by PrerecAllWordsPar.
+  BlobData(int index, Tesseract* tess, const WERD_RES& word)
+      : blob(word.chopped_word->blobs[index]),
+        tesseract(tess),
+        choices(&(*word.ratings)(index, index)) {}
+
+  TBLOB* blob = nullptr;  // Points into word.chopped_word; not owned here.
+  Tesseract* tesseract = nullptr;  // Classifier to use; not owned here.
+  BLOB_CHOICE_LIST** choices = nullptr;  // Output slot in the ratings matrix.
+};
+
+// Pre-classifies every blob of every word in words, storing the resulting
+// choice lists on the diagonal of each word's ratings matrix. When
+// tessedit_parallelize > 1 and OpenMP is available, classification is
+// spread across threads; the two loop bodies below are intentionally
+// identical apart from the pragma.
+void Tesseract::PrerecAllWordsPar(const std::vector<WordData>& words) {
+  // Prepare all the blobs.
+  std::vector<BlobData> blobs;
+  for (size_t w = 0; w < words.size(); ++w) {
+    // Only consider words whose ratings matrix exists but is still empty.
+    if (words[w].word->ratings != nullptr &&
+        words[w].word->ratings->get(0, 0) == nullptr) {
+      for (int s = 0; s < words[w].lang_words.size(); ++s) {
+        // Entries beyond sub_langs_ belong to this top-level instance.
+        Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
+        const WERD_RES& word = *words[w].lang_words[s];
+        for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
+          blobs.push_back(BlobData(b, sub, word));
+        }
+      }
+    }
+  }
+  // Pre-classify all the blobs.
+  if (tessedit_parallelize > 1) {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(10)
+#endif // _OPENMP
+    for (size_t b = 0; b < blobs.size(); ++b) {
+      *blobs[b].choices =
+          blobs[b].tesseract->classify_blob(blobs[b].blob, "par",
+                                            ScrollView::WHITE, nullptr);
+    }
+  } else {
+    // TODO(AMD) parallelize this.
+    for (size_t b = 0; b < blobs.size(); ++b) {
+      *blobs[b].choices =
+          blobs[b].tesseract->classify_blob(blobs[b].blob, "par",
+                                            ScrollView::WHITE, nullptr);
+    }
+  }
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/paragraphs.cpp b/tesseract/src/ccmain/paragraphs.cpp
new file mode 100644
index 00000000..28576579
--- /dev/null
+++ b/tesseract/src/ccmain/paragraphs.cpp
@@ -0,0 +1,2590 @@
+/**********************************************************************
+ * File: paragraphs.cpp
+ * Description: Paragraph detection for tesseract.
+ * Author: David Eger
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "paragraphs.h"
+
+#include "genericvector.h" // for GenericVector, GenericVectorEqEq
+#include "helpers.h" // for UpdateRange, ClipToRange
+#include "host.h" // for NearlyEqual
+#include "mutableiterator.h" // for MutableIterator
+#include "ocrblock.h" // for BLOCK
+#include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA...
+#include "ocrrow.h" // for ROW
+#include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
+#include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
+#include "pdblock.h" // for PDBLK
+#include "polyblk.h" // for POLY_BLOCK
+#include "ratngs.h" // for WERD_CHOICE
+#include "rect.h" // for TBOX
+#include "statistc.h" // for STATS
+#include "strngs.h" // for STRING
+#include "tprintf.h" // for tprintf
+#include "unicharset.h" // for UNICHARSET
+#include "werd.h" // for WERD, W_REP_CHAR
+
+#include <tesseract/pageiterator.h> // for PageIterator
+#include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
+#include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID
+
+#include <cctype> // for isspace
+#include <cmath> // for abs
+#include <cstdio> // for snprintf
+#include <cstdlib> // for abs
+#include <cstring> // for strchr, strlen
+#include <algorithm> // for max
+#include <memory> // for unique_ptr
+
+// Unicode directional-formatting marks used to wrap RTL text for debug
+// output (see RtlEmbed below).
+// NOTE(review): U+202A is the LEFT-to-right embedding codepoint; the
+// right-to-left embedding is U+202B. Verify the literal matches the intent
+// implied by the name before relying on it for display.
+static const char * const kRLE = "\u202A"; // Right-to-Left Embedding
+static const char * const kPDF = "\u202C"; // Pop Directional Formatting
+
+namespace tesseract {
+
+// Special "weak" ParagraphModels.
+// Sentinel pointer values (poison addresses 0xDEAD111F / 0xDEAD888F) used
+// to tag rows that look like the start of a left- or right-aligned
+// paragraph before a concrete model exists; they are compared by identity
+// (e.g. in AppendDebugInfo) and must not be dereferenced.
+const ParagraphModel *kCrownLeft
+    = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
+const ParagraphModel *kCrownRight
+    = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
+
+// Do the text and geometry of two rows support a paragraph break between them?
+static bool LikelyParagraphStart(const RowScratchRegisters &before,
+ const RowScratchRegisters &after,
+ tesseract::ParagraphJustification j);
+
+// Given the width of a typical space between words, what is the threshold
+// by which we think left and right alignments for paragraphs
+// can vary and still be aligned.
+static int Epsilon(int space_pix) {
+  // Four fifths of an interword space, rounded down.
+  const int tolerance = (4 * space_pix) / 5;
+  return tolerance;
+}
+
+// Sanity-check that [row_start, row_end) is a valid slice of rows holding
+// at least min_num_rows rows. Bad bounds always print a complaint; a
+// too-small range complains only when debug_level > 1. Returns false on
+// either failure so callers can bail out early.
+static bool AcceptableRowArgs(
+    int debug_level, int min_num_rows, const char *function_name,
+    const GenericVector<RowScratchRegisters> *rows,
+    int row_start, int row_end) {
+  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
+    tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
+            row_start, row_end, rows->size());
+    return false;
+  }
+  if (row_end - row_start < min_num_rows) {
+    if (debug_level > 1) {
+      tprintf("# Too few rows[%d, %d) for %s.\n",
+              row_start, row_end, function_name);
+    }
+    return false;
+  }
+  return true;
+}
+
+// =============================== Debug Code ================================
+
+// Render an integer as its decimal STRING representation.
+static STRING StrOf(int num) {
+  // 30 chars comfortably holds any 64-bit decimal value plus sign.
+  char text[30];
+  snprintf(text, sizeof(text), "%d", num);
+  return STRING(text);
+}
+
+// Given a row-major matrix of UTF-8 text and a column separator, print a
+// formatted table where each column is padded to the codepoint-width of its
+// widest entry. Padding is applied bytewise by printf, so alignment is
+// exact only for ASCII content.
+static void PrintTable(const std::vector<std::vector<STRING> > &rows,
+                       const STRING &colsep) {
+  // First pass: find each column's maximum width in unicode codepoints.
+  std::vector<int> max_col_widths;
+  for (const auto& row : rows) {
+    int num_columns = row.size();
+    for (int c = 0; c < num_columns; c++) {
+      int num_unicodes = 0;
+      for (int i = 0; i < row[c].size(); i++) {
+        // Count only UTF-8 lead bytes; continuation bytes are 10xxxxxx.
+        if ((row[c][i] & 0xC0) != 0x80) num_unicodes++;
+      }
+      if (c >= max_col_widths.size()) {
+        max_col_widths.push_back(num_unicodes);
+      } else {
+        if (num_unicodes > max_col_widths[c])
+          max_col_widths[c] = num_unicodes;
+      }
+    }
+  }
+
+  // Build a left-justified "%-<width>s" printf pattern per column.
+  std::vector<STRING> col_width_patterns;
+  for (int c = 0; c < max_col_widths.size(); c++) {
+    col_width_patterns.push_back(
+        STRING("%-") + StrOf(max_col_widths[c]) + "s");
+  }
+
+  // Second pass: emit each cell through its column's pattern.
+  for (int r = 0; r < rows.size(); r++) {
+    for (int c = 0; c < rows[r].size(); c++) {
+      if (c > 0)
+        tprintf("%s", colsep.c_str());
+      tprintf(col_width_patterns[c].c_str(), rows[r][c].c_str());
+    }
+    tprintf("\n");
+  }
+}
+
+// Wrap word in directional embedding marks (kRLE ... kPDF) when rtlify is
+// set, so right-to-left text displays sensibly in debug dumps.
+static STRING RtlEmbed(const STRING &word, bool rtlify) {
+  if (!rtlify)
+    return word;
+  return STRING(kRLE) + word + STRING(kPDF);
+}
+
+// Print the current thoughts of the paragraph detector: one table row per
+// text row showing its interword space, leader presence, the leftmost and
+// rightmost words (with width and flags: upper case letter = flag set,
+// S/s = starts idea, E/e = ends idea, L/l = list item), the scratch
+// register debug fields, and the row text; then the list of active models.
+static void PrintDetectorState(const ParagraphTheory &theory,
+                               const GenericVector<RowScratchRegisters> &rows) {
+  std::vector<std::vector<STRING> > output;
+  // Header row for the table.
+  output.push_back(std::vector<STRING>());
+  output.back().push_back("#row");
+  output.back().push_back("space");
+  output.back().push_back("..");
+  output.back().push_back("lword[widthSEL]");
+  output.back().push_back("rword[widthSEL]");
+  RowScratchRegisters::AppendDebugHeaderFields(&output.back());
+  output.back().push_back("text");
+
+  for (int i = 0; i < rows.size(); i++) {
+    output.push_back(std::vector<STRING>());
+    std::vector<STRING> &row = output.back();
+    const RowInfo& ri = *rows[i].ri_;
+    row.push_back(StrOf(i));
+    row.push_back(StrOf(ri.average_interword_space));
+    row.push_back(ri.has_leaders ? ".." : " ");
+    row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
+                  "[" + StrOf(ri.lword_box.width()) +
+                  (ri.lword_likely_starts_idea ? "S" : "s") +
+                  (ri.lword_likely_ends_idea ? "E" : "e") +
+                  (ri.lword_indicates_list_item ? "L" : "l") +
+                  "]");
+    row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
+                  "[" + StrOf(ri.rword_box.width()) +
+                  (ri.rword_likely_starts_idea ? "S" : "s") +
+                  (ri.rword_likely_ends_idea ? "E" : "e") +
+                  (ri.rword_indicates_list_item ? "L" : "l") +
+                  "]");
+    rows[i].AppendDebugInfo(theory, &row);
+    row.push_back(RtlEmbed(ri.text, !ri.ltr));
+  }
+  PrintTable(output, " ");
+
+  tprintf("Active Paragraph Models:\n");
+  // Models are reported 1-based to match AppendDebugInfo's numbering.
+  unsigned m = 0;
+  for (const auto& model : theory.models()) {
+    tprintf(" %d: %s\n", ++m, model->ToString().c_str());
+  }
+}
+
+// Emit a phase banner followed by the full detector state, but only when
+// the caller asked for debug output.
+static void DebugDump(
+    bool should_print,
+    const STRING &phase,
+    const ParagraphTheory &theory,
+    const GenericVector<RowScratchRegisters> &rows) {
+  if (should_print) {
+    tprintf("# %s\n", phase.c_str());
+    PrintDetectorState(theory, rows);
+  }
+}
+
+// Print out the text for rows[row_start, row_end), framed by rule lines.
+static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
+                          int row_start, int row_end) {
+  static const char *kRule = "======================================";
+  tprintf("%s\n", kRule);
+  for (int i = row_start; i < row_end; i++)
+    tprintf("%s\n", rows[i].ri_->text.c_str());
+  tprintf("%s\n", kRule);
+}
+
+// ============= Brain Dead Language Model (ASCII Version) ===================
+
+// True for a basic ASCII letter; no locale or Unicode awareness.
+static bool IsLatinLetter(int ch) {
+  const bool lower = 'a' <= ch && ch <= 'z';
+  const bool upper = 'A' <= ch && ch <= 'Z';
+  return lower || upper;
+}
+
+// Letters OCR commonly mistakes for the digits 0 and 1.
+static bool IsDigitLike(int ch) {
+  switch (ch) {
+    case 'o': case 'O': case 'l': case 'I':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Quotes and opening brackets that may precede the start of a sentence.
+static bool IsOpeningPunct(int ch) {
+  static const char kOpeners[] = "'\"({[";
+  return strchr(kOpeners, ch) != nullptr;
+}
+
+// Punctuation that plausibly ends a sentence or clause.
+static bool IsTerminalPunct(int ch) {
+  static const char kTerminators[] = ":'\".?!]})";
+  return strchr(kTerminators, ch) != nullptr;
+}
+
+// Pointer-advancing helpers: each returns a pointer just past the matching
+// prefix of str (used below to parse list numerals such as roman numbers).
+// Advance past every leading character of str that appears in toskip.
+static const char *SkipChars(const char *str, const char *toskip) {
+  const char *p = str;
+  while (*p != '\0' && strchr(toskip, *p) != nullptr) {
+    ++p;
+  }
+  return p;
+}
+
+// Advance past every leading character of str accepted by the predicate.
+static const char *SkipChars(const char *str, bool (*skip)(int)) {
+  const char *p = str;
+  for (; *p != '\0' && skip(*p); ++p) {
+  }
+  return p;
+}
+
+// Consume at most one leading character of str drawn from toskip.
+static const char *SkipOne(const char *str, const char *toskip) {
+  const bool matched = (*str != '\0') && strchr(toskip, *str) != nullptr;
+  return matched ? str + 1 : str;
+}
+
+// Return whether it is very likely that this is a numeral marker that could
+// start a list item. Some examples include:
+// A I iii. VI (2) 3.5. [C-4]
+// A word qualifies if it is entirely made of up to three "segments", each a
+// numeral (roman, decimal digits, or one latin letter) optionally preceded
+// by up to two opening brackets and followed by closers/separators.
+static bool LikelyListNumeral(const STRING &word) {
+  const char *kRomans = "ivxlmdIVXLMD";
+  // All ten decimal digits. (This previously read "012345789", silently
+  // rejecting every numeral containing a 6.)
+  const char *kDigits = "0123456789";
+  const char *kOpen = "[{(";
+  const char *kSep = ":;-.,";
+  const char *kClose = "]})";
+
+  int num_segments = 0;
+  const char *pos = word.c_str();
+  while (*pos != '\0' && num_segments < 3) {
+    // skip up to two open parens.
+    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
+    const char *numeral_end = SkipChars(numeral_start, kRomans);
+    if (numeral_end != numeral_start) {
+      // Got Roman Numeral. Great.
+    } else {
+      numeral_end = SkipChars(numeral_start, kDigits);
+      if (numeral_end == numeral_start) {
+        // If there's a single latin letter, we can use that.
+        numeral_end = SkipChars(numeral_start, IsLatinLetter);
+        if (numeral_end - numeral_start != 1)
+          break;
+      }
+    }
+    // We got some sort of numeral.
+    num_segments++;
+    // Skip any trailing parens or punctuation.
+    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
+    // Another segment requires a separator after the previous numeral.
+    if (pos == numeral_end)
+      break;
+  }
+  // Success only if the whole word was consumed.
+  return *pos == '\0';
+}
+
+// Return whether the single-character word is a plausible list bullet.
+// NOTE(review): kListMarks lists '.' twice; the duplicate is harmless to
+// strchr but looks like one entry may have been meant to be another mark.
+static bool LikelyListMark(const STRING &word) {
+  const char *kListMarks = "0Oo*.,+.";
+  return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
+}
+
+// A word looks like a list item if it is either a bullet mark or a numeral.
+bool AsciiLikelyListItem(const STRING &word) {
+  if (LikelyListMark(word))
+    return true;
+  return LikelyListNumeral(word);
+}
+
+// ========== Brain Dead Language Model (Tesseract Version) ================
+
+// Return the first Unicode Codepoint from werd[pos], or 0 when any argument
+// is null or pos is out of bounds.
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
+  // Use >= (and reject negatives): werd->unichar_id(pos) is only valid for
+  // pos in [0, length()); the previous "pos > length()" check let
+  // pos == length() fall through to an out-of-range access.
+  if (!u || !werd || pos < 0 || pos >= werd->length())
+    return 0;
+  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
+}
+
+// A useful helper class for finding the first j >= i so that word[j]
+// does not have given character type.
+class UnicodeSpanSkipper {
+ public:
+  // Neither pointer is owned; both must outlive this object.
+  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
+      : u_(unicharset), word_(word) { wordlen_ = word->length(); }
+
+  // Given an input position, return the first position >= pos not punc.
+  int SkipPunc(int pos);
+  // Given an input position, return the first position >= pos not digit.
+  int SkipDigits(int pos);
+  // Given an input position, return the first position >= pos not roman.
+  int SkipRomans(int pos);
+  // Given an input position, return the first position >= pos not alpha.
+  int SkipAlpha(int pos);
+
+ private:
+  const UNICHARSET *u_;      // Character-class oracle (not owned).
+  const WERD_CHOICE *word_;  // Word being scanned (not owned).
+  int wordlen_;              // Cached word_->length().
+};
+
+int UnicodeSpanSkipper::SkipPunc(int pos) {
+  // Advance while the current unichar is punctuation.
+  while (pos < wordlen_) {
+    if (!u_->get_ispunctuation(word_->unichar_id(pos)))
+      break;
+    pos++;
+  }
+  return pos;
+}
+
+int UnicodeSpanSkipper::SkipDigits(int pos) {
+  // Advance while the unichar is a digit or a digit lookalike (o O l I).
+  while (pos < wordlen_) {
+    const bool digitish = u_->get_isdigit(word_->unichar_id(pos)) ||
+                          IsDigitLike(UnicodeFor(u_, word_, pos));
+    if (!digitish)
+      break;
+    pos++;
+  }
+  return pos;
+}
+
+int UnicodeSpanSkipper::SkipRomans(int pos) {
+  // Advance while the codepoint is an ASCII roman-numeral letter.
+  const char *kRomans = "ivxlmdIVXLMD";
+  for (; pos < wordlen_; pos++) {
+    const int ch = UnicodeFor(u_, word_, pos);
+    const bool is_roman = ch < 0xF0 && strchr(kRomans, ch) != nullptr;
+    if (!is_roman)
+      break;
+  }
+  return pos;
+}
+
+int UnicodeSpanSkipper::SkipAlpha(int pos) {
+  // Advance while the current unichar is alphabetic.
+  while (pos < wordlen_) {
+    if (!u_->get_isalpha(word_->unichar_id(pos)))
+      break;
+    pos++;
+  }
+  return pos;
+}
+
+// Return whether the codepoint is likely a standalone list bullet: ASCII
+// marks are delegated to LikelyListMark(); a fixed set of typographic
+// bullet codepoints is recognized directly.
+static bool LikelyListMarkUnicode(int ch) {
+  if (ch < 0x80) {
+    // ASCII: reuse the byte-oriented check.
+    STRING single_ch;
+    single_ch += ch;
+    return LikelyListMark(single_ch);
+  }
+  switch (ch) {
+    // TODO(eger) expand this list of unicodes as needed.
+    case 0x00B0:  // degree sign
+    case 0x2022:  // bullet
+    case 0x25E6:  // white bullet
+    case 0x00B7:  // middle dot
+    case 0x25A1:  // white square
+    case 0x25A0:  // black square
+    case 0x25AA:  // black small square
+    case 0x2B1D:  // black very small square
+    case 0x25BA:  // black right-pointing pointer
+    case 0x25CF:  // black circle
+    case 0x25CB:  // white circle
+      return true;
+    default:
+      break;  // fall through
+  }
+  return false;
+}
+
+// Return whether it is very likely that this is a numeral marker that could
+// start a list item. Some examples include:
+// A I iii. VI (2) 3.5. [C-4]
+// Unicode analogue of LikelyListNumeral: up to three segments, each at
+// most one leading punctuation char, then a roman/digit/single-letter
+// numeral, then trailing punctuation.
+static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
+  // A lone bullet character qualifies immediately.
+  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
+    return true;
+
+  UnicodeSpanSkipper m(u, werd);
+  int num_segments = 0;
+  int pos = 0;
+  while (pos < werd->length() && num_segments < 3) {
+    int numeral_start = m.SkipPunc(pos);
+    // Allow at most one leading punctuation character per segment.
+    if (numeral_start > pos + 1) break;
+    int numeral_end = m.SkipRomans(numeral_start);
+    if (numeral_end == numeral_start) {
+      numeral_end = m.SkipDigits(numeral_start);
+      if (numeral_end == numeral_start) {
+        // If there's a single latin letter, we can use that.
+        numeral_end = m.SkipAlpha(numeral_start);
+        if (numeral_end - numeral_start != 1)
+          break;
+      }
+    }
+    // We got some sort of numeral.
+    num_segments++;
+    // Skip any trailing punctuation.
+    pos = m.SkipPunc(numeral_end);
+    if (pos == numeral_end)
+      break;
+  }
+  // Success only if the whole word was consumed.
+  return pos == werd->length();
+}
+
+// ========= Brain Dead Language Model (combined entry points) ================
+
+// Given the leftmost word of a line either as a Tesseract unicharset + werd
+// or a utf8 string, set the following attributes for it:
+//   is_list - this word might be a list number or bullet.
+//   starts_idea - this word is likely to start a sentence.
+//   ends_idea - this word is likely to end a sentence.
+// The unicharset/werd pair takes precedence when both are provided; the
+// utf8 fallback inspects only the first byte, so it is ASCII-oriented.
+void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+                        const STRING &utf8,
+                        bool *is_list, bool *starts_idea, bool *ends_idea) {
+  *is_list = false;
+  *starts_idea = false;
+  *ends_idea = false;
+  // An empty word trivially "ends" whatever came before it.
+  if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
+    *ends_idea = true;
+    return;
+  }
+
+  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
+    if (UniLikelyListItem(unicharset, werd)) {
+      *is_list = true;
+      *starts_idea = true;
+      *ends_idea = true;
+    }
+    // A leading capital suggests a sentence start.
+    if (unicharset->get_isupper(werd->unichar_id(0))) {
+      *starts_idea = true;
+    }
+    // Leading punctuation can both open and close an idea.
+    if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
+      *starts_idea = true;
+      *ends_idea = true;
+    }
+  } else { // Assume utf8 is mostly ASCII
+    if (AsciiLikelyListItem(utf8)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    int start_letter = utf8[0];
+    if (IsOpeningPunct(start_letter)) {
+      *starts_idea = true;
+    }
+    if (IsTerminalPunct(start_letter)) {
+      *ends_idea = true;
+    }
+    if (start_letter >= 'A' && start_letter <= 'Z') {
+      *starts_idea = true;
+    }
+  }
+}
+
+// Given the rightmost word of a line either as a Tesseract unicharset + werd
+// or a utf8 string, set the following attributes for it:
+//   is_list - this word might be a list number or bullet.
+//   starts_idea - this word is likely to start a sentence.
+//   ends_idea - this word is likely to end a sentence.
+// Mirror of LeftWordAttributes, but only the final character decides
+// ends_idea; the utf8 fallback examines the last byte only.
+void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+                         const STRING &utf8,
+                         bool *is_list, bool *starts_idea, bool *ends_idea) {
+  *is_list = false;
+  *starts_idea = false;
+  *ends_idea = false;
+  // An empty word trivially "ends" whatever came before it.
+  if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
+    *ends_idea = true;
+    return;
+  }
+
+  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
+    if (UniLikelyListItem(unicharset, werd)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    // Trailing punctuation marks a likely sentence end.
+    UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
+    if (unicharset->get_ispunctuation(last_letter)) {
+      *ends_idea = true;
+    }
+  } else { // Assume utf8 is mostly ASCII
+    if (AsciiLikelyListItem(utf8)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    int last_letter = utf8[utf8.size() - 1];
+    if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
+      *ends_idea = true;
+    }
+  }
+}
+
+// =============== Implementation of RowScratchRegisters =====================
+/* static */
+void RowScratchRegisters::AppendDebugHeaderFields(
+    std::vector<STRING> *header) {
+  // Column titles matching the two fields emitted by AppendDebugInfo().
+  const char *kTitles[] = {"[lmarg,lind;rind,rmarg]", "model"};
+  for (const char *title : kTitles) {
+    header->push_back(title);
+  }
+}
+
+// Append two debug fields for this row: the margin/indent quadruple and a
+// compact "<linetype>:<models>" summary where strong models show as their
+// 1-based index in theory, crowns as CrL/CrR, and no models as "0".
+void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
+                                          std::vector<STRING> *dbg) const {
+  char s[30];
+  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
+           lmargin_, lindent_, rindent_, rmargin_);
+  dbg->push_back(s);
+  STRING model_string;
+  model_string += static_cast<char>(GetLineType());
+  model_string += ":";
+
+  int model_numbers = 0;
+  for (int h = 0; h < hypotheses_.size(); h++) {
+    // Model-less hypotheses are summarized by the line type alone.
+    if (hypotheses_[h].model == nullptr)
+      continue;
+    if (model_numbers > 0)
+      model_string += ",";
+    if (StrongModel(hypotheses_[h].model)) {
+      model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
+    } else if (hypotheses_[h].model == kCrownLeft) {
+      model_string += "CrL";
+    } else if (hypotheses_[h].model == kCrownRight) {
+      model_string += "CrR";
+    }
+    model_numbers++;
+  }
+  if (model_numbers == 0)
+    model_string += "0";
+
+  dbg->push_back(model_string);
+}
+
+// Reset the scratch registers for a fresh row: indents start at the row's
+// measured pixel distances and margins start at zero.
+void RowScratchRegisters::Init(const RowInfo &row) {
+  ri_ = &row;
+  lmargin_ = 0;
+  rmargin_ = 0;
+  lindent_ = row.pix_ldistance;
+  rindent_ = row.pix_rdistance;
+}
+
+// Summarize all hypotheses for this row: LT_UNKNOWN when none exist,
+// LT_MULTIPLE when both START and BODY are present, else the single kind.
+LineType RowScratchRegisters::GetLineType() const {
+  if (hypotheses_.empty())
+    return LT_UNKNOWN;
+  bool has_start = false;
+  bool has_body = false;
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    switch (hypotheses_[i].ty) {
+      case LT_START: has_start = true; break;
+      case LT_BODY: has_body = true; break;
+      default:
+        tprintf("Encountered bad value in hypothesis list: %c\n",
+                hypotheses_[i].ty);
+        break;
+    }
+  }
+  if (has_start && has_body)
+    return LT_MULTIPLE;
+  return has_start ? LT_START : LT_BODY;
+}
+
+// As GetLineType() above, but considering only hypotheses attached to the
+// given model.
+LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
+  if (hypotheses_.empty())
+    return LT_UNKNOWN;
+  bool has_start = false;
+  bool has_body = false;
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    // Ignore hypotheses belonging to other models.
+    if (hypotheses_[i].model != model)
+      continue;
+    switch (hypotheses_[i].ty) {
+      case LT_START: has_start = true; break;
+      case LT_BODY: has_body = true; break;
+      default:
+        tprintf("Encountered bad value in hypothesis list: %c\n",
+                hypotheses_[i].ty);
+        break;
+    }
+  }
+  if (has_start && has_body)
+    return LT_MULTIPLE;
+  return has_start ? LT_START : LT_BODY;
+}
+
+// Record a model-less START hypothesis, warning when it contradicts an
+// existing BODY classification.
+void RowScratchRegisters::SetStartLine() {
+  const LineType lt = GetLineType();
+  if (lt != LT_UNKNOWN && lt != LT_START) {
+    tprintf("Trying to set a line to be START when it's already BODY.\n");
+  }
+  if (lt == LT_UNKNOWN || lt == LT_BODY) {
+    hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr));
+  }
+}
+
+// Record a model-less BODY hypothesis, warning when it contradicts an
+// existing START classification.
+void RowScratchRegisters::SetBodyLine() {
+  const LineType lt = GetLineType();
+  if (lt != LT_UNKNOWN && lt != LT_BODY) {
+    tprintf("Trying to set a line to be BODY when it's already START.\n");
+  }
+  if (lt == LT_UNKNOWN || lt == LT_START) {
+    hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr));
+  }
+}
+
+// Attach a model-backed START hypothesis; it supersedes (and removes) any
+// model-less START hypothesis previously recorded.
+void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
+  hypotheses_.push_back_new(LineHypothesis(LT_START, model));
+  const int naked = hypotheses_.get_index(LineHypothesis(LT_START, nullptr));
+  if (naked >= 0)
+    hypotheses_.remove(naked);
+}
+
+// Attach a model-backed BODY hypothesis; it supersedes (and removes) any
+// model-less BODY hypothesis previously recorded.
+void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
+  hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
+  const int naked = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr));
+  if (naked >= 0)
+    hypotheses_.remove(naked);
+}
+
+// Collect the strong models under which this row could start a paragraph.
+void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    const LineHypothesis &hyp = hypotheses_[i];
+    if (hyp.ty == LT_START && StrongModel(hyp.model))
+      models->push_back_new(hyp.model);
+  }
+}
+
+// Collect every hypothesis backed by a strong (real) paragraph model.
+void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    const ParagraphModel *model = hypotheses_[i].model;
+    if (StrongModel(model))
+      models->push_back_new(model);
+  }
+}
+
+// Collect every hypothesis with any model at all (crowns included).
+void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    const ParagraphModel *model = hypotheses_[i].model;
+    if (model != nullptr)
+      models->push_back_new(model);
+  }
+}
+
+// Return the model (possibly nullptr) of the sole hypothesis, but only when
+// exactly one exists and it is a START; otherwise nullptr.
+const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
+  if (hypotheses_.size() == 1 && hypotheses_[0].ty == LT_START)
+    return hypotheses_[0].model;
+  return nullptr;
+}
+
+// Return the model (possibly nullptr) of the sole hypothesis, but only when
+// exactly one exists and it is a BODY; otherwise nullptr.
+const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
+  if (hypotheses_.size() == 1 && hypotheses_[0].ty == LT_BODY)
+    return hypotheses_[0].model;
+  return nullptr;
+}
+
+// Discard any hypotheses whose model is not in the given list.
+// An empty filter list means "keep everything".
+void RowScratchRegisters::DiscardNonMatchingHypotheses(
+    const SetOfModels &models) {
+  if (models.empty())
+    return;
+  // Walk backwards so removals do not disturb the unvisited indices.
+  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
+    if (models.contains(hypotheses_[h].model))
+      continue;
+    hypotheses_.remove(h);
+  }
+}
+
+// ============ Geometry based Paragraph Detection Algorithm =================
+
+// A cluster of indent samples: its representative center value and how
+// many samples landed in it.
+struct Cluster {
+  Cluster() : center(0), count(0) {}
+  Cluster(int cen, int num) : center(cen), count(num) {}
+
+  int center;  // The center of the cluster.
+  int count;  // The number of entries within the cluster.
+};
+
+// Accumulates integer samples and greedily groups them into clusters no
+// wider than max_cluster_width (see GetClusters below).
+class SimpleClusterer {
+ public:
+  explicit SimpleClusterer(int max_cluster_width)
+      : max_cluster_width_(max_cluster_width) {}
+  void Add(int value) { values_.push_back(value); }
+  int size() const { return values_.size(); }
+  void GetClusters(GenericVector<Cluster> *clusters);
+
+ private:
+  int max_cluster_width_;  // Maximum span (hi - lo) of a single cluster.
+  GenericVector<int> values_;  // Raw samples; sorted in place by GetClusters.
+};
+
+// Return the index of the cluster whose center is nearest to value
+// (earliest index wins ties).
+static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
+  int best = 0;
+  for (int i = 1; i < clusters.size(); i++) {
+    const int delta = abs(value - clusters[i].center);
+    if (delta < abs(value - clusters[best].center))
+      best = i;
+  }
+  return best;
+}
+
+// Greedy single-pass clustering: sort the samples, then sweep left to
+// right, starting a new cluster whenever the next value exceeds the
+// current cluster's lowest value by more than max_cluster_width_. Each
+// cluster is reported as (midpoint of lo/hi, number of samples).
+void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
+  clusters->clear();
+  values_.sort();
+  for (int i = 0; i < values_.size();) {
+    int orig_i = i;
+    int lo = values_[i];
+    int hi = lo;
+    while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
+      hi = values_[i];
+    }
+    clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
+  }
+}
+
+// Calculate left- and right-indent tab stop values seen in
+// rows[row_start, row_end) given a tolerance of tolerance.
+// Strategy: cluster all indents, drop rows whose indents are rare on both
+// sides ("stray" lines like page numbers), recluster, then optionally
+// re-add the strays or prune a least-frequent stop (details inline).
+static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
+                              int row_start, int row_end, int tolerance,
+                              GenericVector<Cluster> *left_tabs,
+                              GenericVector<Cluster> *right_tabs) {
+  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
+    return;
+  // First pass: toss all left and right indents into clusterers.
+  SimpleClusterer initial_lefts(tolerance);
+  SimpleClusterer initial_rights(tolerance);
+  GenericVector<Cluster> initial_left_tabs;
+  GenericVector<Cluster> initial_right_tabs;
+  for (int i = row_start; i < row_end; i++) {
+    initial_lefts.Add((*rows)[i].lindent_);
+    initial_rights.Add((*rows)[i].rindent_);
+  }
+  initial_lefts.GetClusters(&initial_left_tabs);
+  initial_rights.GetClusters(&initial_right_tabs);
+
+  // Second pass: cluster only lines that are not "stray"
+  // An example of a stray line is a page number -- a line whose start
+  // and end tab-stops are far outside the typical start and end tab-stops
+  // for the block.
+  // Put another way, we only cluster data from lines whose start or end
+  // tab stop is frequent.
+  SimpleClusterer lefts(tolerance);
+  SimpleClusterer rights(tolerance);
+
+  // Outlier elimination. We might want to switch this to test outlier-ness
+  // based on how strange a position an outlier is in instead of or in addition
+  // to how rare it is. These outliers get re-added if we end up having too
+  // few tab stops to work with, however.
+  int infrequent_enough_to_ignore = 0;
+  if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
+  if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
+
+  for (int i = row_start; i < row_end; i++) {
+    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
+    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
+    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
+        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
+      lefts.Add((*rows)[i].lindent_);
+      rights.Add((*rows)[i].rindent_);
+    }
+  }
+  lefts.GetClusters(left_tabs);
+  rights.GetClusters(right_tabs);
+
+  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
+      (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
+    // One side is really ragged, and the other only has one tab stop,
+    // so those "insignificant outliers" are probably important, actually.
+    // This often happens on a page of an index. Add back in the ones
+    // we omitted in the first pass.
+    for (int i = row_start; i < row_end; i++) {
+      int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
+      int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
+      if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
+            initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
+        lefts.Add((*rows)[i].lindent_);
+        rights.Add((*rows)[i].rindent_);
+      }
+    }
+  }
+  // Recompute the clusters (a no-op unless outliers were re-added above).
+  lefts.GetClusters(left_tabs);
+  rights.GetClusters(right_tabs);
+
+  // If one side is almost a two-indent aligned side, and the other clearly
+  // isn't, try to prune out the least frequent tab stop from that side.
+  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
+    int to_prune = -1;
+    for (int i = left_tabs->size() - 1; i >= 0; i--) {
+      if (to_prune < 0 ||
+          (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
+        to_prune = i;
+      }
+    }
+    if (to_prune >= 0 &&
+        (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
+      left_tabs->remove(to_prune);
+    }
+  }
+  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
+    int to_prune = -1;
+    for (int i = right_tabs->size() - 1; i >= 0; i--) {
+      if (to_prune < 0 ||
+          (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
+        to_prune = i;
+      }
+    }
+    if (to_prune >= 0 &&
+        (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
+      right_tabs->remove(to_prune);
+    }
+  }
+}
+
+// Given a paragraph model mark rows[row_start, row_end) as said model
+// start or body lines.
+//
+// Case 1: model->first_indent_ != model->body_indent_
+// Differentiating the paragraph start lines from the paragraph body lines in
+// this case is easy, we just see how far each line is indented.
+//
+// Case 2: model->first_indent_ == model->body_indent_
+// Here, we find end-of-paragraph lines by looking for "short lines."
+// What constitutes a "short line" changes depending on whether the text
+// ragged-right[left] or fully justified (aligned left and right).
+//
+// Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
+// We have a new paragraph if the first word of this line would
+// have fit at the end of the previous line.
+//
+// Case 2b: Fully Justified. (eop_threshold > 0)
+// We mark a line as short (end of paragraph) if the offside indent
+// is greater than eop_threshold.
+// See the case analysis in the comment block above for the meaning of
+// eop_threshold. Note: the ltr parameter is currently unused here.
+static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
+                              int row_start, int row_end,
+                              const ParagraphModel *model,
+                              bool ltr, int eop_threshold) {
+  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
+    return;
+  for (int row = row_start; row < row_end; row++) {
+    bool valid_first = ValidFirstLine(rows, row, model);
+    bool valid_body = ValidBodyLine(rows, row, model);
+    if (valid_first && !valid_body) {
+      // Case 1: the indent only fits a paragraph-start line.
+      (*rows)[row].AddStartLine(model);
+    } else if (valid_body && !valid_first) {
+      (*rows)[row].AddBodyLine(model);
+    } else if (valid_body && valid_first) {
+      // Case 2: ambiguous indent; decide from whether the previous line
+      // ended its paragraph.
+      bool after_eop = (row == row_start);
+      if (row > row_start) {
+        if (eop_threshold > 0) {
+          // Case 2b: fully justified; a short previous line (offside
+          // indent beyond eop_threshold) marks an end of paragraph.
+          if (model->justification() == JUSTIFICATION_LEFT) {
+            after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
+          } else {
+            after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
+          }
+        } else {
+          // Case 2a: ragged text; the previous line ended a paragraph if
+          // this line's first word would still have fit on it.
+          after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
+                                            model->justification());
+        }
+      }
+      if (after_eop) {
+        (*rows)[row].AddStartLine(model);
+      } else {
+        (*rows)[row].AddBodyLine(model);
+      }
+    } else {
+      // Do nothing. Stray row.
+    }
+  }
+}
+
// GeometricClassifierState holds all of the information we'll use while
// trying to determine a paragraph model for the text lines in a block of
// text:
//   + the rows under consideration [row_start, row_end)
//   + the common left- and right-indent tab stops
//   + does the block start out left-to-right or right-to-left
// Further, this struct holds the data we amass for the (single) ParagraphModel
// we'll assign to the text lines (assuming we get that far).
struct GeometricClassifierState {
  GeometricClassifierState(int dbg_level,
                           GenericVector<RowScratchRegisters> *r,
                           int r_start, int r_end)
      : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
    // The median inter-word space doubles as the alignment tolerance used
    // both for tab-stop clustering and for the eventual ParagraphModel.
    tolerance = InterwordSpace(*r, r_start, r_end);
    CalculateTabStops(r, r_start, r_end, tolerance,
                      &left_tabs, &right_tabs);
    if (debug_level >= 3) {
      tprintf("Geometry: TabStop cluster tolerance = %d; "
              "%d left tabs; %d right tabs\n",
              tolerance, left_tabs.size(), right_tabs.size());
    }
    // Reading direction is taken from the first row only.
    ltr = (*r)[r_start].ri_->ltr;
  }

  // Commit to left justification; the block margin is taken from the
  // first row's left margin.
  void AssumeLeftJustification() {
    just = tesseract::JUSTIFICATION_LEFT;
    margin = (*rows)[row_start].lmargin_;
  }

  // Commit to right justification; the block margin is taken from the
  // first row's right margin.
  void AssumeRightJustification() {
    just = tesseract::JUSTIFICATION_RIGHT;
    margin = (*rows)[row_start].rmargin_;
  }

  // Align tabs are the tab stops the text is aligned to.
  const GenericVector<Cluster> &AlignTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs;
    return left_tabs;
  }

  // Offside tabs are the tab stops opposite the tabs used to align the text.
  //
  // Note that for a left-to-right text which is aligned to the right such as
  // this function comment, the offside tabs are the horizontal tab stops
  // marking the beginning of ("Note", "this" and "marking").
  const GenericVector<Cluster> &OffsideTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs;
    return right_tabs;
  }

  // Return whether the i'th row extends from the leftmost left tab stop
  // to the right most right tab stop.
  bool IsFullRow(int i) const {
    return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
        ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
  }

  // Index (within AlignTabs()) of the tab stop closest to row_idx's
  // aligned-side indent.
  int AlignsideTabIndex(int row_idx) const {
    return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
  }

  // Given what we know about the paragraph justification (just), would the
  // first word of row_b have fit at the end of row_a?
  bool FirstWordWouldHaveFit(int row_a, int row_b) {
    return ::tesseract::FirstWordWouldHaveFit(
        (*rows)[row_a], (*rows)[row_b], just);
  }

  void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }

  // Debug helper: report why classification was abandoned (when debugging
  // at min_debug_level or above) and dump the rows under consideration.
  void Fail(int min_debug_level, const char *why) const {
    if (debug_level < min_debug_level) return;
    tprintf("# %s\n", why);
    PrintRows();
  }

  // Package the parameters accumulated so far into a ParagraphModel.
  ParagraphModel Model() const {
    return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
  }

  // We print out messages with a debug level at least as great as debug_level.
  int debug_level = 0;

  // The Geometric Classifier was asked to find a single paragraph model
  // to fit the text rows (*rows)[row_start, row_end)
  GenericVector<RowScratchRegisters> *rows;
  int row_start = 0;
  int row_end = 0;

  // The amount by which we expect the text edge can vary and still be aligned.
  int tolerance = 0;

  // Is the script in this text block left-to-right?
  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
  bool ltr = false;

  // These left and right tab stops were determined to be the common tab
  // stops for the given text.
  GenericVector<Cluster> left_tabs;
  GenericVector<Cluster> right_tabs;

  // These are parameters we must determine to create a ParagraphModel.
  tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
  int margin = 0;
  int first_indent = 0;
  int body_indent = 0;

  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
  int eop_threshold = 0;
};
+
// Given a section of text where strong textual clues did not help identifying
// paragraph breaks, and for which the left and right indents have exactly
// three tab stops between them, attempt to find the paragraph breaks based
// solely on the outline of the text and whether the script is left-to-right.
//
// Algorithm Detail:
//   The selected rows are in the form of a rectangle except
//   for some number of "short lines" of the same length:
//
//   (A1)  xxxxxxxxxxxxx   (B1)  xxxxxxxxxxxx
//         xxxxxxxxxxx           xxxxxxxxxx     # A "short" line.
//         xxxxxxxxxxxxx         xxxxxxxxxxxx
//         xxxxxxxxxxxxx         xxxxxxxxxxxx
//
//   We have a slightly different situation if the only short
//   line is at the end of the excerpt.
//
//   (A2)  xxxxxxxxxxxxx   (B2)  xxxxxxxxxxxx
//         xxxxxxxxxxxxx         xxxxxxxxxxxx
//         xxxxxxxxxxxxx         xxxxxxxxxxxx
//         xxxxxxxxxxx           xxxxxxxxxx    # A "short" line.
//
// We'll interpret these as follows based on the reasoning in the comment for
// GeometricClassify():
//   [script direction: first indent, body indent]
//   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0
//   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0
static void GeometricClassifyThreeTabStopTextBlock(
    int debug_level,
    GeometricClassifierState &s,
    ParagraphTheory *theory) {
  // Count the rows that span from the leftmost left tab stop to the
  // rightmost right tab stop ("full" rows), and note whether the last
  // row of the block is one of them.
  int num_rows = s.row_end - s.row_start;
  int num_full_rows = 0;
  int last_row_full = 0;
  for (int i = s.row_start; i < s.row_end; i++) {
    if (s.IsFullRow(i)) {
      num_full_rows++;
      if (i == s.row_end - 1) last_row_full++;
    }
  }

  // If fewer than 70% of the rows are full, short lines dominate and we
  // cannot use them as reliable end-of-paragraph markers.
  if (num_full_rows < 0.7 * num_rows) {
    s.Fail(1, "Not enough full lines to know which lines start paras.");
    return;
  }

  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
  s.eop_threshold = 0;

  // With only three tab stops, there is not enough outline variety to
  // deduce the alignment, so guess it from the script direction.
  if (s.ltr) {
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (debug_level > 0) {
    tprintf("# Not enough variety for clear outline classification. "
            "Guessing these are %s aligned based on script.\n",
            s.ltr ? "left" : "right");
    s.PrintRows();
  }

  if (s.AlignTabs().size() == 2) {  // case A1 or A2
    // Two tab stops on the aligned side: the outer one is the first-line
    // indent, the inner one is the body indent.
    s.first_indent = s.AlignTabs()[1].center;
    s.body_indent = s.AlignTabs()[0].center;
  } else {  // case B1 or B2
    if (num_rows - 1 == num_full_rows - last_row_full) {
      // case B2
      // The only short line is the last one, so this looks like the tail
      // of a flush paragraph: mark it with a weak "crown" pseudo-model
      // rather than inventing a strong flush model.
      const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
      (*s.rows)[s.row_start].AddStartLine(model);
      for (int i = s.row_start + 1; i < s.row_end; i++) {
        (*s.rows)[i].AddBodyLine(model);
      }
      return;
    } else {
      // case B1
      // Flush on the aligned side with short offside lines scattered
      // through the block: treat as fully justified, with the midpoint of
      // the two offside tab stops as the end-of-paragraph threshold.
      s.first_indent = s.body_indent = s.AlignTabs()[0].center;
      s.eop_threshold = (s.OffsideTabs()[0].center +
                         s.OffsideTabs()[1].center) / 2;
    }
  }
  const ParagraphModel *model = theory->AddModel(s.Model());
  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
                    s.ltr, s.eop_threshold);
  return;
}
+
// This function is called if strong textual clues were not available, but
// the caller hopes that the paragraph breaks will be super obvious just
// by the outline of the text.
//
// The particularly difficult case is figuring out what's going on if you
// don't have enough short paragraph end lines to tell us what's going on.
//
// For instance, let's say you have the following outline:
//
//   (A1)  xxxxxxxxxxxxxxxxxxxxxx
//         xxxxxxxxxxxxxxxxxxxx
//         xxxxxxxxxxxxxxxxxxxxxx
//         xxxxxxxxxxxxxxxxxxxxxx
//
// Even if we know that the text is left-to-right and so will probably be
// left-aligned, both of the following are possible texts:
//
//   (A1a)  1. Here our list item
//             with two full lines.
//          2. Here a second item.
//          3. Here our third one.
//
//   (A1b)  so ends paragraph one.
//          Here starts another
//          paragraph we want to
//          read. This continues
//
// These examples are obvious from the text and should have been caught
// by the StrongEvidenceClassify pass. However, for languages where we don't
// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
// it's worth guessing that (A1b) is the correct interpretation if there are
// far more "full" lines than "short" lines.
static void GeometricClassify(int debug_level,
                              GenericVector<RowScratchRegisters> *rows,
                              int row_start, int row_end,
                              ParagraphTheory *theory) {
  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
    return;
  if (debug_level > 1) {
    tprintf("###############################################\n");
    tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
            row_start, row_end);
    tprintf("###############################################\n");
  }
  // Percentile 10 lets a stray character poking into the margin be
  // treated as an outlier rather than widening the whole block.
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);

  GeometricClassifierState s(debug_level, rows, row_start, row_end);
  // Too many tab stops on both sides: the outline alone is ambiguous.
  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
    s.Fail(2, "Too much variety for simple outline classification.");
    return;
  }
  // One tab stop per side: a plain rectangle tells us nothing.
  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
    s.Fail(1, "Not enough variety for simple outline classification.");
    return;
  }
  // Exactly three tab stops total is a special case; see the A1/A2/B1/B2
  // discussion on GeometricClassifyThreeTabStopTextBlock().
  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
    GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
    return;
  }

  // At this point, we know that one side has at least two tab stops, and the
  // other side has one or two tab stops.
  // Left to determine:
  //   (1) Which is the body indent and which is the first line indent?
  //   (2) Is the text fully justified?

  // If one side happens to have three or more tab stops, assume that side
  // is opposite of the aligned side.
  if (s.right_tabs.size() > 2) {
    s.AssumeLeftJustification();
  } else if (s.left_tabs.size() > 2) {
    s.AssumeRightJustification();
  } else if (s.ltr) {  // guess based on script direction
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (s.AlignTabs().size() == 2) {
    // For each tab stop on the aligned side, how many of them appear
    // to be paragraph start lines? [first lines]
    int firsts[2] = {0, 0};
    // Count the first line as a likely paragraph start line.
    firsts[s.AlignsideTabIndex(s.row_start)]++;
    // For each line, if the first word would have fit on the previous
    // line count it as a likely paragraph start line.
    bool jam_packed = true;
    for (int i = s.row_start + 1; i < s.row_end; i++) {
      if (s.FirstWordWouldHaveFit(i - 1, i)) {
        firsts[s.AlignsideTabIndex(i)]++;
        jam_packed = false;
      }
    }
    // Make an extra accounting for the last line of the paragraph just
    // in case it's the only short line in the block. That is, take its
    // first word as typical and see if this looks like the *last* line
    // of a paragraph. If so, mark the *other* indent as probably a first.
    if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
      firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
    }

    // What fraction of the lines at each tab stop look like starts?
    int percent0firsts, percent1firsts;
    percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
    percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;

    // TODO(eger): Tune these constants if necessary.
    if ((percent0firsts < 20 && 30 < percent1firsts) ||
        percent0firsts + 30 < percent1firsts) {
      // Tab stop [1] starts paragraphs much more often: it's the first
      // line indent; [0] is the body indent.
      s.first_indent = s.AlignTabs()[1].center;
      s.body_indent = s.AlignTabs()[0].center;
    } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
               percent1firsts + 30 < percent0firsts) {
      // The mirror case: [0] is the first line indent.
      s.first_indent = s.AlignTabs()[0].center;
      s.body_indent = s.AlignTabs()[1].center;
    } else {
      // Ambiguous! Probably lineated (poetry)
      if (debug_level > 1) {
        tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
                s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[0].center, percent0firsts);
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[1].center, percent1firsts);
        s.PrintRows();
      }
      return;
    }
  } else {
    // There's only one tab stop for the "aligned to" side.
    s.first_indent = s.body_indent = s.AlignTabs()[0].center;
  }

  // At this point, we have our model.
  const ParagraphModel *model = theory->AddModel(s.Model());

  // Now all we have to do is figure out if the text is fully justified or not.
  // eop_threshold: default to fully justified unless we see evidence below.
  // See description on MarkRowsWithModel()
  s.eop_threshold =
      (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
  // If the text is not fully justified, re-set the eop_threshold to 0.
  if (s.AlignTabs().size() == 2) {
    // Paragraphs with a paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (ValidFirstLine(s.rows, i + 1, model) &&
          !NearlyEqual(s.OffsideTabs()[0].center,
                       (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  } else {
    // Paragraphs with no paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (!s.FirstWordWouldHaveFit(i, i + 1) &&
          !NearlyEqual(s.OffsideTabs()[0].center,
                       (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  }
  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
}
+
+// =============== Implementation of ParagraphTheory =====================
+
+const ParagraphModel* ParagraphTheory::AddModel(const ParagraphModel &model) {
+ for (const auto& m : *models_) {
+ if (m->Comparable(model)) {
+ return m;
+ }
+ }
+ auto *m = new ParagraphModel(model);
+ models_->push_back(m);
+ models_we_added_.push_back_new(m);
+ return m;
+}
+
+void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
+ size_t w = 0;
+ for (size_t r = 0; r < models_->size(); r++) {
+ ParagraphModel* m = (*models_)[r];
+ if (!used_models.contains(m) && models_we_added_.contains(m)) {
+ delete m;
+ } else {
+ if (r > w) {
+ (*models_)[w] = m;
+ }
+ w++;
+ }
+ }
+ models_->resize(w);
+}
+
+// Examine rows[start, end) and try to determine if an existing non-centered
+// paragraph model would fit them perfectly. If so, return a pointer to it.
+// If not, return nullptr.
+const ParagraphModel *ParagraphTheory::Fits(
+ const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
+ for (const auto* model : *models_) {
+ if (model->justification() != JUSTIFICATION_CENTER &&
+ RowsFitModel(rows, start, end, model))
+ return model;
+ }
+ return nullptr;
+}
+
+void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
+ for (const auto* model : *models_) {
+ if (model->justification() != JUSTIFICATION_CENTER)
+ models->push_back_new(model);
+ }
+}
+
+int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
+ int i = 0;
+ for (const auto* m : *models_) {
+ if (m == model)
+ return i;
+ i++;
+ }
+ return -1;
+}
+
+bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
+ int row, const ParagraphModel *model) {
+ if (!StrongModel(model)) {
+ tprintf("ValidFirstLine() should only be called with strong models!\n");
+ }
+ return StrongModel(model) &&
+ model->ValidFirstLine(
+ (*rows)[row].lmargin_, (*rows)[row].lindent_,
+ (*rows)[row].rindent_, (*rows)[row].rmargin_);
+}
+
+bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
+ int row, const ParagraphModel *model) {
+ if (!StrongModel(model)) {
+ tprintf("ValidBodyLine() should only be called with strong models!\n");
+ }
+ return StrongModel(model) &&
+ model->ValidBodyLine(
+ (*rows)[row].lmargin_, (*rows)[row].lindent_,
+ (*rows)[row].rindent_, (*rows)[row].rmargin_);
+}
+
+bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
+ int a, int b, const ParagraphModel *model) {
+ if (model != kCrownRight && model != kCrownLeft) {
+ tprintf("CrownCompatible() should only be called with crown models!\n");
+ return false;
+ }
+ auto &row_a = (*rows)[a];
+ auto &row_b = (*rows)[b];
+ if (model == kCrownRight) {
+ return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
+ row_b.rindent_ + row_b.rmargin_,
+ Epsilon(row_a.ri_->average_interword_space));
+ }
+ return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
+ row_b.lindent_ + row_b.lmargin_,
+ Epsilon(row_a.ri_->average_interword_space));
+}
+
+
+// =============== Implementation of ParagraphModelSmearer ====================
+
+ParagraphModelSmearer::ParagraphModelSmearer(
+ GenericVector<RowScratchRegisters> *rows,
+ int row_start, int row_end, ParagraphTheory *theory)
+ : theory_(theory), rows_(rows), row_start_(row_start),
+ row_end_(row_end) {
+ if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
+ row_start_ = 0;
+ row_end_ = 0;
+ return;
+ }
+ open_models_.resize(open_models_.size() + row_end - row_start + 2);
+}
+
+// see paragraphs_internal.h
+void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
+ SetOfModels no_models;
+ if (row_start < row_start_) row_start = row_start_;
+ if (row_end > row_end_) row_end = row_end_;
+
+ for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
+ row++) {
+ if ((*rows_)[row].ri_->num_words == 0) {
+ OpenModels(row + 1) = no_models;
+ } else {
+ SetOfModels &opened = OpenModels(row);
+ (*rows_)[row].StartHypotheses(&opened);
+
+ // Which models survive the transition from row to row + 1?
+ SetOfModels still_open;
+ for (int m = 0; m < opened.size(); m++) {
+ if (ValidFirstLine(rows_, row, opened[m]) ||
+ ValidBodyLine(rows_, row, opened[m])) {
+ // This is basic filtering; we check likely paragraph starty-ness down
+ // below in Smear() -- you know, whether the first word would have fit
+ // and such.
+ still_open.push_back_new(opened[m]);
+ }
+ }
+ OpenModels(row + 1) = still_open;
+ }
+ }
+}
+
// see paragraphs_internal.h
//
// Propagate ("smear") recently used paragraph models onto rows whose line
// type is still uncertain, re-deriving the set of open models whenever a
// row's hypotheses change.
void ParagraphModelSmearer::Smear() {
  CalculateOpenModels(row_start_, row_end_);

  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
  // we have multiple LT_START hypotheses), see if there's a model that
  // was recently used (an "open" model) which might model it well.
  for (int i = row_start_; i < row_end_; i++) {
    RowScratchRegisters &row = (*rows_)[i];
    if (row.ri_->num_words == 0)
      continue;

    // Step One:
    //   Figure out if there are "open" models which are left-aligned or
    //   right-aligned.  This is important for determining whether the
    //   "first" word in a row would fit at the "end" of the previous row.
    bool left_align_open = false;
    bool right_align_open = false;
    for (int m = 0; m < OpenModels(i).size(); m++) {
      switch (OpenModels(i)[m]->justification()) {
        case JUSTIFICATION_LEFT: left_align_open = true; break;
        case JUSTIFICATION_RIGHT: right_align_open = true; break;
        default: left_align_open = right_align_open = true;
      }
    }
    // Step Two:
    //   Use that knowledge to figure out if this row is likely to
    //   start a paragraph.
    bool likely_start;
    if (i == 0) {
      likely_start = true;
    } else {
      // When the alignment is unknown (both or neither side open), accept
      // evidence for a start under either justification.
      if ((left_align_open && right_align_open) ||
          (!left_align_open && !right_align_open)) {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
                                            JUSTIFICATION_LEFT) ||
                       LikelyParagraphStart((*rows_)[i - 1], row,
                                            JUSTIFICATION_RIGHT);
      } else if (left_align_open) {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
                                            JUSTIFICATION_LEFT);
      } else {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
                                            JUSTIFICATION_RIGHT);
      }
    }

    // Step Three:
    //   If this text line seems like an obvious first line of an
    //   open model, or an obvious continuation of an existing
    //   modelled paragraph, mark it up.
    if (likely_start) {
      // Add Start Hypotheses for all Open models that fit.
      for (int m = 0; m < OpenModels(i).size(); m++) {
        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
          row.AddStartLine(OpenModels(i)[m]);
        }
      }
    } else {
      // Add relevant body line hypotheses.
      // Candidate models come from the previous row's strong hypotheses,
      // or — for the very first row — from any non-centered model.
      SetOfModels last_line_models;
      if (i > 0) {
        (*rows_)[i - 1].StrongHypotheses(&last_line_models);
      } else {
        theory_->NonCenteredModels(&last_line_models);
      }
      for (int m = 0; m < last_line_models.size(); m++) {
        const ParagraphModel *model = last_line_models[m];
        if (ValidBodyLine(rows_, i, model))
          row.AddBodyLine(model);
      }
    }

    // Step Four:
    //   If we're still quite unsure about this line, go through all
    //   models in our theory and see if this row could be the start
    //   of any of our models.
    if (row.GetLineType() == LT_UNKNOWN ||
        (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
      SetOfModels all_models;
      theory_->NonCenteredModels(&all_models);
      for (int m = 0; m < all_models.size(); m++) {
        if (ValidFirstLine(rows_, i, all_models[m])) {
          row.AddStartLine(all_models[m]);
        }
      }
    }
    // Step Five:
    //   Since we may have updated the hypotheses about this row, we need
    //   to recalculate the Open models for the rest of rows[i + 1, row_end)
    if (row.GetLineType() != LT_UNKNOWN) {
      CalculateOpenModels(i + 1, row_end_);
    }
  }
}
+
+// ================ Main Paragraph Detection Algorithm =======================
+
+// Find out what ParagraphModels are actually used, and discard any
+// that are not.
+static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
+ ParagraphTheory *theory) {
+ SetOfModels used_models;
+ for (int i = 0; i < rows.size(); i++) {
+ rows[i].StrongHypotheses(&used_models);
+ }
+ theory->DiscardUnusedModels(used_models);
+}
+
// DowngradeWeakestToCrowns:
//   Forget any flush-{left, right} models unless we see two or more
//   of them in sequence.
//
// In pass 3, we start to classify even flush-left paragraphs (paragraphs
// where the first line and body indent are the same) as having proper Models.
// This is generally dangerous, since if you start imagining that flush-left
// is a typical paragraph model when it is not, it will lead you to chop normal
// indented paragraphs in the middle whenever a sentence happens to start on a
// new line (see "This" above). What to do?
// What we do is to take any paragraph which is flush left and is not
// preceded by another paragraph of the same model and convert it to a "Crown"
// paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
// for later. It means that the paragraph is flush, but it would be desirable
// to mark it as the same model as following text if it fits. This downgrade
// FlushLeft -> CrownLeft -> Model of following paragraph. Means that we
// avoid making flush left Paragraph Models whenever we see a top-of-the-page
// half-of-a-paragraph. and instead we mark it the same as normal body text.
//
// Implementation:
//
//   Comb backwards through the row scratch registers, and turn any
//   sequences of body lines of equivalent type abutted against the beginning
//   or a body or start line of a different type into a crown paragraph.
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
                                     GenericVector<RowScratchRegisters> *rows) {
  int start;
  for (int end = rows->size(); end > 0; end = start) {
    // Search back for a body line of a unique type.
    const ParagraphModel *model = nullptr;
    while (end > 0 &&
           (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
      end--;
    }
    if (end == 0) break;
    // Walk back over the run of body lines sharing that unique model.
    start = end - 1;
    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
      start--;  // walk back to the first line that is not the same body type.
    }
    // If the line just before the run is this model's start line and the
    // model is flush (first and body indent essentially equal), absorb it
    // into the run as well.
    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
        StrongModel(model) &&
        NearlyEqual(model->first_indent(), model->body_indent(),
                    model->tolerance())) {
      start--;
    }
    start++;
    // Now rows[start, end) is a sequence of unique body hypotheses of model.
    // Centered paragraphs are never downgraded.
    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
      continue;
    if (!StrongModel(model)) {
      // Already a crown model: extend the run upward over any rows whose
      // flush edge matches.
      while (start > 0 &&
             CrownCompatible(rows, start - 1, start, model))
        start--;
    }
    // Downgrade only runs that reach the top of the block or are not
    // preceded by a valid first line of the same model.
    if (start == 0 ||
        (!StrongModel(model)) ||
        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
      // crownify rows[start, end)
      const ParagraphModel *crown_model = model;
      if (StrongModel(model)) {
        if (model->justification() == JUSTIFICATION_LEFT)
          crown_model = kCrownLeft;
        else
          crown_model = kCrownRight;
      }
      (*rows)[start].SetUnknown();
      (*rows)[start].AddStartLine(crown_model);
      for (int row = start + 1; row < end; row++) {
        (*rows)[row].SetUnknown();
        (*rows)[row].AddBodyLine(crown_model);
      }
    }
  }
  // Any model that ended up with no strong hypotheses is now garbage.
  DiscardUnusedModels(*rows, theory);
}
+
+
+// Clear all hypotheses about lines [start, end) and reset margins.
+//
+// The empty space between the left of a row and the block boundary (and
+// similarly for the right) is split into two pieces: margin and indent.
+// In initial processing, we assume the block is tight and the margin for
+// all lines is set to zero. However, if our first pass does not yield
+// models for everything, it may be due to an inset paragraph like a
+// block-quote. In that case, we make a second pass over that unmarked
+// section of the page and reset the "margin" portion of the empty space
+// to the common amount of space at the ends of the lines under consid-
+// eration. This would be equivalent to percentile set to 0. However,
+// sometimes we have a single character sticking out in the right margin
+// of a text block (like the 'r' in 'for' on line 3 above), and we can
+// really just ignore it as an outlier. To express this, we allow the
+// user to specify the percentile (0..100) of indent values to use as
+// the common margin for each row in the run of rows[start, end).
+void RecomputeMarginsAndClearHypotheses(
+ GenericVector<RowScratchRegisters> *rows, int start, int end,
+ int percentile) {
+ if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
+ return;
+
+ int lmin, lmax, rmin, rmax;
+ lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
+ rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
+ for (int i = start; i < end; i++) {
+ RowScratchRegisters &sr = (*rows)[i];
+ sr.SetUnknown();
+ if (sr.ri_->num_words == 0)
+ continue;
+ UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
+ UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
+ }
+ STATS lefts(lmin, lmax + 1);
+ STATS rights(rmin, rmax + 1);
+ for (int i = start; i < end; i++) {
+ RowScratchRegisters &sr = (*rows)[i];
+ if (sr.ri_->num_words == 0)
+ continue;
+ lefts.add(sr.lmargin_ + sr.lindent_, 1);
+ rights.add(sr.rmargin_ + sr.rindent_, 1);
+ }
+ int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
+ int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
+ for (int i = start; i < end; i++) {
+ RowScratchRegisters &sr = (*rows)[i];
+ int ldelta = ignorable_left - sr.lmargin_;
+ sr.lmargin_ += ldelta;
+ sr.lindent_ -= ldelta;
+ int rdelta = ignorable_right - sr.rmargin_;
+ sr.rmargin_ += rdelta;
+ sr.rindent_ -= rdelta;
+ }
+}
+
+// Return the median inter-word space in rows[row_start, row_end).
+int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
+ int row_start, int row_end) {
+ if (row_end < row_start + 1) return 1;
+ int word_height = (rows[row_start].ri_->lword_box.height() +
+ rows[row_end - 1].ri_->lword_box.height()) / 2;
+ int word_width = (rows[row_start].ri_->lword_box.width() +
+ rows[row_end - 1].ri_->lword_box.width()) / 2;
+ STATS spacing_widths(0, 5 + word_width);
+ for (int i = row_start; i < row_end; i++) {
+ if (rows[i].ri_->num_words > 1) {
+ spacing_widths.add(rows[i].ri_->average_interword_space, 1);
+ }
+ }
+ int minimum_reasonable_space = word_height / 3;
+ if (minimum_reasonable_space < 2)
+ minimum_reasonable_space = 2;
+ int median = spacing_widths.median();
+ return (median > minimum_reasonable_space)
+ ? median : minimum_reasonable_space;
+}
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+ const RowScratchRegisters &after,
+ tesseract::ParagraphJustification justification) {
+ if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
+ return true;
+
+ if (justification == JUSTIFICATION_UNKNOWN) {
+ tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
+ }
+ int available_space;
+ if (justification == JUSTIFICATION_CENTER) {
+ available_space = before.lindent_ + before.rindent_;
+ } else {
+ available_space = before.OffsideIndent(justification);
+ }
+ available_space -= before.ri_->average_interword_space;
+
+ if (before.ri_->ltr)
+ return after.ri_->lword_box.width() < available_space;
+ return after.ri_->rword_box.width() < available_space;
+}
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing which way the text goes) in a left
+// or right alignment.
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+ const RowScratchRegisters &after) {
+ if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
+ return true;
+
+ int available_space = before.lindent_;
+ if (before.rindent_ > available_space)
+ available_space = before.rindent_;
+ available_space -= before.ri_->average_interword_space;
+
+ if (before.ri_->ltr)
+ return after.ri_->lword_box.width() < available_space;
+ return after.ri_->rword_box.width() < available_space;
+}
+
+static bool TextSupportsBreak(const RowScratchRegisters &before,
+ const RowScratchRegisters &after) {
+ if (before.ri_->ltr) {
+ return before.ri_->rword_likely_ends_idea &&
+ after.ri_->lword_likely_starts_idea;
+ } else {
+ return before.ri_->lword_likely_ends_idea &&
+ after.ri_->rword_likely_starts_idea;
+ }
+}
+
+static bool LikelyParagraphStart(const RowScratchRegisters &before,
+ const RowScratchRegisters &after,
+ tesseract::ParagraphJustification j) {
+ return before.ri_->num_words == 0 ||
+ (FirstWordWouldHaveFit(before, after, j) &&
+ TextSupportsBreak(before, after));
+}
+
+// Examine rows[start, end) and try to determine what sort of ParagraphModel
+// would fit them as a single paragraph.
+// If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
+// If the rows given could be a consistent start to a paragraph, set *consistent
+// true.
+static ParagraphModel InternalParagraphModelByOutline(
+ const GenericVector<RowScratchRegisters> *rows,
+ int start, int end, int tolerance, bool *consistent) {
+ int ltr_line_count = 0;
+ for (int i = start; i < end; i++) {
+ ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
+ }
+ bool ltr = (ltr_line_count >= (end - start) / 2);
+
+ *consistent = true;
+ if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
+ return ParagraphModel();
+
+ // Ensure the caller only passed us a region with a common rmargin and
+ // lmargin.
+ int lmargin = (*rows)[start].lmargin_;
+ int rmargin = (*rows)[start].rmargin_;
+ int lmin, lmax, rmin, rmax, cmin, cmax;
+ lmin = lmax = (*rows)[start + 1].lindent_;
+ rmin = rmax = (*rows)[start + 1].rindent_;
+ cmin = cmax = 0;
+ for (int i = start + 1; i < end; i++) {
+ if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
+ tprintf("Margins don't match! Software error.\n");
+ *consistent = false;
+ return ParagraphModel();
+ }
+ UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
+ UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
+ UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
+ }
+ int ldiff = lmax - lmin;
+ int rdiff = rmax - rmin;
+ int cdiff = cmax - cmin;
+ if (rdiff > tolerance && ldiff > tolerance) {
+ if (cdiff < tolerance * 2) {
+ if (end - start < 3)
+ return ParagraphModel();
+ return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
+ }
+ *consistent = false;
+ return ParagraphModel();
+ }
+ if (end - start < 3) // Don't return a model for two line paras.
+ return ParagraphModel();
+
+ // These booleans keep us from saying something is aligned left when the body
+ // left variance is too large.
+ bool body_admits_left_alignment = ldiff < tolerance;
+ bool body_admits_right_alignment = rdiff < tolerance;
+
+ ParagraphModel left_model =
+ ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
+ (lmin + lmax) / 2, tolerance);
+ ParagraphModel right_model =
+ ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
+ (rmin + rmax) / 2, tolerance);
+
+ // These booleans keep us from having an indent on the "wrong side" for the
+ // first line.
+ bool text_admits_left_alignment = ltr || left_model.is_flush();
+ bool text_admits_right_alignment = !ltr || right_model.is_flush();
+
+ // At least one of the edges is less than tolerance in variance.
+ // If the other is obviously ragged, it can't be the one aligned to.
+ // [Note the last line is included in this raggedness.]
+ if (tolerance < rdiff) {
+ if (body_admits_left_alignment && text_admits_left_alignment)
+ return left_model;
+ *consistent = false;
+ return ParagraphModel();
+ }
+ if (tolerance < ldiff) {
+ if (body_admits_right_alignment && text_admits_right_alignment)
+ return right_model;
+ *consistent = false;
+ return ParagraphModel();
+ }
+
+ // At this point, we know the body text doesn't vary much on either side.
+
+ // If the first line juts out oddly in one direction or the other,
+ // that likely indicates the side aligned to.
+ int first_left = (*rows)[start].lindent_;
+ int first_right = (*rows)[start].rindent_;
+
+ if (ltr && body_admits_left_alignment &&
+ (first_left < lmin || first_left > lmax))
+ return left_model;
+ if (!ltr && body_admits_right_alignment &&
+ (first_right < rmin || first_right > rmax))
+ return right_model;
+
+ *consistent = false;
+ return ParagraphModel();
+}
+
+// Examine rows[start, end) and try to determine what sort of ParagraphModel
+// would fit them as a single paragraph. If nothing fits,
+// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
+// output if we're debugging.
+static ParagraphModel ParagraphModelByOutline(
+ int debug_level,
+ const GenericVector<RowScratchRegisters> *rows,
+ int start, int end, int tolerance) {
+ bool unused_consistent;
+ ParagraphModel retval = InternalParagraphModelByOutline(
+ rows, start, end, tolerance, &unused_consistent);
+ if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
+ tprintf("Could not determine a model for this paragraph:\n");
+ PrintRowRange(*rows, start, end);
+ }
+ return retval;
+}
+
+// Do rows[start, end) form a single instance of the given paragraph model?
+bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
+ int start, int end, const ParagraphModel *model) {
+ if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
+ return false;
+ if (!ValidFirstLine(rows, start, model)) return false;
+ for (int i = start + 1 ; i < end; i++) {
+ if (!ValidBodyLine(rows, i, model)) return false;
+ }
+ return true;
+}
+
// Examine rows[row_start, row_end) as an independent section of text,
// and mark rows that are exceptionally clear as start-of-paragraph
// and paragraph-body lines.
//
// We presume that any lines surrounding rows[row_start, row_end) may
// have wildly different paragraph models, so we don't key any data off
// of those lines.
//
// We only take the very strongest signals, as we don't want to get
// confused and mark up centered text, poetry, or source code as
// clearly part of a typical paragraph.
static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
                               int row_start, int row_end) {
  // Record patently obvious body text: the line's first word neither looks
  // like the start of an idea nor could have fit at the end of the previous
  // line, so it must be a continuation of the previous line's sentence.
  for (int i = row_start + 1; i < row_end; i++) {
    const RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    // Judge "would the word have fit" against the script's natural margin.
    tesseract::ParagraphJustification typical_justification =
        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (!curr.ri_->rword_likely_starts_idea &&
        !curr.ri_->lword_likely_starts_idea &&
        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
      curr.SetBodyLine();
    }
  }

  // Record patently obvious start paragraph lines.
  //
  // It's an extremely good signal of the start of a paragraph that
  // the first word would have fit on the end of the previous line.
  // However, applying just that signal would have us mark random
  // start lines of lineated text (poetry and source code) and some
  // centered headings as paragraph start lines. Therefore, we use
  // a second qualification for a paragraph start: Not only should
  // the first word of this line have fit on the previous line,
  // but also, this line should go full to the right of the block,
  // disallowing a subsequent word from having fit on this line.

  // First row: no previous line to compare against, so rely purely on the
  // line being full-width and its edge word looking like an idea start.
  {
    RowScratchRegisters &curr = (*rows)[row_start];
    RowScratchRegisters &next = (*rows)[row_start + 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, next, j) &&
        (curr.ri_->lword_likely_starts_idea ||
         curr.ri_->rword_likely_starts_idea)) {
      curr.SetStartLine();
    }
  }
  // Middle rows: require both the full-width test (against the next line)
  // and the textual paragraph-start test (against the previous line).
  for (int i = row_start + 1; i < row_end - 1; i++) {
    RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    RowScratchRegisters &next = (*rows)[i + 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, next, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
  // Last row: there is no following line, so the full-width test compares
  // the line against itself.
  { // the short circuit at the top means we have at least two lines.
    RowScratchRegisters &prev = (*rows)[row_end - 2];
    RowScratchRegisters &curr = (*rows)[row_end - 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, curr, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
}
+
// Look for sequences of a start line followed by some body lines in
// rows[row_start, row_end) and create ParagraphModels for them if
// they seem coherent.
static void ModelStrongEvidence(int debug_level,
                                GenericVector<RowScratchRegisters> *rows,
                                int row_start, int row_end,
                                bool allow_flush_models,
                                ParagraphTheory *theory) {
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
    return;

  int start = row_start;
  while (start < row_end) {
    // Skip forward to the next line marked as a paragraph start.
    while (start < row_end && (*rows)[start].GetLineType() != LT_START)
      start++;
    if (start >= row_end - 1)
      break;

    // Tolerance for indent comparisons, derived from the interword space of
    // the first body line of the candidate paragraph.
    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
    int end = start;
    ParagraphModel last_model;
    bool next_consistent;
    // Greedily grow [start, end) one row at a time while each extension
    // still fits a single coherent paragraph outline.
    do {
      ++end;
      // rows[row, end) was consistent.
      // If rows[row, end + 1) is not consistent,
      // just model rows[row, end)
      if (end < row_end - 1) {
        RowScratchRegisters &next = (*rows)[end];
        LineType lt = next.GetLineType();
        // Accept a known body line, or an unknown line whose first word
        // could not have fit on the previous line (i.e. a continuation).
        next_consistent = lt == LT_BODY ||
            (lt == LT_UNKNOWN &&
             !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
      } else {
        next_consistent = false;
      }
      if (next_consistent) {
        ParagraphModel next_model = InternalParagraphModelByOutline(
            rows, start, end + 1, tolerance, &next_consistent);
        // Refuse to let the justification drift away from the script's
        // natural side once it has been established for this run.
        if (((*rows)[start].ri_->ltr &&
             last_model.justification() == JUSTIFICATION_LEFT &&
             next_model.justification() != JUSTIFICATION_LEFT) ||
            (!(*rows)[start].ri_->ltr &&
             last_model.justification() == JUSTIFICATION_RIGHT &&
             next_model.justification() != JUSTIFICATION_RIGHT)) {
          next_consistent = false;
        }
        last_model = next_model;
      } else {
        next_consistent = false;
      }
    } while (next_consistent && end < row_end);
    // At this point, rows[start, end) looked like it could have been a
    // single paragraph. If we can make a good ParagraphModel for it,
    // do so and mark this sequence with that model.
    if (end > start + 1) {
      // emit a new paragraph if we have more than one line.
      const ParagraphModel *model = nullptr;
      ParagraphModel new_model = ParagraphModelByOutline(
          debug_level, rows, start, end,
          Epsilon(InterwordSpace(*rows, start, end)));
      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
        // couldn't create a good model, oh well.
      } else if (new_model.is_flush()) {
        // A flush model (no first-line indent) is ambiguous evidence.
        if (end == start + 2) {
          // It's very likely we just got two paragraph starts in a row.
          end = start + 1;
        } else if (start == row_start) {
          // Mark this as a Crown.
          if (new_model.justification() == JUSTIFICATION_LEFT) {
            model = kCrownLeft;
          } else {
            model = kCrownRight;
          }
        } else if (allow_flush_models) {
          model = theory->AddModel(new_model);
        }
      } else {
        model = theory->AddModel(new_model);
      }
      if (model) {
        // Commit: mark the run as start + body lines under this model.
        (*rows)[start].AddStartLine(model);
        for (int i = start + 1; i < end; i++) {
          (*rows)[i].AddBodyLine(model);
        }
      }
    }
    start = end;
  }
}
+
+// We examine rows[row_start, row_end) and do the following:
+// (1) Clear all existing hypotheses for the rows being considered.
+// (2) Mark up any rows as exceptionally likely to be paragraph starts
+// or paragraph body lines as such using both geometric and textual
+// clues.
+// (3) Form models for any sequence of start + continuation lines.
+// (4) Smear the paragraph models to cover surrounding text.
+static void StrongEvidenceClassify(int debug_level,
+ GenericVector<RowScratchRegisters> *rows,
+ int row_start, int row_end,
+ ParagraphTheory *theory) {
+ if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
+ return;
+
+ if (debug_level > 1) {
+ tprintf("#############################################\n");
+ tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
+ tprintf("#############################################\n");
+ }
+
+ RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
+ MarkStrongEvidence(rows, row_start, row_end);
+
+ DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
+
+ // Create paragraph models.
+ ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
+
+ DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
+
+ // At this point, some rows are marked up as paragraphs with model numbers,
+ // and some rows are marked up as either LT_START or LT_BODY. Now let's
+ // smear any good paragraph hypotheses forward and backward.
+ ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
+ smearer.Smear();
+}
+
+static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
+ int row_start, int row_end,
+ ParagraphTheory *theory) {
+ for (int i = row_start + 1; i < row_end - 1; i++) {
+ if ((*rows)[i - 1].ri_->has_leaders &&
+ (*rows)[i].ri_->has_leaders &&
+ (*rows)[i + 1].ri_->has_leaders) {
+ const ParagraphModel *model = theory->AddModel(
+ ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
+ (*rows)[i].AddStartLine(model);
+ }
+ }
+}
+
+// Collect sequences of unique hypotheses in row registers and create proper
+// paragraphs for them, referencing the paragraphs in row_owners.
+static void ConvertHypothesizedModelRunsToParagraphs(
+ int debug_level,
+ GenericVector<RowScratchRegisters> &rows,
+ GenericVector<PARA *> *row_owners,
+ ParagraphTheory *theory) {
+ int end = rows.size();
+ int start;
+ for (; end > 0; end = start) {
+ start = end - 1;
+ const ParagraphModel *model = nullptr;
+ // TODO(eger): Be smarter about dealing with multiple hypotheses.
+ bool single_line_paragraph = false;
+ SetOfModels models;
+ rows[start].NonNullHypotheses(&models);
+ if (!models.empty()) {
+ model = models[0];
+ if (rows[start].GetLineType(model) != LT_BODY)
+ single_line_paragraph = true;
+ }
+ if (model && !single_line_paragraph) {
+ // walk back looking for more body lines and then a start line.
+ while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
+ // do nothing
+ }
+ if (start < 0 || rows[start].GetLineType(model) != LT_START) {
+ model = nullptr;
+ }
+ }
+ if (model == nullptr) {
+ continue;
+ }
+ // rows[start, end) should be a paragraph.
+ PARA *p = new PARA();
+ if (model == kCrownLeft || model == kCrownRight) {
+ p->is_very_first_or_continuation = true;
+ // Crown paragraph.
+ // If we can find an existing ParagraphModel that fits, use it,
+ // else create a new one.
+ for (int row = end; row < rows.size(); row++) {
+ if ((*row_owners)[row] &&
+ (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
+ (start == 0 ||
+ ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
+ model = (*row_owners)[row]->model;
+ break;
+ }
+ }
+ if (model == kCrownLeft) {
+ // No subsequent model fits, so cons one up.
+ model = theory->AddModel(ParagraphModel(
+ JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
+ 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
+ } else if (model == kCrownRight) {
+ // No subsequent model fits, so cons one up.
+ model = theory->AddModel(ParagraphModel(
+ JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_,
+ 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
+ }
+ }
+ rows[start].SetUnknown();
+ rows[start].AddStartLine(model);
+ for (int i = start + 1; i < end; i++) {
+ rows[i].SetUnknown();
+ rows[i].AddBodyLine(model);
+ }
+ p->model = model;
+ p->has_drop_cap = rows[start].ri_->has_drop_cap;
+ p->is_list_item =
+ model->justification() == JUSTIFICATION_RIGHT
+ ? rows[start].ri_->rword_indicates_list_item
+ : rows[start].ri_->lword_indicates_list_item;
+ for (int row = start; row < end; row++) {
+ if ((*row_owners)[row] != nullptr) {
+ tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
+ "more than once!\n");
+ delete (*row_owners)[row];
+ }
+ (*row_owners)[row] = p;
+ }
+ }
+}
+
// A half-open interval of row indices [begin, end).
struct Interval {
  Interval() = default;
  Interval(int b, int e) : begin(b), end(e) {}

  int begin = 0;  // first row in the interval
  int end = 0;    // one past the last row
};
+
+// Return whether rows[row] appears to be stranded, meaning that the evidence
+// for this row is very weak due to context. For instance, two lines of source
+// code may happen to be indented at the same tab vector as body text starts,
+// leading us to think they are two start-of-paragraph lines. This is not
+// optimal. However, we also don't want to mark a sequence of short dialog
+// as "weak," so our heuristic is:
+// (1) If a line is surrounded by lines of unknown type, it's weak.
+// (2) If two lines in a row are start lines for a given paragraph type, but
+// after that the same paragraph type does not continue, they're weak.
+static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows,
+ int row) {
+ SetOfModels row_models;
+ rows[row].StrongHypotheses(&row_models);
+
+ for (int m = 0; m < row_models.size(); m++) {
+ bool all_starts = rows[row].GetLineType();
+ int run_length = 1;
+ bool continues = true;
+ for (int i = row - 1; i >= 0 && continues; i--) {
+ SetOfModels models;
+ rows[i].NonNullHypotheses(&models);
+ switch (rows[i].GetLineType(row_models[m])) {
+ case LT_START: run_length++; break;
+ case LT_MULTIPLE: // explicit fall-through
+ case LT_BODY: run_length++; all_starts = false; break;
+ case LT_UNKNOWN: // explicit fall-through
+ default: continues = false;
+ }
+ }
+ continues = true;
+ for (int i = row + 1; i < rows.size() && continues; i++) {
+ SetOfModels models;
+ rows[i].NonNullHypotheses(&models);
+ switch (rows[i].GetLineType(row_models[m])) {
+ case LT_START: run_length++; break;
+ case LT_MULTIPLE: // explicit fall-through
+ case LT_BODY: run_length++; all_starts = false; break;
+ case LT_UNKNOWN: // explicit fall-through
+ default: continues = false;
+ }
+ }
+ if (run_length > 2 || (!all_starts && run_length > 1)) return false;
+ }
+ return true;
+}
+
// Go through rows[row_start, row_end) and gather up sequences that need better
// classification.
// + Sequences of non-empty rows without hypotheses.
// + Crown paragraphs not immediately followed by a strongly modeled line.
// + Single line paragraphs surrounded by text that doesn't match the
//   model.
static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
                             GenericVector<Interval> *to_fix,
                             int row_start, int row_end) {
  to_fix->clear();
  for (int i = row_start; i < row_end; i++) {
    bool needs_fixing = false;

    SetOfModels models;
    SetOfModels models_w_crowns;
    rows[i].StrongHypotheses(&models);
    rows[i].NonNullHypotheses(&models_w_crowns);
    if (models.empty() && !models_w_crowns.empty()) {
      // Crown paragraph. Is it followed by a modeled line?
      // Scan forward until we hit either a row with no hypotheses at all
      // (crown is dangling -> fix it) or a strongly modeled row (crown is
      // anchored -> leave it). If neither appears, needs_fixing stays false.
      for (int end = i + 1; end < rows.size(); end++) {
        SetOfModels end_models;
        SetOfModels strong_end_models;
        rows[end].NonNullHypotheses(&end_models);
        rows[end].StrongHypotheses(&strong_end_models);
        if (end_models.empty()) {
          needs_fixing = true;
          break;
        } else if (!strong_end_models.empty()) {
          needs_fixing = false;
          break;
        }
      }
    } else if (models.empty() && rows[i].ri_->num_words > 0) {
      // No models at all.
      needs_fixing = true;
    }

    // Even a strongly modeled row needs a second look if its evidence is
    // stranded (see RowIsStranded above).
    if (!needs_fixing && !models.empty()) {
      needs_fixing = RowIsStranded(rows, i);
    }

    if (needs_fixing) {
      // Extend the previous interval when this row is adjacent to it;
      // intervals are stored inclusive here and widened to half-open below.
      if (!to_fix->empty() && to_fix->back().end == i - 1)
        to_fix->back().end = i;
      else
        to_fix->push_back(Interval(i, i));
    }
  }
  // Convert inclusive intervals to half-open intervals.
  for (int i = 0; i < to_fix->size(); i++) {
    (*to_fix)[i].end = (*to_fix)[i].end + 1;
  }
}
+
+// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(
+ GenericVector<PARA *> *row_owners,
+ PARA_LIST *paragraphs) {
+ GenericVector<PARA *> &rows = *row_owners;
+ paragraphs->clear();
+ PARA_IT out(paragraphs);
+ PARA *formerly_null = nullptr;
+ for (int i = 0; i < rows.size(); i++) {
+ if (rows[i] == nullptr) {
+ if (i == 0 || rows[i - 1] != formerly_null) {
+ rows[i] = formerly_null = new PARA();
+ } else {
+ rows[i] = formerly_null;
+ continue;
+ }
+ } else if (i > 0 && rows[i - 1] == rows[i]) {
+ continue;
+ }
+ out.add_after_then_move(rows[i]);
+ }
+}
+
// Main entry point for Paragraph Detection Algorithm.
//
// Given a set of equally spaced textlines (described by row_infos),
// Split them into paragraphs.
//
// Output:
//   row_owners - one pointer for each row, to the paragraph it belongs to.
//   paragraphs - this is the actual list of PARA objects.
//   models - the list of paragraph models referenced by the PARA objects.
//            caller is responsible for deleting the models.
void DetectParagraphs(int debug_level,
                      std::vector<RowInfo> *row_infos,
                      GenericVector<PARA *> *row_owners,
                      PARA_LIST *paragraphs,
                      std::vector<ParagraphModel *> *models) {
  GenericVector<RowScratchRegisters> rows;
  ParagraphTheory theory(models);

  // Initialize row_owners to be a bunch of nullptr pointers.
  row_owners->init_to_size(row_infos->size(), nullptr);

  // Set up row scratch registers for the main algorithm.
  rows.init_to_size(row_infos->size(), RowScratchRegisters());
  for (int i = 0; i < row_infos->size(); i++) {
    rows[i].Init((*row_infos)[i]);
  }

  // Pass 1:
  //   Detect sequences of lines that all contain leader dots (.....)
  //   These are likely Tables of Contents.  If there are three text lines in
  //   a row with leader dots, it's pretty safe to say the middle one should
  //   be a paragraph of its own.
  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);

  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);

  GenericVector<Interval> leftovers;
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    // Pass 2a:
    //   Find any strongly evidenced start-of-paragraph lines.  If they're
    //   followed by two lines that look like body lines, make a paragraph
    //   model for that and see if that model applies throughout the text
    //   (that is, "smear" it).
    StrongEvidenceClassify(debug_level, &rows,
                           leftovers[i].begin, leftovers[i].end, &theory);

    // Pass 2b:
    //   If we had any luck in pass 2a, we got part of the page and didn't
    //   know how to classify a few runs of rows. Take the segments that
    //   didn't find a model and reprocess them individually.
    GenericVector<Interval> leftovers2;
    LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
    // Pass 2a was useful if it shrank the unclassified region at all, i.e.
    // the leftovers no longer cover the entire original interval.
    bool pass2a_was_useful = leftovers2.size() > 1 ||
        (leftovers2.size() == 1 &&
         (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
    if (pass2a_was_useful) {
      for (int j = 0; j < leftovers2.size(); j++) {
        StrongEvidenceClassify(debug_level, &rows,
                               leftovers2[j].begin, leftovers2[j].end,
                               &theory);
      }
    }
  }

  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);

  // Pass 3:
  //   These are the dregs for which we didn't have enough strong textual
  //   and geometric clues to form matching models for.  Let's see if
  //   the geometric clues are simple enough that we could just use those.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    GeometricClassify(debug_level, &rows,
                      leftovers[i].begin, leftovers[i].end, &theory);
  }

  // Undo any flush models for which there's little evidence.
  DowngradeWeakestToCrowns(debug_level, &theory, &rows);

  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);

  // Pass 4:
  //   Take everything that's still not marked up well and clear all markings.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
      rows[j].SetUnknown();
    }
  }

  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);

  // Convert all of the unique hypothesis runs to PARAs.
  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
                                           &theory);

  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);

  // Finally, clean up any dangling nullptr row paragraph parents.
  CanonicalizeDetectionResults(row_owners, paragraphs);
}
+
+// ============ Code interfacing with the rest of Tesseract ==================
+
// Fill in the text and bounding-box fields of *info from an iterator
// positioned on a text line, before any word recognition has run.
// Since no recognized text exists yet, each word is faked as "x"s so the
// downstream heuristics have something of roughly the right shape to chew on.
static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
                                                 RowInfo *info) {
  // Set up text, lword_text, and rword_text (mostly for debug printing).
  STRING fake_text;
  PageIterator pit(static_cast<const PageIterator&>(it));
  bool first_word = true;
  if (!pit.Empty(RIL_WORD)) {
    do {
      // One "x" per symbol; rword_text is reset at each word boundary so it
      // ends up holding only the last word's fake text.
      fake_text += "x";
      if (first_word) info->lword_text += "x";
      info->rword_text += "x";
      if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
          !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
        fake_text += " ";
        info->rword_text = "";
        first_word = false;
      }
    } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
             pit.Next(RIL_SYMBOL));
  }
  if (fake_text.size() == 0) return;

  // Prefix the fake text with spaces approximating the left indent.
  // NOTE(review): assumes average_interword_space >= 1 (it is set that way
  // in InitializeRowInfo) -- confirm no other caller passes 0.
  int lspaces = info->pix_ldistance / info->average_interword_space;
  for (int i = 0; i < lspaces; i++) {
    info->text += ' ';
  }
  info->text += fake_text;

  // Set up lword_box, rword_box, and num_words.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();

  // Walk the words of this row, remembering the first and last.
  WERD_RES *lword = nullptr;
  WERD_RES *rword = nullptr;
  info->num_words = 0;
  do {
    if (word_res) {
      if (!lword) lword = word_res;
      if (rword != word_res) info->num_words++;
      rword = word_res;
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);

  if (lword) info->lword_box = lword->word->bounding_box();
  if (rword) info->rword_box = rword->word->bounding_box();
}
+
+
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
// detector RowInfo with all relevant information from the row.
static void InitializeRowInfo(bool after_recognition,
                              const MutableIterator &it, RowInfo *info) {
  // Geometric fields come from the underlying ROW when one exists;
  // otherwise fall back to harmless defaults.
  if (it.PageResIt()->row() != nullptr) {
    ROW *row = it.PageResIt()->row()->row;
    info->pix_ldistance = row->lmargin();
    info->pix_rdistance = row->rmargin();
    // Guarantee a positive interword space (used as a divisor below).
    info->average_interword_space =
        row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
    info->pix_xheight = row->x_height();
    info->has_leaders = false;
    info->has_drop_cap = row->has_drop_cap();
    info->ltr = true;  // set below depending on word scripts
  } else {
    info->pix_ldistance = info->pix_rdistance = 0;
    info->average_interword_space = 1;
    info->pix_xheight = 1.0;
    info->has_leaders = false;
    info->has_drop_cap = false;
    info->ltr = true;
  }

  // Reset the textual signal fields to their defaults.
  info->num_words = 0;
  info->lword_indicates_list_item = false;
  info->lword_likely_starts_idea = false;
  info->lword_likely_ends_idea = false;
  info->rword_indicates_list_item = false;
  info->rword_likely_starts_idea = false;
  info->rword_likely_ends_idea = false;
  info->has_leaders = false;
  info->ltr = true;

  // Before recognition there is no real text; fake it and return.
  if (!after_recognition) {
    InitializeTextAndBoxesPreRecognition(it, info);
    return;
  }
  info->text = "";
  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
  int trailing_ws_idx = strlen(text.get());  // strip trailing space
  while (trailing_ws_idx > 0 &&
         // isspace() only takes ASCII
         isascii(text[trailing_ws_idx - 1]) &&
         isspace(text[trailing_ws_idx - 1]))
    trailing_ws_idx--;
  if (trailing_ws_idx > 0) {
    // Prefix spaces approximating the left indent, then the stripped text.
    int lspaces = info->pix_ldistance / info->average_interword_space;
    for (int i = 0; i < lspaces; i++)
      info->text += ' ';
    for (int i = 0; i < trailing_ws_idx; i++)
      info->text += text[i];
  }

  if (info->text.size() == 0) {
    return;
  }

  // Collect the row's recognized words, counting LTR/RTL characters and
  // leader (repeated-character) words along the way.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  GenericVector<WERD_RES *> werds;
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();
  int num_leaders = 0;
  int ltr = 0;
  int rtl = 0;
  do {
    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
      werds.push_back(word_res);
      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
      if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);
  // Majority vote on direction; ties count as left-to-right.
  info->ltr = ltr >= rtl;
  info->has_leaders = num_leaders > 3;
  info->num_words = werds.size();
  if (!werds.empty()) {
    // Derive the list-item / idea-boundary signals from the edge words.
    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
    info->lword_text = lword->best_choice->unichar_string().c_str();
    info->rword_text = rword->best_choice->unichar_string().c_str();
    info->lword_box = lword->word->bounding_box();
    info->rword_box = rword->word->bounding_box();
    LeftWordAttributes(lword->uch_set, lword->best_choice,
                       info->lword_text,
                       &info->lword_indicates_list_item,
                       &info->lword_likely_starts_idea,
                       &info->lword_likely_ends_idea);
    RightWordAttributes(rword->uch_set, rword->best_choice,
                        info->rword_text,
                        &info->rword_indicates_list_item,
                        &info->rword_likely_starts_idea,
                        &info->rword_likely_ends_idea);
  }
}
+
+// This is called after rows have been identified and words are recognized.
+// Much of this could be implemented before word recognition, but text helps
+// to identify bulleted lists and gives good signals for sentence boundaries.
+void DetectParagraphs(int debug_level,
+ bool after_text_recognition,
+ const MutableIterator *block_start,
+ std::vector<ParagraphModel *> *models) {
+ // Clear out any preconceived notions.
+ if (block_start->Empty(RIL_TEXTLINE)) {
+ return;
+ }
+ BLOCK *block = block_start->PageResIt()->block()->block;
+ block->para_list()->clear();
+ bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
+
+ // Convert the Tesseract structures to RowInfos
+ // for the paragraph detection algorithm.
+ MutableIterator row(*block_start);
+ if (row.Empty(RIL_TEXTLINE))
+ return; // end of input already.
+
+ std::vector<RowInfo> row_infos;
+ do {
+ if (!row.PageResIt()->row())
+ continue; // empty row.
+ row.PageResIt()->row()->row->set_para(nullptr);
+ row_infos.push_back(RowInfo());
+ RowInfo &ri = row_infos.back();
+ InitializeRowInfo(after_text_recognition, row, &ri);
+ } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
+ row.Next(RIL_TEXTLINE));
+
+ // If we're called before text recognition, we might not have
+ // tight block bounding boxes, so trim by the minimum on each side.
+ if (!row_infos.empty()) {
+ int min_lmargin = row_infos[0].pix_ldistance;
+ int min_rmargin = row_infos[0].pix_rdistance;
+ for (int i = 1; i < row_infos.size(); i++) {
+ if (row_infos[i].pix_ldistance < min_lmargin)
+ min_lmargin = row_infos[i].pix_ldistance;
+ if (row_infos[i].pix_rdistance < min_rmargin)
+ min_rmargin = row_infos[i].pix_rdistance;
+ }
+ if (min_lmargin > 0 || min_rmargin > 0) {
+ for (int i = 0; i < row_infos.size(); i++) {
+ row_infos[i].pix_ldistance -= min_lmargin;
+ row_infos[i].pix_rdistance -= min_rmargin;
+ }
+ }
+ }
+
+ // Run the paragraph detection algorithm.
+ GenericVector<PARA *> row_owners;
+ GenericVector<PARA *> the_paragraphs;
+ if (!is_image_block) {
+ DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
+ models);
+ } else {
+ row_owners.init_to_size(row_infos.size(), nullptr);
+ CanonicalizeDetectionResults(&row_owners, block->para_list());
+ }
+
+ // Now stitch in the row_owners into the rows.
+ row = *block_start;
+ for (int i = 0; i < row_owners.size(); i++) {
+ while (!row.PageResIt()->row())
+ row.Next(RIL_TEXTLINE);
+ row.PageResIt()->row()->row->set_para(row_owners[i]);
+ row.Next(RIL_TEXTLINE);
+ }
+}
+
+} // namespace
diff --git a/tesseract/src/ccmain/paragraphs.h b/tesseract/src/ccmain/paragraphs.h
new file mode 100644
index 00000000..edf9b8cc
--- /dev/null
+++ b/tesseract/src/ccmain/paragraphs.h
@@ -0,0 +1,110 @@
+/**********************************************************************
+ * File: paragraphs.h
+ * Description: Paragraph Detection data structures.
+ * Author: David Eger
+ * Created: 25 February 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_H_
+
+#include "rect.h" // for TBOX
+#include "strngs.h" // for STRING
+#include <list>
+
+namespace tesseract {
+
+class MutableIterator;
+class ParagraphModel;
+class PARA_LIST;
+struct PARA;
+
+template <typename T> class GenericVector;
+
// This structure captures all information needed about a text line for the
// purposes of paragraph detection.  It is meant to be exceedingly light-weight
// so that we can easily test paragraph detection independent of the rest of
// Tesseract.
class RowInfo {
 public:
  // Constant data derived from Tesseract output.
  STRING text;       // the full UTF-8 text of the line.
  bool ltr;          // whether the majority of the text is left-to-right
                     // TODO(eger) make this more fine-grained.

  bool has_leaders;  // does the line contain leader dots (.....)?
  bool has_drop_cap; // does the line have a drop cap?
  int pix_ldistance; // distance to the left pblock boundary in pixels
  int pix_rdistance; // distance to the right pblock boundary in pixels
  float pix_xheight; // guessed xheight for the line
  int average_interword_space; // average space between words in pixels.

  int num_words;     // number of recognized (non-empty) words on the line
  TBOX lword_box;    // bounding box of the leftmost word,
                     // in normalized (horiz text rows) space
  TBOX rword_box;    // bounding box of the rightmost word,
                     // in normalized (horiz text rows) space

  STRING lword_text; // the UTF-8 text of the leftmost werd
  STRING rword_text; // the UTF-8 text of the rightmost werd

  // The text of a paragraph typically starts with the start of an idea and
  // ends with the end of an idea.  Here we define paragraph as something that
  // may have a first line indent and a body indent which may be different.
  // Typical words that start an idea are:
  //   1. Words in western scripts that start with
  //      a capital letter, for example "The"
  //   2. Bulleted or numbered list items, for
  //      example "2."
  // Typical words which end an idea are words ending in punctuation marks. In
  // this vocabulary, each list item is represented as a paragraph.
  bool lword_indicates_list_item; // leftmost word looks like a list bullet/number
  bool lword_likely_starts_idea;  // leftmost word looks like an idea start
  bool lword_likely_ends_idea;    // leftmost word looks like an idea end

  bool rword_indicates_list_item; // rightmost word looks like a list bullet/number
  bool rword_likely_starts_idea;  // rightmost word looks like an idea start
  bool rword_likely_ends_idea;    // rightmost word looks like an idea end
};
+
+// Main entry point for Paragraph Detection Algorithm.
+//
+// Given a set of equally spaced textlines (described by row_infos),
+// Split them into paragraphs. See http://goto/paragraphstalk
+//
+// Output:
+// row_owners - one pointer for each row, to the paragraph it belongs to.
+// paragraphs - this is the actual list of PARA objects.
+// models - the list of paragraph models referenced by the PARA objects.
+// caller is responsible for deleting the models.
+TESS_API
+void DetectParagraphs(int debug_level,
+ std::vector<RowInfo> *row_infos,
+ GenericVector<PARA *> *row_owners,
+ PARA_LIST *paragraphs,
+ std::vector<ParagraphModel *> *models);
+
+// Given a MutableIterator to the start of a block, run DetectParagraphs on
+// that block and commit the results to the underlying ROW and BLOCK structs,
+// saving the ParagraphModels in models. Caller owns the models.
+// We use unicharset during the function to answer questions such as "is the
+// first letter of this word upper case?"
+TESS_API
+void DetectParagraphs(int debug_level,
+ bool after_text_recognition,
+ const MutableIterator *block_start,
+ std::vector<ParagraphModel *> *models);
+
+} // namespace
+
+#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
diff --git a/tesseract/src/ccmain/paragraphs_internal.h b/tesseract/src/ccmain/paragraphs_internal.h
new file mode 100644
index 00000000..be1e2c9b
--- /dev/null
+++ b/tesseract/src/ccmain/paragraphs_internal.h
@@ -0,0 +1,314 @@
+/**********************************************************************
+ * File: paragraphs_internal.h
+ * Description: Paragraph Detection internal data structures.
+ * Author: David Eger
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+
+#include "paragraphs.h"
+#include <tesseract/publictypes.h> // for ParagraphJustification
+
+// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
+// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
+
+namespace tesseract {
+
+class UNICHARSET;
+class WERD_CHOICE;
+
+// Return whether the given word is likely to be a list item start word.
+TESS_API
+bool AsciiLikelyListItem(const STRING &word);
+
+// Return the first Unicode Codepoint from werd[pos].
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
+
+// Set right word attributes given either a unicharset and werd or a utf8
+// string.
+TESS_API
+void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+ const STRING &utf8,
+ bool *is_list, bool *starts_idea, bool *ends_idea);
+
+// Set left word attributes given either a unicharset and werd or a utf8 string.
+TESS_API
+void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+ const STRING &utf8,
+ bool *is_list, bool *starts_idea, bool *ends_idea);
+
+// Role a single text line plays in paragraph structure. The enumerator
+// values are printable characters (presumably so hypotheses read well in
+// debug dumps -- confirm against the debug output code in paragraphs.cpp).
+enum LineType {
+  LT_START = 'S',     // First line of a paragraph.
+  LT_BODY = 'C',      // Continuation line of a paragraph.
+  LT_UNKNOWN = 'U',   // No clues.
+  LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
+};
+
+// The first paragraph in a page of body text is often un-indented.
+// This is a typographic convention which is common to indicate either that:
+// (1) The paragraph is the continuation of a previous paragraph, or
+// (2) The paragraph is the first paragraph in a chapter.
+//
+// I refer to such paragraphs as "crown"s, and the output of the paragraph
+// detection algorithm attempts to give them the same paragraph model as
+// the rest of the body text.
+//
+// Nonetheless, while building hypotheses, it is useful to mark the lines
+// of crown paragraphs temporarily as crowns, either aligned left or right.
+extern const ParagraphModel *kCrownLeft;
+extern const ParagraphModel *kCrownRight;
+
+// A model is "strong" when it is a real paragraph model: non-null and
+// not one of the crown sentinel markers declared above.
+inline bool StrongModel(const ParagraphModel *model) {
+  if (model == nullptr) return false;
+  return model != kCrownLeft && model != kCrownRight;
+}
+
+// A hypothesis about a single text line: which role (LineType) it plays
+// under which paragraph model. model is a non-owning pointer and may be
+// nullptr or one of the kCrown* sentinels.
+struct LineHypothesis {
+  LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
+  LineHypothesis(LineType line_type, const ParagraphModel *m)
+      : ty(line_type), model(m) {}
+
+  // The members are a plain enum and a non-owning pointer, so the
+  // compiler-generated copy operations are correct (rule of zero); the
+  // previous hand-written copy constructor and copy assignment operator
+  // were redundant and are replaced with explicit defaults.
+  LineHypothesis(const LineHypothesis &other) = default;
+  LineHypothesis &operator=(const LineHypothesis &other) = default;
+
+  bool operator==(const LineHypothesis &other) const {
+    return ty == other.ty && model == other.model;
+  }
+
+  LineType ty;
+  const ParagraphModel *model;
+};
+
+class ParagraphTheory; // Forward Declaration
+
+using SetOfModels = GenericVector<const ParagraphModel *>;
+
+// Row Scratch Registers are data generated by the paragraph detection
+// algorithm based on a RowInfo input.
+class RowScratchRegisters {
+ public:
+  // We presume row will outlive us.
+  void Init(const RowInfo &row);
+
+  // Line type under the union of all recorded hypotheses.
+  LineType GetLineType() const;
+
+  // Line type considering only hypotheses involving the given model.
+  LineType GetLineType(const ParagraphModel *model) const;
+
+  // Mark this as a start line type, sans model. This is useful for the
+  // initial marking of probable body lines or paragraph start lines.
+  void SetStartLine();
+
+  // Mark this as a body line type, sans model. This is useful for the
+  // initial marking of probable body lines or paragraph start lines.
+  void SetBodyLine();
+
+  // Record that this row fits as a paragraph start line in the given model.
+  void AddStartLine(const ParagraphModel *model);
+  // Record that this row fits as a paragraph body line in the given model.
+  void AddBodyLine(const ParagraphModel *model);
+
+  // Clear all hypotheses about this line.
+  void SetUnknown() { hypotheses_.truncate(0); }
+
+  // Append all hypotheses of strong models that match this row as a start.
+  void StartHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses of strong models matching this row.
+  void StrongHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses for this row.
+  void NonNullHypotheses(SetOfModels *models) const;
+
+  // Discard any hypotheses whose model is not in the given list.
+  void DiscardNonMatchingHypotheses(const SetOfModels &models);
+
+  // If we have only one hypothesis and that is that this line is a paragraph
+  // start line of a certain model, return that model. Else return nullptr.
+  const ParagraphModel *UniqueStartHypothesis() const;
+
+  // If we have only one hypothesis and that is that this line is a paragraph
+  // body line of a certain model, return that model. Else return nullptr.
+  const ParagraphModel *UniqueBodyHypothesis() const;
+
+  // Return the indentation for the side opposite of the aligned side.
+  // For unknown/centered justification the larger of the two is returned.
+  int OffsideIndent(tesseract::ParagraphJustification just) const {
+    switch (just) {
+      case tesseract::JUSTIFICATION_RIGHT: return lindent_;
+      case tesseract::JUSTIFICATION_LEFT: return rindent_;
+      default: return lindent_ > rindent_ ? lindent_ : rindent_;
+    }
+  }
+
+  // Return the indentation for the side the text is aligned to.
+  // For unknown/centered justification the larger of the two is returned.
+  int AlignsideIndent(tesseract::ParagraphJustification just) const {
+    switch (just) {
+      case tesseract::JUSTIFICATION_RIGHT: return rindent_;
+      case tesseract::JUSTIFICATION_LEFT: return lindent_;
+      default: return lindent_ > rindent_ ? lindent_ : rindent_;
+    }
+  }
+
+  // Append header fields to a vector of row headings.
+  static void AppendDebugHeaderFields(std::vector<STRING> *header);
+
+  // Append data for this row to a vector of debug strings.
+  void AppendDebugInfo(const ParagraphTheory &theory,
+                       std::vector<STRING> *dbg) const;
+
+  // Non-owning pointer to the input row; set by Init().
+  const RowInfo *ri_;
+
+  // These four constants form a horizontal box model for the white space
+  // on the edges of each line. At each point in the algorithm, the following
+  // shall hold:
+  //   ri_->pix_ldistance = lmargin_ + lindent_
+  //   ri_->pix_rdistance = rindent_ + rmargin_
+  int lmargin_;
+  int lindent_;
+  int rindent_;
+  int rmargin_;
+
+ private:
+  // Hypotheses of either LT_START or LT_BODY
+  GenericVector<LineHypothesis> hypotheses_;
+};
+
+// A collection of convenience functions for wrapping the set of
+// Paragraph Models we believe correctly model the paragraphs in the image.
+class ParagraphTheory {
+ public:
+  // We presume models will outlive us, and that models will take ownership
+  // of any ParagraphModel *'s we add.
+  explicit ParagraphTheory(std::vector<ParagraphModel *> *models)
+      : models_(models) {}
+  std::vector<ParagraphModel *> &models() { return *models_; }
+  const std::vector<ParagraphModel *> &models() const { return *models_; }
+
+  // Return an existing model if one that is Comparable() can be found.
+  // Else, allocate a new copy of model to save and return a pointer to it.
+  const ParagraphModel *AddModel(const ParagraphModel &model);
+
+  // Discard any models we've made that are not in the list of used models.
+  void DiscardUnusedModels(const SetOfModels &used_models);
+
+  // Return the set of all non-centered models.
+  void NonCenteredModels(SetOfModels *models);
+
+  // If any of the non-centered paragraph models we know about fit
+  // rows[start, end), return it. Else nullptr.
+  const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
+                             int start, int end) const;
+
+  // Position of model within models(); presumably a sentinel for
+  // "not found" -- confirm against the implementation in paragraphs.cpp.
+  int IndexOf(const ParagraphModel *model) const;
+
+ private:
+  // All models known to the theory; not owned (see constructor comment).
+  std::vector<ParagraphModel *> *models_;
+  // The subset of models() that this theory allocated itself; these are
+  // the candidates removed by DiscardUnusedModels().
+  GenericVector<ParagraphModel *> models_we_added_;
+};
+
+bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
+ int row, const ParagraphModel *model);
+bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
+ int row, const ParagraphModel *model);
+bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
+ int a, int b, const ParagraphModel *model);
+
+// A class for smearing Paragraph Model hypotheses to surrounding rows.
+// The idea here is that StrongEvidenceClassify first marks only exceedingly
+// obvious start and body rows and constructs models of them. Thereafter,
+// we may have left over unmarked lines (mostly end-of-paragraph lines) which
+// were too short to have much confidence about, but which fit the models we've
+// constructed perfectly and which we ought to mark. This class is used to
+// "smear" our models over the text.
+class ParagraphModelSmearer {
+ public:
+  // Operates on rows[row_start, row_end) using models from theory.
+  // Both pointers are non-owning; rows and theory must outlive us.
+  ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
+                        int row_start, int row_end,
+                        ParagraphTheory *theory);
+
+  // Smear forward paragraph models from existing row markings to subsequent
+  // text lines if they fit, and mark any thereafter still unmodeled rows
+  // with any model in the theory that fits them.
+  void Smear();
+
+ private:
+  // Record in open_models_ for rows [start_row, end_row) the list of models
+  // currently open at each row.
+  // A model is still open in a row if some previous row has said model as a
+  // start hypothesis, and all rows since (including this row) would fit as
+  // either a body or start line in that model.
+  void CalculateOpenModels(int row_start, int row_end);
+
+  // The +1 accounts for open_models_ covering one extra row before
+  // row_start_ (see the comment on open_models_ below).
+  SetOfModels &OpenModels(int row) {
+    return open_models_[row - row_start_ + 1];
+  }
+
+  ParagraphTheory *theory_;
+  GenericVector<RowScratchRegisters> *rows_;
+  int row_start_;
+  int row_end_;
+
+  // open_models_ corresponds to rows[start_row_ - 1, end_row_]
+  //
+  // open_models_:  Contains models which there was an active (open) paragraph
+  //                as of the previous line and for which the left and right
+  //                indents admit the possibility that this text line continues
+  //                to fit the same model.
+  // TODO(eger): Think about whether we can get rid of "Open" models and just
+  //             use the current hypotheses on RowScratchRegisters.
+  std::vector<SetOfModels> open_models_;
+};
+
+// Clear all hypotheses about lines [start, end) and reset the margins to the
+// percentile (0..100) value of the left and right row edges for this run of
+// rows.
+void RecomputeMarginsAndClearHypotheses(
+ GenericVector<RowScratchRegisters> *rows, int start, int end,
+ int percentile);
+
+// Return the median inter-word space in rows[row_start, row_end).
+int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
+ int row_start, int row_end);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+ const RowScratchRegisters &after,
+ tesseract::ParagraphJustification justification);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing the text alignment).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+ const RowScratchRegisters &after);
+
+// Do rows[start, end) form a single instance of the given paragraph model?
+bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
+ int start, int end, const ParagraphModel *model);
+
+// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(
+ GenericVector<PARA *> *row_owners,
+ PARA_LIST *paragraphs);
+
+} // namespace
+
+#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
diff --git a/tesseract/src/ccmain/paramsd.cpp b/tesseract/src/ccmain/paramsd.cpp
new file mode 100644
index 00000000..9c8b8990
--- /dev/null
+++ b/tesseract/src/ccmain/paramsd.cpp
@@ -0,0 +1,365 @@
+///////////////////////////////////////////////////////////////////////
+// File: paramsd.cpp
+// Description: Tesseract parameter Editor
+// Author: Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+// The parameters editor is used to edit all the parameters used within
+// tesseract from the ui.
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#ifndef GRAPHICS_DISABLED
+
+#include "paramsd.h"
+#include "params.h" // for ParamsVectors, StringParam, BoolParam
+#include "scrollview.h" // for SVEvent, ScrollView, SVET_POPUP
+#include "svmnode.h" // for SVMenuNode
+#include "tesseractclass.h" // for Tesseract
+
+#include "genericvector.h" // for GenericVector
+
+#include <cstdio>          // for fclose, fopen, fprintf, sprintf, FILE
+#include <cstdlib>         // for atoi
+#include <cstring>         // for strcmp, strcspn, strlen, strncpy
+#include <locale>          // for std::locale::classic
+#include <map>             // for map, _Rb_tree_iterator, map<>::iterator
+#include <memory>          // for unique_ptr
+#include <sstream>         // for std::stringstream
+#include <string>          // for std::string
+#include <utility>         // for pair
+
+namespace tesseract {
+
+#define VARDIR "configs/" /*parameters files */
+#define MAX_ITEMS_IN_SUBMENU 30
+
+// The following variables should remain static globals, since they
+// are used by debug editor, which uses a single Tesseract instance.
+//
+// Contains the mappings from unique VC ids to their actual pointers.
+static std::map<int, ParamContent*> vcMap;
+static int nrParams = 0;
+static int writeCommands[2];
+
+ELISTIZE(ParamContent)
+
+// Constructor for string parameters. Assigns the next sequential unique
+// id and registers this object in the global vcMap so it can later be
+// found by GetParamContentById().
+ParamContent::ParamContent(tesseract::StringParam* it) {
+  my_id_ = nrParams;
+  nrParams++;
+  param_type_ = VT_STRING;
+  sIt = it;
+  vcMap[my_id_] = this;
+}
+// Constructor for integer parameters; see the string overload above for
+// the id/registration scheme.
+ParamContent::ParamContent(tesseract::IntParam* it) {
+  my_id_ = nrParams;
+  nrParams++;
+  param_type_ = VT_INTEGER;
+  iIt = it;
+  vcMap[my_id_] = this;
+}
+// Constructor for boolean parameters.
+ParamContent::ParamContent(tesseract::BoolParam* it) {
+  my_id_ = nrParams;
+  nrParams++;
+  param_type_ = VT_BOOLEAN;
+  bIt = it;
+  vcMap[my_id_] = this;
+}
+// Constructor for double parameters.
+ParamContent::ParamContent(tesseract::DoubleParam* it) {
+  my_id_ = nrParams;
+  nrParams++;
+  param_type_ = VT_DOUBLE;
+  dIt = it;
+  vcMap[my_id_] = this;
+}
+
+// Gets a VC object identified by its ID, or nullptr if no parameter with
+// that ID has been registered. Uses find() so that looking up an unknown
+// ID does not default-insert a spurious null entry into vcMap (as the
+// previous operator[] lookup did).
+ParamContent* ParamContent::GetParamContentById(int id) {
+  auto it = vcMap.find(id);
+  return it == vcMap.end() ? nullptr : it->second;
+}
+
+// Copy the first N words from the source string to the target string.
+// Words are delimited by "_".
+// The target t must have room for at least strlen(s) + 1 characters.
+void ParamsEditor::GetFirstWords(
+                   const char *s,  // source string
+                   int n,          // number of words
+                   char *t         // target string
+                  ) {
+  int full_length = strlen(s);
+  int reqd_len = 0;              // No. of chars required
+  const char *next_word = s;
+
+  while ((n > 0) && reqd_len < full_length) {
+    reqd_len += strcspn(next_word, "_") + 1;
+    // Resume scanning right after the delimiter just consumed. The
+    // previous code did next_word += reqd_len, adding the *cumulative*
+    // prefix length on every iteration and thus skipping too far (and
+    // potentially past the end of s) whenever n >= 2.
+    next_word = s + reqd_len;
+    n--;
+  }
+  strncpy(t, s, reqd_len);
+  t[reqd_len] = '\0';  // ensure null terminal
+}
+
+// Getter for the name of the underlying parameter, dispatched on the
+// active union member. Returns an error marker for an invalid type tag.
+const char* ParamContent::GetName() const {
+  switch (param_type_) {
+    case VT_INTEGER: return iIt->name_str();
+    case VT_BOOLEAN: return bIt->name_str();
+    case VT_DOUBLE:  return dIt->name_str();
+    case VT_STRING:  return sIt->name_str();
+    default:         return "ERROR: ParamContent::GetName()";
+  }
+}
+
+// Getter for the description of the underlying parameter, dispatched on
+// the active union member. Returns nullptr for an invalid type tag.
+const char* ParamContent::GetDescription() const {
+  switch (param_type_) {
+    case VT_INTEGER: return iIt->info_str();
+    case VT_BOOLEAN: return bIt->info_str();
+    case VT_DOUBLE:  return dIt->info_str();
+    case VT_STRING:  return sIt->info_str();
+    default:         return nullptr;
+  }
+}
+
+// Getter for the value, rendered as a STRING.
+// Integers and booleans are rendered as decimal integers (booleans via
+// integer conversion, so 0/1); doubles use add_str_double formatting; a
+// string parameter with a null underlying buffer yields the literal "Null".
+STRING ParamContent::GetValue() const {
+  STRING result;
+  if (param_type_ == VT_INTEGER) {
+    result.add_str_int("", *iIt);
+  } else if (param_type_ == VT_BOOLEAN) {
+    result.add_str_int("", *bIt);
+  } else if (param_type_ == VT_DOUBLE) {
+    result.add_str_double("", *dIt);
+  } else if (param_type_ == VT_STRING) {
+    // Guard against a null char buffer before copying the string value.
+    if (STRING(*(sIt)).c_str() != nullptr) {
+      result = sIt->c_str();
+    } else {
+      result = "Null";
+    }
+  }
+  return result;
+}
+
+// Setter for the value: parses val according to the parameter's type and
+// marks this object changed_ so WriteParams(..., true) will include it.
+// TODO (wanke) Test if the values actually are properly converted.
+// (Quickly visible impacts?)
+void ParamContent::SetValue(const char* val) {
+  changed_ = true;
+  if (param_type_ == VT_INTEGER) {
+    iIt->set_value(atoi(val));
+  } else if (param_type_ == VT_BOOLEAN) {
+    // atoi: any nonzero integer text counts as true.
+    bIt->set_value(atoi(val));
+  } else if (param_type_ == VT_DOUBLE) {
+    std::stringstream stream(val);
+    // Use "C" locale for reading double value.
+    stream.imbue(std::locale::classic());
+    double d = 0;
+    stream >> d;
+    dIt->set_value(d);
+  } else if (param_type_ == VT_STRING) {
+    sIt->set_value(val);
+  }
+}
+
+// Gets the up to the first 3 prefixes from s (split by _).
+// For example, tesseract_foo_bar will be split into tesseract,foo and bar.
+// NOTE(review): the scratch buffer is a fixed 1024 bytes; this assumes no
+// parameter name is longer than 1023 characters -- confirm.
+void ParamsEditor::GetPrefixes(const char* s, STRING* level_one,
+                               STRING* level_two,
+                               STRING* level_three) {
+  std::unique_ptr<char[]> p(new char[1024]);
+  GetFirstWords(s, 1, p.get());
+  *level_one = p.get();
+  GetFirstWords(s, 2, p.get());
+  *level_two = p.get();
+  GetFirstWords(s, 3, p.get());
+  *level_three = p.get();
+}
+
+// Compare two VC objects by their name; qsort-style comparator used to
+// sort the parameter list alphabetically.
+int ParamContent::Compare(const void* v1, const void* v2) {
+  const auto* lhs = *static_cast<const ParamContent* const*>(v1);
+  const auto* rhs = *static_cast<const ParamContent* const*>(v2);
+  return strcmp(lhs->GetName(), rhs->GetName());
+}
+
+// Find all editable parameters used within tesseract and create a
+// SVMenuNode tree from it.
+// Menu layout: parameters whose first '_'-separated prefix is unique go
+// into an "OTHER" submenu; shared prefixes become submenus (and, for
+// crowded prefixes, sub-submenus on the second prefix).
+// TODO (wanke): This is actually sort of hackish.
+SVMenuNode* ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
+  auto* mr = new SVMenuNode();
+  ParamContent_LIST vclist;
+  ParamContent_IT vc_it(&vclist);
+  // Amount counts the number of entries for a specific prefix.
+  // Keyed by std::string: the previous std::map<const char*, int> compared
+  // pointer identity of c_str() results pointing into destroyed local
+  // STRING temporaries, so the counts were unreliable.
+  std::map<std::string, int> amount;
+
+  // Add all parameters (global, plus instance-specific if present) to a
+  // list.
+  int num_iterations = (tess->params() == nullptr) ? 1 : 2;
+  for (int v = 0; v < num_iterations; ++v) {
+    tesseract::ParamsVectors *vec = (v == 0) ? GlobalParams() : tess->params();
+    for (int i = 0; i < vec->int_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->int_params[i]));
+    }
+    for (int i = 0; i < vec->bool_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->bool_params[i]));
+    }
+    for (int i = 0; i < vec->string_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->string_params[i]));
+    }
+    for (int i = 0; i < vec->double_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->double_params[i]));
+    }
+  }
+
+  // Count the # of entries starting with a specific prefix.
+  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
+    ParamContent* vc = vc_it.data();
+    STRING tag;
+    STRING tag2;
+    STRING tag3;
+
+    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
+    amount[tag.c_str()]++;
+    amount[tag2.c_str()]++;
+    amount[tag3.c_str()]++;
+  }
+
+  vclist.sort(ParamContent::Compare);  // Sort the list alphabetically.
+
+  SVMenuNode* other = mr->AddChild("OTHER");
+
+  // Go through the list again and this time create the menu structure.
+  vc_it.move_to_first();
+  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
+    ParamContent* vc = vc_it.data();
+    STRING tag;
+    STRING tag2;
+    STRING tag3;
+    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
+
+    if (amount[tag.c_str()] == 1) {
+      // Unique first prefix: no submenu of its own.
+      other->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(),
+                      vc->GetDescription());
+    } else {  // More than one would use this submenu -> create submenu.
+      SVMenuNode* sv = mr->AddChild(tag.c_str());
+      if ((amount[tag.c_str()] <= MAX_ITEMS_IN_SUBMENU) ||
+          (amount[tag2.c_str()] <= 1)) {
+        sv->AddChild(vc->GetName(), vc->GetId(),
+                     vc->GetValue().c_str(), vc->GetDescription());
+      } else {  // Make subsubmenus.
+        SVMenuNode* sv2 = sv->AddChild(tag2.c_str());
+        sv2->AddChild(vc->GetName(), vc->GetId(),
+                      vc->GetValue().c_str(), vc->GetDescription());
+      }
+    }
+  }
+  return mr;
+}
+
+// Event listener. Waits for SVET_POPUP events and processes them.
+// A popup is either one of the two "write config file" commands (matched
+// against writeCommands) or an edit of a single parameter, in which case
+// the command id identifies the ParamContent to update.
+void ParamsEditor::Notify(const SVEvent* sve) {
+  if (sve->type == SVET_POPUP) {  // only catch SVET_POPUP!
+    char* param = sve->parameter;
+    if (sve->command_id == writeCommands[0]) {
+      WriteParams(param, false);  // Write all parameters.
+    } else if (sve->command_id == writeCommands[1]) {
+      WriteParams(param, true);  // Write changed parameters only.
+    } else {
+      ParamContent* vc = ParamContent::GetParamContentById(
+          sve->command_id);
+      vc->SetValue(param);
+      sv_window_->AddMessage("Setting %s to %s",
+                             vc->GetName(), vc->GetValue().c_str());
+    }
+  }
+}
+
+// Integrate the parameters editor as popupmenu into the existing scrollview
+// window (usually the pg editor). If sv == null, create a new empty
+// window and attach the parameters editor to that window (ugly).
+ParamsEditor::ParamsEditor(tesseract::Tesseract* tess,
+                           ScrollView* sv) {
+  if (sv == nullptr) {
+    const char* name = "ParamEditorMAIN";
+    sv = new ScrollView(name, 1, 1, 200, 200, 300, 200);
+  }
+
+  sv_window_ = sv;
+
+  //Only one event handler per window.
+  //sv->AddEventHandler((SVEventHandler*) this);
+  // NOTE(review): the event-handler registration above is commented out;
+  // without it this editor's Notify() is never called directly -- confirm
+  // whether the owning window (pg editor) forwards events instead.
+
+  SVMenuNode* svMenuRoot = BuildListOfAllLeaves(tess);
+
+  STRING paramfile;
+  paramfile = tess->datadir;
+  paramfile += VARDIR;  // parameters dir
+  paramfile += "edited";  // actual name
+
+  SVMenuNode* std_menu = svMenuRoot->AddChild ("Build Config File");
+
+  // Command ids above nrParams cannot collide with parameter ids, so
+  // Notify() can tell the write commands apart from parameter edits.
+  writeCommands[0] = nrParams+1;
+  std_menu->AddChild("All Parameters", writeCommands[0],
+                     paramfile.c_str(), "Config file name?");
+
+  writeCommands[1] = nrParams+2;
+  std_menu->AddChild ("changed_ Parameters Only", writeCommands[1],
+                      paramfile.c_str(), "Config file name?");
+
+  svMenuRoot->BuildMenu(sv, false);
+}
+
+
+// Write all (changed_) parameters to a config file as
+// "name value # description" lines. If the file already exists, the user
+// is asked via a ScrollView dialog whether to overwrite it. With
+// changes_only, only parameters modified through the editor are written.
+void ParamsEditor::WriteParams(char *filename,
+                               bool changes_only) {
+  char msg_str[255];
+  // If the file already exists, ask before clobbering it.
+  FILE* fp = fopen(filename, "rb");
+  if (fp != nullptr) {
+    fclose(fp);
+    // snprintf (not sprintf) so an overlong filename cannot overflow
+    // msg_str.
+    snprintf(msg_str, sizeof(msg_str),
+             "Overwrite file %s? (Y/N)", filename);
+    int a = sv_window_->ShowYesNoDialog(msg_str);
+    if (a == 'n') {
+      return;  // don't write
+    }
+  }
+
+  fp = fopen(filename, "wb");  // can we write to it?
+  if (fp == nullptr) {
+    sv_window_->AddMessage("Can't write to file %s", filename);
+    return;
+  }
+  for (auto& iter : vcMap) {
+    ParamContent* cur = iter.second;
+    if (!changes_only || cur->HasChanged()) {
+      fprintf(fp, "%-25s %-12s # %s\n",
+              cur->GetName(), cur->GetValue().c_str(), cur->GetDescription());
+    }
+  }
+  fclose(fp);
+}
+
+} // namespace tesseract
+
+#endif // !GRAPHICS_DISABLED
diff --git a/tesseract/src/ccmain/paramsd.h b/tesseract/src/ccmain/paramsd.h
new file mode 100644
index 00000000..c8019c1c
--- /dev/null
+++ b/tesseract/src/ccmain/paramsd.h
@@ -0,0 +1,134 @@
+///////////////////////////////////////////////////////////////////////
+// File: paramsd.h
+// Description: Tesseract parameter editor
+// Author: Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+// Tesseract parameter editor is used to edit all the parameters used
+// within tesseract from the ui.
+#ifndef TESSERACT_CCMAIN_PARAMSD_H_
+#define TESSERACT_CCMAIN_PARAMSD_H_
+
+#ifndef GRAPHICS_DISABLED
+
+#include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
+#include "scrollview.h" // for ScrollView (ptr only), SVEvent (ptr only)
+#include "strngs.h" // for STRING
+
+namespace tesseract {
+
+class SVMenuNode;
+
+class BoolParam;
+class DoubleParam;
+class IntParam;
+class StringParam;
+class Tesseract;
+
+// A list of all possible parameter types used. Each value selects the
+// corresponding active member of the union in ParamContent below.
+enum ParamType {
+  VT_INTEGER,   // tesseract::IntParam
+  VT_BOOLEAN,   // tesseract::BoolParam
+  VT_STRING,    // tesseract::StringParam
+  VT_DOUBLE     // tesseract::DoubleParam
+};
+
+// A rather hackish helper structure which can take any kind of parameter input
+// (defined by ParamType) and do a couple of common operations on them, like
+// comparisons or getting its value. It is used in the context of the
+// ParamsEditor as a bridge from the internal tesseract parameters to the
+// ones displayed by the ScrollView server.
+class ParamContent : public ELIST_LINK {
+ public:
+  // Compare two VC objects by their name.
+  static int Compare(const void* v1, const void* v2);
+
+  // Gets a VC object identified by its ID.
+  static ParamContent* GetParamContentById(int id);
+
+  // Constructors for the various ParamTypes.
+  ParamContent() = default;
+  explicit ParamContent(tesseract::StringParam* it);
+  explicit ParamContent(tesseract::IntParam* it);
+  explicit ParamContent(tesseract::BoolParam* it);
+  explicit ParamContent(tesseract::DoubleParam* it);
+
+
+  // Getters and Setters.
+  void SetValue(const char* val);
+  STRING GetValue() const;
+  const char* GetName() const;
+  const char* GetDescription() const;
+
+  int GetId() { return my_id_; }
+  bool HasChanged() { return changed_; }
+
+ private:
+  // The unique ID of this VC object. -1 until one of the typed
+  // constructors registers the object (the default constructor exists
+  // only for list bookkeeping and previously left this uninitialized).
+  int my_id_ = -1;
+  // Whether the parameter was changed_ and thus needs to be rewritten.
+  bool changed_ = false;
+  // The actual ParamType of this VC object; selects the active union
+  // member below.
+  ParamType param_type_ = VT_STRING;
+
+  // Non-owning pointer to the underlying parameter, discriminated by
+  // param_type_. The initializer gives default-constructed objects a
+  // well-defined (null) value.
+  union {
+    tesseract::StringParam* sIt = nullptr;
+    tesseract::IntParam* iIt;
+    tesseract::BoolParam* bIt;
+    tesseract::DoubleParam* dIt;
+  };
+};
+
+ELISTIZEH(ParamContent)
+
+// The parameters editor enables the user to edit all the parameters used within
+// tesseract. It can be invoked on its own, but is supposed to be invoked by
+// the program editor.
+class ParamsEditor : public SVEventHandler {
+ public:
+  // Integrate the parameters editor as popupmenu into the existing scrollview
+  // window (usually the pg editor). If sv == null, create a new empty
+  // window and attach the parameter editor to that window (ugly).
+  explicit ParamsEditor(tesseract::Tesseract*, ScrollView* sv = nullptr);
+
+  // Event listener. Waits for SVET_POPUP events and processes them.
+  void Notify(const SVEvent* sve) override;
+
+ private:
+  // Gets the up to the first 3 prefixes from s (split by _).
+  // For example, tesseract_foo_bar will be split into tesseract,foo and bar.
+  void GetPrefixes(const char* s, STRING* level_one,
+                   STRING* level_two, STRING* level_three);
+
+  // Gets the first n words (split by _) and puts them in t.
+  // For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
+  void GetFirstWords(const char *s,  // source string
+                     int n,          // number of words
+                     char *t);       // target string
+
+  // Find all editable parameters used within tesseract and create a
+  // SVMenuNode tree from it.
+  SVMenuNode *BuildListOfAllLeaves(tesseract::Tesseract *tess);
+
+  // Write all (changed_) parameters to a config file.
+  void WriteParams(char* filename, bool changes_only);
+
+  // The ScrollView window this editor's popup menu is attached to.
+  ScrollView* sv_window_;
+};
+
+} // namespace tesseract
+
+#endif // !GRAPHICS_DISABLED
+#endif // TESSERACT_CCMAIN_PARAMSD_H_
diff --git a/tesseract/src/ccmain/pgedit.cpp b/tesseract/src/ccmain/pgedit.cpp
new file mode 100644
index 00000000..b00b5f64
--- /dev/null
+++ b/tesseract/src/ccmain/pgedit.cpp
@@ -0,0 +1,981 @@
+/**********************************************************************
+ * File: pgedit.cpp (Formerly pgeditor.c)
+ * Description: Page structure file editor
+ * Author: Phil Cheatle
+ *
+ *(C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0(the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http:// www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "pgedit.h"
+
+#include "blread.h"
+#include "control.h"
+#include "paramsd.h"
+#include "pageres.h"
+#include "tordmain.h"
+#include "scrollview.h"
+#include "svmnode.h"
+#include "statistc.h"
+#include "tesseractclass.h"
+#include "werdit.h"
+
#include <cctype>
#include <cmath>
#include <cstdio>
+
+#ifndef GRAPHICS_DISABLED
+namespace tesseract {
+#define ASC_HEIGHT (2 * kBlnBaselineOffset + kBlnXHeight)
+#define X_HEIGHT (kBlnBaselineOffset + kBlnXHeight)
+#define BL_HEIGHT kBlnBaselineOffset
+#define DESC_HEIGHT 0
+
// Identifiers for every entry in the page editor's popup menu.  Dispatched
// by process_cmd_win_event(); the "mode" commands are then consumed by
// process_image_event() when the user clicks/selects in the image window.
enum CMD_EVENTS
{
  NULL_CMD_EVENT,
  CHANGE_DISP_CMD_EVENT,
  DUMP_WERD_CMD_EVENT,
  SHOW_POINT_CMD_EVENT,
  SHOW_BLN_WERD_CMD_EVENT,
  DEBUG_WERD_CMD_EVENT,
  BLAMER_CMD_EVENT,
  BOUNDING_BOX_CMD_EVENT,
  CORRECT_TEXT_CMD_EVENT,
  POLYGONAL_CMD_EVENT,
  BL_NORM_CMD_EVENT,
  BITMAP_CMD_EVENT,
  IMAGE_CMD_EVENT,
  BLOCKS_CMD_EVENT,
  BASELINES_CMD_EVENT,
  UNIFORM_DISP_CMD_EVENT,
  REFRESH_CMD_EVENT,
  QUIT_CMD_EVENT,
  RECOG_WERDS,
  RECOG_PSEUDO,
  SHOW_BLOB_FEATURES,
  SHOW_SUBSCRIPT_CMD_EVENT,
  SHOW_SUPERSCRIPT_CMD_EVENT,
  SHOW_ITALIC_CMD_EVENT,
  SHOW_BOLD_CMD_EVENT,
  SHOW_UNDERLINE_CMD_EVENT,
  SHOW_FIXEDPITCH_CMD_EVENT,
  SHOW_SERIF_CMD_EVENT,
  SHOW_SMALLCAPS_CMD_EVENT,
  SHOW_DROPCAPS_CMD_EVENT,
};

// Word coloration used by word_display(): CM_RAINBOW is the normal
// per-blob rainbow display; every other mode outlines blobs in red when
// they carry the corresponding attribute (subscript, bold, ...), green
// otherwise.
enum ColorationMode {
  CM_RAINBOW,
  CM_SUBSCRIPT,
  CM_SUPERSCRIPT,
  CM_ITALIC,
  CM_BOLD,
  CM_UNDERLINE,
  CM_FIXEDPITCH,
  CM_SERIF,
  CM_SMALLCAPS,
  CM_DROPCAPS
};
+
+/*
+ *
+ * Some global data
+ *
+ */
+
// Main editor window; (re)created by build_image_window().
static ScrollView* image_win;
// Parameters editor attached to image_win; created in pgeditor_main().
static ParamsEditor* pe;
// True while the editor event loop should keep running; cleared on SVET_EXIT.
static bool stillRunning = false;

static ScrollView* bln_word_window = nullptr; // baseline norm words

static CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT; // selected words op

static bool recog_done = false; // recog_all_words was called

// These variables should remain global, since they are only used for the
// debug mode (in which only a single Tesseract thread/instance will exist).
static BITS16 word_display_mode;
static ColorationMode color_mode = CM_RAINBOW;
static bool display_image = false;
static bool display_blocks = false;
static bool display_baselines = false;

// Page currently being edited; set by pgeditor_main().
static PAGE_RES *current_page_res = nullptr;

STRING_VAR(editor_image_win_name, "EditorImage",
           "Editor image window name");
INT_VAR(editor_image_xpos, 590, "Editor image X Pos");
INT_VAR(editor_image_ypos, 10, "Editor image Y Pos");
// File-local: only build_image_window() needs the menu-bar allowance.
static INT_VAR(editor_image_menuheight, 50, "Add to image height for menu bar");
INT_VAR(editor_image_word_bb_color, ScrollView::BLUE,
        "Word bounding box colour");
INT_VAR(editor_image_blob_bb_color, ScrollView::YELLOW,
        "Blob bounding box colour");
INT_VAR(editor_image_text_color, ScrollView::WHITE,
        "Correct text colour");

STRING_VAR(editor_dbwin_name, "EditorDBWin",
           "Editor debug window name");
INT_VAR(editor_dbwin_xpos, 50, "Editor debug window X Pos");
INT_VAR(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
INT_VAR(editor_dbwin_height, 24, "Editor debug window height");
INT_VAR(editor_dbwin_width, 80, "Editor debug window width");

STRING_VAR(editor_word_name, "BlnWords", "BL normalized word window");
INT_VAR(editor_word_xpos, 60, "Word window X Pos");
INT_VAR(editor_word_ypos, 510, "Word window Y Pos");
INT_VAR(editor_word_height, 240, "Word window height");
INT_VAR(editor_word_width, 655, "Word window width");
+
+/**
+ * show_point()
+ *
+ * Show coords of point, blob bounding box, word bounding box and offset from
+ * row baseline
+ */
+
+static void show_point(PAGE_RES* page_res, float x, float y) {
+ FCOORD pt(x, y);
+ PAGE_RES_IT pr_it(page_res);
+
+ const int kBufsize = 512;
+ char msg[kBufsize];
+ char *msg_ptr = msg;
+
+ msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);
+
+ for (WERD_RES* word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
+ if (pr_it.row() != pr_it.prev_row() &&
+ pr_it.row()->row->bounding_box().contains(pt)) {
+ msg_ptr += sprintf(msg_ptr, "BL(x)=%0.3f ",
+ pr_it.row()->row->base_line(x));
+ }
+ if (word->word->bounding_box().contains(pt)) {
+ TBOX box = word->word->bounding_box();
+ msg_ptr += sprintf(msg_ptr, "Wd(%d, %d)/(%d, %d) ",
+ box.left(), box.bottom(),
+ box.right(), box.top());
+ C_BLOB_IT cblob_it(word->word->cblob_list());
+ for (cblob_it.mark_cycle_pt();
+ !cblob_it.cycled_list();
+ cblob_it.forward()) {
+ C_BLOB* cblob = cblob_it.data();
+ box = cblob->bounding_box();
+ if (box.contains(pt)) {
+ msg_ptr += sprintf(msg_ptr,
+ "CBlb(%d, %d)/(%d, %d) ",
+ box.left(), box.bottom(),
+ box.right(), box.top());
+ }
+ }
+ }
+ }
+ image_win->AddMessage(msg);
+}
+
+/**
+ * pgeditor_msg()
+ *
+ * Display a message - in the command window if there is one, or to stdout
+ */
+
+static void pgeditor_msg( // message display
+ const char *msg) {
+ image_win->AddMessage(msg);
+}
+
+class BlnEventHandler : public SVEventHandler {
+ public:
+ void Notify(const SVEvent* sv_event) override {
+ if (sv_event->type == SVET_DESTROY)
+ bln_word_window = nullptr;
+ else if (sv_event->type == SVET_CLICK)
+ show_point(current_page_res, sv_event->x, sv_event->y);
+ }
+};
+
+/**
+ * bln_word_window_handle()
+ *
+ * @return a WINDOW for the word window, creating it if necessary
+ */
+static ScrollView* bln_word_window_handle() { // return handle
+ // not opened yet
+ if (bln_word_window == nullptr) {
+ pgeditor_msg("Creating BLN word window...");
+ bln_word_window = new ScrollView(editor_word_name.c_str(),
+ editor_word_xpos, editor_word_ypos, editor_word_width,
+ editor_word_height, 4000, 4000, true);
+ auto* a = new BlnEventHandler();
+ bln_word_window->AddEventHandler(a);
+ pgeditor_msg("Creating BLN word window...Done");
+ }
+ return bln_word_window;
+}
+
+/**
+ * build_image_window()
+ *
+ * Destroy the existing image window if there is one. Work out how big the
+ * new window needs to be. Create it and re-display.
+ */
+
+static void build_image_window(int width, int height) {
+ delete image_win;
+ image_win = new ScrollView(editor_image_win_name.c_str(),
+ editor_image_xpos, editor_image_ypos,
+ width + 1,
+ height + editor_image_menuheight + 1,
+ width,
+ height,
+ true);
+}
+
+/**
+ * display_bln_lines()
+ *
+ * Display normalized baseline, x-height, ascender limit and descender limit
+ */
+
+static void display_bln_lines(ScrollView* window, ScrollView::Color colour,
+ float scale_factor, float y_offset,
+ float minx, float maxx) {
+ window->Pen(colour);
+ window->Line(minx, y_offset + scale_factor * DESC_HEIGHT,
+ maxx, y_offset + scale_factor * DESC_HEIGHT);
+ window->Line(minx, y_offset + scale_factor * BL_HEIGHT,
+ maxx, y_offset + scale_factor * BL_HEIGHT);
+ window->Line(minx, y_offset + scale_factor * X_HEIGHT,
+ maxx, y_offset + scale_factor * X_HEIGHT);
+ window->Line(minx, y_offset + scale_factor * ASC_HEIGHT,
+ maxx, y_offset + scale_factor * ASC_HEIGHT);
+}
+
+/**
+ * notify()
+ *
+ * Event handler that processes incoming events, either forwarding
+ * them to process_cmd_win_event or process_image_event.
+ *
+ */
+
+void PGEventHandler::Notify(const SVEvent* event) {
+ char myval = '0';
+ if (event->type == SVET_POPUP) {
+ pe->Notify(event);
+ } // These are handled by ParamsEditor
+ else if (event->type == SVET_EXIT) { stillRunning = false; }
+ else if (event->type == SVET_MENU) {
+ if (strcmp(event->parameter, "true") == 0) { myval = 'T'; }
+ else if (strcmp(event->parameter, "false") == 0) { myval = 'F'; }
+ tess_->process_cmd_win_event(event->command_id, &myval);
+ }
+ else {
+ tess_->process_image_event(*event);
+ }
+}
+
+/**
+ * build_menu()
+ *
+ * Construct the menu tree used by the command window
+ */
+SVMenuNode *Tesseract::build_menu_new() {
+ SVMenuNode* parent_menu;
+ auto* root_menu_item = new SVMenuNode();
+
+ SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
+
+ modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
+ modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
+ modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
+ modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
+ modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
+ modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
+ modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
+ modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
+
+ parent_menu = root_menu_item->AddChild("DISPLAY");
+
+ parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
+ parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
+ parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
+ parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
+ parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
+ parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
+ parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
+ parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
+ parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
+ parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
+ parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
+ parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
+ parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
+ parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
+ parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
+
+
+ parent_menu = root_menu_item->AddChild("OTHER");
+
+ parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
+ parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
+ parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
+ parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
+ parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
+ parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
+
+ return root_menu_item;
+}
+
+/**
+ * do_re_display()
+ *
+ * Redisplay page
+ */
+void Tesseract::do_re_display(
+ bool (tesseract::Tesseract::* word_painter)(PAGE_RES_IT* pr_it)) {
+ int block_count = 1;
+
+ image_win->Clear();
+ if (display_image) {
+ image_win->Image(pix_binary_, 0, 0);
+ }
+
+ image_win->Brush(ScrollView::NONE);
+ PAGE_RES_IT pr_it(current_page_res);
+ for (WERD_RES* word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
+ (this->*word_painter)(&pr_it);
+ if (display_baselines && pr_it.row() != pr_it.prev_row())
+ pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
+ if (display_blocks && pr_it.block() != pr_it.prev_block())
+ pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
+ }
+ image_win->Update();
+}
+
+/**
+ * pgeditor_main()
+ *
+ * Top level editor operation:
+ * Setup a new window and an according event handler
+ *
+ */
+
+void Tesseract::pgeditor_main(int width, int height, PAGE_RES *page_res) {
+ current_page_res = page_res;
+ if (current_page_res->block_res_list.empty())
+ return;
+
+ recog_done = false;
+ stillRunning = true;
+
+ build_image_window(width, height);
+ word_display_mode.set(DF_EDGE_STEP);
+ do_re_display(&tesseract::Tesseract::word_set_display);
+#ifndef GRAPHICS_DISABLED
+ pe = new ParamsEditor(this, image_win);
+#endif
+ PGEventHandler pgEventHandler(this);
+
+ image_win->AddEventHandler(&pgEventHandler);
+ image_win->AddMessageBox();
+
+ SVMenuNode* svMenuRoot = build_menu_new();
+
+ svMenuRoot->BuildMenu(image_win);
+ image_win->SetVisible(true);
+
+ image_win->AwaitEvent(SVET_DESTROY);
+ image_win->AddEventHandler(nullptr);
+}
+
+/**
+ * process_cmd_win_event()
+ *
+ * Process a command returned from the command window
+ * (Just call the appropriate command handler)
+ */
+
+bool Tesseract::process_cmd_win_event( // UI command semantics
+ int32_t cmd_event, // which menu item?
+ char* new_value // any prompt data
+) {
+ char msg[160];
+ bool exit = false;
+
+ color_mode = CM_RAINBOW;
+
+ // Run recognition on the full page if needed.
+ switch (cmd_event) {
+ case BLAMER_CMD_EVENT:
+ case SHOW_SUBSCRIPT_CMD_EVENT:
+ case SHOW_SUPERSCRIPT_CMD_EVENT:
+ case SHOW_ITALIC_CMD_EVENT:
+ case SHOW_BOLD_CMD_EVENT:
+ case SHOW_UNDERLINE_CMD_EVENT:
+ case SHOW_FIXEDPITCH_CMD_EVENT:
+ case SHOW_SERIF_CMD_EVENT:
+ case SHOW_SMALLCAPS_CMD_EVENT:
+ case SHOW_DROPCAPS_CMD_EVENT:
+ if (!recog_done) {
+ recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
+ recog_done = true;
+ }
+ break;
+ default:
+ break;
+ }
+
+ char* parameter;
+
+ switch (cmd_event) {
+ case NULL_CMD_EVENT:
+ break;
+
+ case CHANGE_DISP_CMD_EVENT:
+ case DUMP_WERD_CMD_EVENT:
+ case SHOW_POINT_CMD_EVENT:
+ case SHOW_BLN_WERD_CMD_EVENT:
+ case RECOG_WERDS:
+ case RECOG_PSEUDO:
+ case SHOW_BLOB_FEATURES:
+ mode =static_cast<CMD_EVENTS>(cmd_event);
+ break;
+ case DEBUG_WERD_CMD_EVENT:
+ mode = DEBUG_WERD_CMD_EVENT;
+ parameter = image_win->ShowInputDialog("Config File Name");
+ word_config_ = parameter;
+ delete[] parameter;
+ break;
+ case BOUNDING_BOX_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_BOX);
+ else
+ word_display_mode.reset(DF_BOX);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case BLAMER_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_BLAMER);
+ else
+ word_display_mode.reset(DF_BLAMER);
+ do_re_display(&tesseract::Tesseract::word_display);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case CORRECT_TEXT_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_TEXT);
+ else
+ word_display_mode.reset(DF_TEXT);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case POLYGONAL_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_POLYGONAL);
+ else
+ word_display_mode.reset(DF_POLYGONAL);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case BL_NORM_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_BN_POLYGONAL);
+ else
+ word_display_mode.reset(DF_BN_POLYGONAL);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case BITMAP_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_EDGE_STEP);
+ else
+ word_display_mode.reset(DF_EDGE_STEP);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case UNIFORM_DISP_CMD_EVENT:
+ do_re_display(&tesseract::Tesseract::word_set_display);
+ break;
+ case IMAGE_CMD_EVENT:
+ display_image =(new_value[0] == 'T');
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case BLOCKS_CMD_EVENT:
+ display_blocks =(new_value[0] == 'T');
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case BASELINES_CMD_EVENT:
+ display_baselines =(new_value[0] == 'T');
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_SUBSCRIPT_CMD_EVENT:
+ color_mode = CM_SUBSCRIPT;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_SUPERSCRIPT_CMD_EVENT:
+ color_mode = CM_SUPERSCRIPT;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_ITALIC_CMD_EVENT:
+ color_mode = CM_ITALIC;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_BOLD_CMD_EVENT:
+ color_mode = CM_BOLD;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_UNDERLINE_CMD_EVENT:
+ color_mode = CM_UNDERLINE;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_FIXEDPITCH_CMD_EVENT:
+ color_mode = CM_FIXEDPITCH;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_SERIF_CMD_EVENT:
+ color_mode = CM_SERIF;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_SMALLCAPS_CMD_EVENT:
+ color_mode = CM_SMALLCAPS;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_DROPCAPS_CMD_EVENT:
+ color_mode = CM_DROPCAPS;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case REFRESH_CMD_EVENT:
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case QUIT_CMD_EVENT:
+ exit = true;
+ ScrollView::Exit();
+ break;
+
+ default:
+ snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
+ cmd_event, new_value);
+ image_win->AddMessage(msg);
+ break;
+ }
+ return exit;
+}
+
+
+/**
+ * process_image_event()
+ *
+ * User has done something in the image window - mouse down or up. Work out
+ * what it is and do something with it.
+ * If DOWN - just remember where it was.
+ * If UP - for each word in the selected area do the operation defined by
+ * the current mode.
+ */
+void Tesseract::process_image_event( // action in image win
+ const SVEvent &event) {
+ // The following variable should remain static, since it is used by
+ // debug editor, which uses a single Tesseract instance.
+ static ICOORD down;
+ ICOORD up;
+ TBOX selection_box;
+ char msg[80];
+
+ switch(event.type) {
+
+ case SVET_SELECTION:
+ if (event.type == SVET_SELECTION) {
+ down.set_x(event.x + event.x_size);
+ down.set_y(event.y + event.y_size);
+ if (mode == SHOW_POINT_CMD_EVENT)
+ show_point(current_page_res, event.x, event.y);
+ }
+
+ up.set_x(event.x);
+ up.set_y(event.y);
+
+ selection_box = TBOX(down, up);
+
+ switch(mode) {
+ case CHANGE_DISP_CMD_EVENT:
+ process_selected_words(
+ current_page_res,
+ selection_box,
+ &tesseract::Tesseract::word_blank_and_set_display);
+ break;
+ case DUMP_WERD_CMD_EVENT:
+ process_selected_words(current_page_res,
+ selection_box,
+ &tesseract::Tesseract::word_dumper);
+ break;
+ case SHOW_BLN_WERD_CMD_EVENT:
+ process_selected_words(current_page_res,
+ selection_box,
+ &tesseract::Tesseract::word_bln_display);
+ break;
+ case DEBUG_WERD_CMD_EVENT:
+ debug_word(current_page_res, selection_box);
+ break;
+ case SHOW_POINT_CMD_EVENT:
+ break; // ignore up event
+
+ case RECOG_WERDS:
+ #ifndef DISABLED_LEGACY_ENGINE
+ image_win->AddMessage("Recogging selected words");
+ this->process_selected_words(current_page_res,
+ selection_box,
+ &Tesseract::recog_interactive);
+ #endif // ndef DISABLED_LEGACY_ENGINE
+ break;
+ case RECOG_PSEUDO:
+ image_win->AddMessage("Recogging selected blobs");
+ recog_pseudo_word(current_page_res, selection_box);
+ break;
+ case SHOW_BLOB_FEATURES:
+ blob_feature_display(current_page_res, selection_box);
+ break;
+
+ default:
+ sprintf(msg, "Mode %d not yet implemented", mode);
+ image_win->AddMessage(msg);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+/**
+ * debug_word
+ *
+ * Process the whole image, but load word_config_ for the selected word(s).
+ */
void Tesseract::debug_word(PAGE_RES* page_res, const TBOX &selection_box) {
#ifndef DISABLED_LEGACY_ENGINE
  // Drop adapted templates so the debug run starts from a clean,
  // reproducible classifier state.
  ResetAdaptiveClassifier();
#endif
  // Re-recognize the whole page; words inside selection_box presumably get
  // the word_config_ config applied (see recog_all_words) -- TODO confirm.
  recog_all_words(page_res, nullptr, &selection_box, word_config_.c_str(), 0);
}
+
+
+/**********************************************************************
+ * WERD PROCESSOR FUNCTIONS
+ * ========================
+ *
+ * These routines are invoked by one or more of:
+ * process_all_words()
+ * process_selected_words()
+ * or
+ * process_all_words_it()
+ * process_selected_words_it()
+ * for each word to be processed
+ **********************************************************************/
+
+/**
+ * word_blank_and_set_display() Word processor
+ *
+ * Blank display of word then redisplay word according to current display mode
+ * settings
+ */
+
+bool Tesseract::word_blank_and_set_display(PAGE_RES_IT* pr_it) {
+ pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
+ ScrollView::BLACK);
+ return word_set_display(pr_it);
+}
+
+
+/**
+ * word_bln_display()
+ *
+ * Normalize word and display in word window
+ */
+bool Tesseract::word_bln_display(PAGE_RES_IT* pr_it) {
+ WERD_RES* word_res = pr_it->word();
+ if (word_res->chopped_word == nullptr) {
+ // Setup word normalization parameters.
+ word_res->SetupForRecognition(unicharset, this, BestPix(),
+ tessedit_ocr_engine_mode, nullptr,
+ classify_bln_numeric_mode,
+ textord_use_cjk_fp_model,
+ poly_allow_detailed_fx,
+ pr_it->row()->row, pr_it->block()->block);
+ }
+ bln_word_window_handle()->Clear();
+ display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
+ 1.0, 0.0f, -1000.0f, 1000.0f);
+ C_BLOB_IT it(word_res->word->cblob_list());
+ ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);
+ for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+ it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
+ bln_word_window_handle());
+ color = WERD::NextColor(color);
+ }
+ bln_word_window_handle()->Update();
+ return true;
+}
+
+
+
+/**
+ * word_display() Word Processor
+ *
+ * Display a word according to its display modes
+ */
bool Tesseract::word_display(PAGE_RES_IT* pr_it) {
  WERD_RES* word_res = pr_it->word();
  WERD* word = word_res->word;
  TBOX word_bb; // word bounding box
  int word_height; // ht of word BB
  bool displayed_something = false;
  float shift; // from bot left

  // Attribute-coloration mode: outline each recognized blob RED when it
  // carries the tested attribute, GREEN otherwise.  Requires box_word
  // (i.e. a prior recog_all_words pass).
  if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
  #ifndef DISABLED_LEGACY_ENGINE
    BoxWord* box_word = word_res->box_word;
    WERD_CHOICE* best_choice = word_res->best_choice;
    int length = box_word->length();
    // Without font info none of the font-based modes can be evaluated.
    if (word_res->fontinfo == nullptr) return false;
    const FontInfo& font_info = *word_res->fontinfo;
    for (int i = 0; i < length; ++i) {
      ScrollView::Color color = ScrollView::GREEN;
      switch (color_mode) {
        case CM_SUBSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_SUPERSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_ITALIC:
          if (font_info.is_italic())
            color = ScrollView::RED;
          break;
        case CM_BOLD:
          if (font_info.is_bold())
            color = ScrollView::RED;
          break;
        case CM_FIXEDPITCH:
          if (font_info.is_fixed_pitch())
            color = ScrollView::RED;
          break;
        case CM_SERIF:
          if (font_info.is_serif())
            color = ScrollView::RED;
          break;
        case CM_SMALLCAPS:
          if (word_res->small_caps)
            color = ScrollView::RED;
          break;
        case CM_DROPCAPS:
          if (best_choice->BlobPosition(i) == SP_DROPCAP)
            color = ScrollView::RED;
          break;
        // TODO(rays) underline is currently completely unsupported.
        case CM_UNDERLINE:
        default:
          break;
      }
      image_win->Pen(color);
      TBOX box = box_word->BlobBox(i);
      image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
    }
    return true;
  #else
    return false;
  #endif // ndef DISABLED_LEGACY_ENGINE
  }
  /*
  Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
  etc. are to keep the compiler happy.
  */
  // display bounding box
  if (word->display_flag(DF_BOX)) {
    word->bounding_box().plot(image_win,
                              static_cast<ScrollView::Color>((int32_t)
                              editor_image_word_bb_color),
                              static_cast<ScrollView::Color>((int32_t)
                              editor_image_word_bb_color));

    auto c = static_cast<ScrollView::Color>((int32_t) editor_image_blob_bb_color);
    image_win->Pen(c);
    // cblob iterator
    C_BLOB_IT c_it(word->cblob_list());
    for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
      c_it.data()->bounding_box().plot(image_win);
    displayed_something = true;
  }

  // display edge steps
  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
    word->plot(image_win); // rainbow colors
    displayed_something = true;
  }

  // display poly approx
  if (word->display_flag(DF_POLYGONAL)) {
    // need to convert
    TWERD* tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
    tword->plot(image_win);
    delete tword;
    displayed_something = true;
  }

  // Display correct text and blamer information.
  STRING text;
  STRING blame;
  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
    text = word->text();
  }
  // Blamer display shows "truth -> best_choice [reason]" unless the
  // result is known to be correct.
  if (word->display_flag(DF_BLAMER) &&
      !(word_res->blamer_bundle != nullptr &&
        word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
    text = "";
    const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
    if (blamer_bundle == nullptr) {
      text += "NULL";
    } else {
      text = blamer_bundle->TruthString();
    }
    text += " -> ";
    STRING best_choice_str;
    if (word_res->best_choice == nullptr) {
      best_choice_str = "NULL";
    } else {
      word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
    }
    text += best_choice_str;
    IncorrectResultReason reason = (blamer_bundle == nullptr) ?
        IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
    ASSERT_HOST(reason < IRR_NUM_REASONS);
    blame += " [";
    blame += BlamerBundle::IncorrectReasonName(reason);
    blame += "]";
  }
  // Render the text (and blame line beneath it) in red, scaled to the
  // word height but capped at 20pt.
  if (text.length() > 0) {
    word_bb = word->bounding_box();
    image_win->Pen(ScrollView::RED);
    word_height = word_bb.height();
    int text_height = 0.50 * word_height;
    if (text_height > 20) text_height = 20;
    image_win->TextAttributes("Arial", text_height, false, false, false);
    shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
    image_win->Text(word_bb.left() + shift,
                    word_bb.bottom() + 0.25 * word_height, text.c_str());
    if (blame.length() > 0) {
      image_win->Text(word_bb.left() + shift,
                      word_bb.bottom() + 0.25 * word_height - text_height,
                      blame.c_str());
    }

    displayed_something = true;
  }

  if (!displayed_something) // display BBox anyway
    word->bounding_box().plot(image_win,
        static_cast<ScrollView::Color>((int32_t) editor_image_word_bb_color),
        static_cast<ScrollView::Color>((int32_t)
            editor_image_word_bb_color));
  return true;
}
+} // namespace tesseract
+#endif // !GRAPHICS_DISABLED
+
+namespace tesseract {
+/**
+ * word_dumper()
+ *
+ * Dump members to the debug window
+ */
+bool Tesseract::word_dumper(PAGE_RES_IT* pr_it) {
+ if (pr_it->block()->block != nullptr) {
+ tprintf("\nBlock data...\n");
+ pr_it->block()->block->print(nullptr, false);
+ }
+ tprintf("\nRow data...\n");
+ pr_it->row()->row->print(nullptr);
+ tprintf("\nWord data...\n");
+ WERD_RES* word_res = pr_it->word();
+ word_res->word->print();
+ if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&
+ word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
+ tprintf("Current blamer debug: %s\n",
+ word_res->blamer_bundle->debug().c_str());
+ }
+ return true;
+}
+
+#ifndef GRAPHICS_DISABLED
+/**
+ * word_set_display() Word processor
+ *
+ * Display word according to current display mode settings
+ */
+bool Tesseract::word_set_display(PAGE_RES_IT* pr_it) {
+ WERD* word = pr_it->word()->word;
+ word->set_display_flag(DF_BOX, word_display_mode[DF_BOX]);
+ word->set_display_flag(DF_TEXT, word_display_mode[DF_TEXT]);
+ word->set_display_flag(DF_POLYGONAL, word_display_mode[DF_POLYGONAL]);
+ word->set_display_flag(DF_EDGE_STEP, word_display_mode[DF_EDGE_STEP]);
+ word->set_display_flag(DF_BN_POLYGONAL,
+ word_display_mode[DF_BN_POLYGONAL]);
+ word->set_display_flag(DF_BLAMER, word_display_mode[DF_BLAMER]);
+ return word_display(pr_it);
+}
+
+
+// page_res is non-const because the iterator doesn't know if you are going
+// to change the items it points to! Really a const here though.
+void Tesseract::blob_feature_display(PAGE_RES* page_res,
+ const TBOX& selection_box) {
+#ifndef DISABLED_LEGACY_ENGINE
+ PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
+ if (it != nullptr) {
+ WERD_RES* word_res = it->word();
+ word_res->x_height = it->row()->row->x_height();
+ word_res->SetupForRecognition(unicharset, this, BestPix(),
+ tessedit_ocr_engine_mode, nullptr,
+ classify_bln_numeric_mode,
+ textord_use_cjk_fp_model,
+ poly_allow_detailed_fx,
+ it->row()->row, it->block()->block);
+ TWERD* bln_word = word_res->chopped_word;
+ TBLOB* bln_blob = bln_word->blobs[0];
+ INT_FX_RESULT_STRUCT fx_info;
+ std::vector<INT_FEATURE_STRUCT> bl_features;
+ std::vector<INT_FEATURE_STRUCT> cn_features;
+ Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
+ &cn_features, &fx_info, nullptr);
+ // Display baseline features.
+ ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
+ ClearFeatureSpaceWindow(baseline, bl_win);
+ for (int f = 0; f < bl_features.size(); ++f)
+ RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
+ bl_win->Update();
+ // Display cn features.
+ ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
+ ClearFeatureSpaceWindow(character, cn_win);
+ for (int f = 0; f < cn_features.size(); ++f)
+ RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
+ cn_win->Update();
+
+ it->DeleteCurrentWord();
+ delete it;
+ }
+#endif // ndef DISABLED_LEGACY_ENGINE
+}
+
+#endif // !GRAPHICS_DISABLED
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/pgedit.h b/tesseract/src/ccmain/pgedit.h
new file mode 100644
index 00000000..55467f67
--- /dev/null
+++ b/tesseract/src/ccmain/pgedit.h
@@ -0,0 +1,71 @@
+///////////////////////////////////////////////////////////////////////
+// File: pgedit.h
+// Description: Page structure file editor
+// Author: Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef PGEDIT_H
+#define PGEDIT_H
+
+#include "params.h" // for INT_VAR_H, IntParam, STRING_VAR_H, StringParam
+#include "scrollview.h" // for SVEvent (ptr only), SVEventHandler, ScrollView
+
namespace tesseract {

class BLOCK_LIST;
class PAGE_RES;

class Tesseract;

// A small event handler class to process incoming events to
// this window.  Forwards popup/menu/image events to the owning
// Tesseract instance (implementation in pgedit.cpp).
class PGEventHandler : public SVEventHandler {
 public:
  // NOTE(review): single-argument constructor -- consider marking it
  // explicit to prevent implicit Tesseract* -> PGEventHandler conversion.
  PGEventHandler(tesseract::Tesseract* tess) : tess_(tess) {
  }
  void Notify(const SVEvent* sve) override;
 private:
  tesseract::Tesseract* tess_; // not owned
};

// NOTE(review): several declarations below (current_block_list,
// editor_image_height, editor_image_width, editor_smd_scale_factor) have
// no matching definition in pgedit.cpp as shown here -- confirm they are
// defined elsewhere before relying on them, or they will fail at link time.
extern BLOCK_LIST *current_block_list;
extern STRING_VAR_H (editor_image_win_name, "EditorImage",
"Editor image window name");
extern INT_VAR_H (editor_image_xpos, 590, "Editor image X Pos");
extern INT_VAR_H (editor_image_ypos, 10, "Editor image Y Pos");
extern INT_VAR_H (editor_image_height, 680, "Editor image height");
extern INT_VAR_H (editor_image_width, 655, "Editor image width");
extern INT_VAR_H (editor_image_word_bb_color, BLUE,
"Word bounding box colour");
extern INT_VAR_H (editor_image_blob_bb_color, YELLOW,
"Blob bounding box colour");
extern INT_VAR_H (editor_image_text_color, WHITE, "Correct text colour");
extern STRING_VAR_H (editor_dbwin_name, "EditorDBWin",
"Editor debug window name");
extern INT_VAR_H (editor_dbwin_xpos, 50, "Editor debug window X Pos");
extern INT_VAR_H (editor_dbwin_ypos, 500, "Editor debug window Y Pos");
extern INT_VAR_H (editor_dbwin_height, 24, "Editor debug window height");
extern INT_VAR_H (editor_dbwin_width, 80, "Editor debug window width");
extern STRING_VAR_H (editor_word_name, "BlnWords",
"BL normalised word window");
extern INT_VAR_H (editor_word_xpos, 60, "Word window X Pos");
extern INT_VAR_H (editor_word_ypos, 510, "Word window Y Pos");
extern INT_VAR_H (editor_word_height, 240, "Word window height");
extern INT_VAR_H (editor_word_width, 655, "Word window width");
extern double_VAR_H (editor_smd_scale_factor, 1.0, "Scaling for smd image");

} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/ccmain/recogtraining.cpp b/tesseract/src/ccmain/recogtraining.cpp
new file mode 100644
index 00000000..9368f32a
--- /dev/null
+++ b/tesseract/src/ccmain/recogtraining.cpp
@@ -0,0 +1,238 @@
+///////////////////////////////////////////////////////////////////////
+// File: recogtraining.cpp
+// Description: Functions for ambiguity and parameter training.
+// Author: Daria Antonova
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include "boxread.h"
+#include "control.h"
+#include "host.h" // for NearlyEqual
+#include "ratngs.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "reject.h"
+#endif
+#include "stopper.h"
+
+namespace tesseract {
+
+const int16_t kMaxBoxEdgeDiff = 2;
+
+// Sets flags necessary for recognition in the training mode.
+// Opens and returns the pointer to the output file ("<filename>.txt",
+// opened for append).  Aborts via ASSERT_HOST if the file cannot be opened.
+FILE* Tesseract::init_recog_training(const char* filename) {
+  if (tessedit_ambigs_training) {
+    tessedit_tess_adaption_mode.set_value(0);  // turn off adaption
+    tessedit_enable_doc_dict.set_value(0);     // turn off document dictionary
+    // Explore all segmentations.
+    getDict().stopper_no_acceptable_choices.set_value(1);
+  }
+
+  // Strip the extension and append ".txt".  Note: assigning '\0' into the
+  // middle of a string object does not shorten it - the NUL becomes an
+  // embedded character, a subsequent += appends after the old extension,
+  // and c_str() stops at the NUL, yielding the wrong filename.  Truncate
+  // explicitly instead.
+  std::string output_fname = filename;
+  const size_t lastdot = output_fname.rfind('.');
+  if (lastdot != std::string::npos)
+    output_fname.resize(lastdot);
+  output_fname += ".txt";
+  FILE* output_file = fopen(output_fname.c_str(), "a+");
+  if (output_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", output_fname.c_str());
+    ASSERT_HOST(output_file);
+  }
+  return output_file;
+}
+
+// Copies the bounding box from page_res_it->word() to the given TBOX.
+// Skips forward past entries with no word; returns false when the page
+// is exhausted.
+static bool read_t(PAGE_RES_IT* page_res_it, TBOX* tbox) {
+  // Advance until a word is available or we run out of blocks.
+  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr)
+    page_res_it->forward();
+
+  if (page_res_it->word() == nullptr)
+    return false;
+
+  *tbox = page_res_it->word()->word->bounding_box();
+
+  // If tbox->left() is negative, the training image has vertical text and
+  // all the coordinates of bounding boxes of page_res are rotated by 90
+  // degrees in a counterclockwise direction. We need to rotate the TBOX back
+  // in order to compare with the TBOXes of box files.
+  if (tbox->left() < 0)
+    tbox->rotate(FCOORD(0.0, -1.0));
+
+  return true;
+}
+
+// This function takes tif/box pair of files and runs recognition on the image,
+// while making sure that the word bounds that tesseract identified roughly
+// match to those specified by the input box file. For each word (ngram in a
+// single bounding box from the input box file) it outputs the OCRed result,
+// the correct label, rating and certainty.
+void Tesseract::recog_training_segmented(const char* filename,
+                                         PAGE_RES* page_res,
+                                         volatile ETEXT_DESC* monitor,
+                                         FILE* output_file) {
+  // Strip the extension and append ".box".  Writing '\0' into the middle of
+  // a std::string does not shorten it (the NUL is just an embedded char), so
+  // the old `box_fname[pos] = '\0'` trick made fopen() see only the basename.
+  // Truncate explicitly with resize() instead.
+  std::string box_fname = filename;
+  const size_t lastdot = box_fname.rfind('.');
+  if (lastdot != std::string::npos)
+    box_fname.resize(lastdot);
+  box_fname += ".box";
+  // ReadNextBox() will close box_file
+  FILE* box_file = fopen(box_fname.c_str(), "r");
+  if (box_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", box_fname.c_str());
+    ASSERT_HOST(box_file);
+  }
+
+  PAGE_RES_IT page_res_it;
+  page_res_it.page_res = page_res;
+  page_res_it.restart_page();
+  STRING label;
+
+  // Process all the words on this page.
+  TBOX tbox;  // tesseract-identified box
+  TBOX bbox;  // box from the box file
+  bool keep_going;
+  int line_number = 0;
+  int examined_words = 0;
+  do {
+    keep_going = read_t(&page_res_it, &tbox);
+    keep_going &=
+        ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
+    // Align bottom left points of the TBOXes.
+    while (keep_going &&
+           !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
+      if (bbox.bottom() < tbox.bottom()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going =
+            ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
+      }
+    }
+    while (keep_going &&
+           !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
+      if (bbox.left() > tbox.left()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going =
+            ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
+      }
+    }
+    // OCR the word if top right points of the TBOXes are similar.
+    if (keep_going &&
+        NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
+        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
+      ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
+      examined_words++;
+    }
+    page_res_it.forward();
+  } while (keep_going);
+
+  // Set up scripts on all of the words that did not get sent to
+  // ambigs_classify_and_output. They all should have, but if all the
+  // werd_res's don't get uch_sets, tesseract will crash when you try
+  // to iterate over them. :-(
+  int total_words = 0;
+  for (page_res_it.restart_page(); page_res_it.block() != nullptr;
+       page_res_it.forward()) {
+    if (page_res_it.word()) {
+      if (page_res_it.word()->uch_set == nullptr)
+        page_res_it.word()->SetupFake(unicharset);
+      total_words++;
+    }
+  }
+  if (examined_words < 0.85 * total_words) {
+    tprintf(
+        "TODO(antonova): clean up recog_training_segmented; "
+        " It examined only a small fraction of the ambigs image.\n");
+  }
+  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words,
+          total_words);
+}
+
+// Helper prints the given set of blob choices: the concatenated unichars,
+// then the label, the summed rating, and the worst (minimum) certainty.
+static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
+                      const UNICHARSET& unicharset, const char* label,
+                      FILE* output_file) {
+  float total_rating = 0.0f;
+  float worst_certainty = 0.0f;
+  for (int i = 0; i < length; ++i) {
+    const BLOB_CHOICE* choice = blob_choices[i];
+    fprintf(output_file, "%s", unicharset.id_to_unichar(choice->unichar_id()));
+    total_rating += choice->rating();
+    if (choice->certainty() < worst_certainty)
+      worst_certainty = choice->certainty();
+  }
+  fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, total_rating,
+          worst_certainty);
+}
+
+// Helper recursively prints all paths through the ratings matrix, starting
+// at column col.
+// blob_choices accumulates the choices picked so far along the current path
+// (one entry per matrix cell); the caller must size it to the matrix
+// dimension so that index `length` is always valid.
+static void PrintMatrixPaths(int col, int dim, const MATRIX& ratings,
+                             int length, const BLOB_CHOICE** blob_choices,
+                             const UNICHARSET& unicharset, const char* label,
+                             FILE* output_file) {
+  // Only cells within the matrix band are classified, hence the
+  // row - col < bandwidth bound.
+  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
+    if (ratings.get(col, row) != NOT_CLASSIFIED) {
+      BLOB_CHOICE_IT bc_it(ratings.get(col, row));
+      for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
+        blob_choices[length] = bc_it.data();
+        if (row + 1 < dim) {
+          // Columns remain: extend the path recursively.
+          PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
+                           unicharset, label, output_file);
+        } else {
+          // The path spans the whole word: emit it.
+          PrintPath(length + 1, blob_choices, unicharset, label, output_file);
+        }
+      }
+    }
+  }
+}
+
+// Runs classify_word_pass1() on the current word. Outputs Tesseract's
+// raw choice as a result of the classification. For words labeled with a
+// single unichar also outputs all alternatives from blob_choices of the
+// best choice.
+void Tesseract::ambigs_classify_and_output(const char* label,
+                                           PAGE_RES_IT* pr_it,
+                                           FILE* output_file) {
+  // Classify word.
+  fflush(stdout);
+  WordData word_data(*pr_it);
+  SetupWordPassN(1, &word_data);
+  classify_word_and_language(1, pr_it, &word_data);
+  WERD_RES* werd_res = word_data.word;
+  WERD_CHOICE* best_choice = werd_res->best_choice;
+  ASSERT_HOST(best_choice != nullptr);
+
+  // Validate the label: skip words whose ground truth cannot be encoded
+  // with the current unicharset.
+  std::vector<UNICHAR_ID> encoding;
+  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
+    tprintf("Not outputting illegal unichar %s\n", label);
+    return;
+  }
+
+  // Dump all paths through the ratings matrix (which is normally small).
+  // Use a vector as the scratch array so it is released automatically on
+  // every exit path (the raw new[]/delete[] pair leaked on exceptions).
+  int dim = werd_res->ratings->dimension();
+  std::vector<const BLOB_CHOICE*> blob_choices(dim);
+  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices.data(),
+                   unicharset, label, output_file);
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/reject.cpp b/tesseract/src/ccmain/reject.cpp
new file mode 100644
index 00000000..e2df9f40
--- /dev/null
+++ b/tesseract/src/ccmain/reject.cpp
@@ -0,0 +1,792 @@
+/**********************************************************************
+ * File: reject.cpp (Formerly reject.c)
+ * Description: Rejection functions used in tessedit
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "reject.h"
+
+#ifdef DISABLED_LEGACY_ENGINE
+
+#include "tesseractclass.h"
+
+namespace tesseract {
+
+// Returns the dictionary permuter type of the best choice, except that
+// document-dawg-only matches are treated as unsafe and reported as 0.
+int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
+  const int perm = werd_res->tesseract->dict_word(*werd_res->best_choice);
+  if (perm == DOC_DAWG_PERM)
+    return 0;
+  return perm;
+}
+} // namespace tesseract
+
+#else
+
+#include "tessvars.h"
+#include "control.h"
+#include "docqual.h"
+#include "tesseractclass.h"
+
+#include "genericvector.h"
+#include "helpers.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+
+namespace tesseract {
+
+CLISTIZEH (STRING) CLISTIZE (STRING)
+
+/*************************************************************************
+ * set_done()
+ *
+ * Set the done flag based on the word acceptability criteria.
+ * A word is done when Tesseract accepted it, it contains no unrecognised
+ * (blank) characters, it is unambiguous, and it came from a dictionary or
+ * number permuter (with an extra pass-1 I/l/1 sanity check).
+ *************************************************************************/
+
+void Tesseract::set_done(WERD_RES *word, int16_t pass) {
+  const WERD_CHOICE* choice = word->best_choice;
+  // Accepted by Tesseract and free of unrecognised (space) blobs.
+  word->done = word->tess_accepted &&
+      strchr(choice->unichar_string().c_str(), ' ') == nullptr;
+  const bool is_ambig = choice->dangerous_ambig_found();
+  const bool from_dict = choice->permuter() == SYSTEM_DAWG_PERM ||
+                         choice->permuter() == FREQ_DAWG_PERM ||
+                         choice->permuter() == USER_DAWG_PERM;
+  // On pass 1, a non-dictionary or ambiguous word with a potential I/l/1
+  // confusion is not done yet.
+  if (word->done && pass == 1 && (!from_dict || is_ambig) &&
+      one_ell_conflict(word, false)) {
+    if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
+    word->done = false;
+  }
+  // Words that are neither dictionary nor number words, or are ambiguous,
+  // are never considered done.
+  if (word->done &&
+      ((!from_dict && choice->permuter() != NUMBER_PERM) || is_ambig)) {
+    if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
+    word->done = false;
+  }
+  if (tessedit_rejection_debug) {
+    tprintf("set_done(): done=%d\n", word->done);
+    word->best_choice->print("");
+  }
+}
+
+
+/*************************************************************************
+ * make_reject_map()
+ *
+ * Sets the done flag to indicate whether the result is acceptable.
+ *
+ * Sets a reject map for the word.
+ *************************************************************************/
+void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
+  int i;
+  int offset;
+
+  flip_0O(word);
+  check_debug_pt(word, -1); // For trap only
+  set_done(word, pass); // Set acceptance
+  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
+  reject_blanks(word);
+  /*
+  0: Rays original heuristic - the baseline
+  */
+  if (tessedit_reject_mode == 0) {
+    if (!word->done)
+      reject_poor_matches(word);
+  } else if (tessedit_reject_mode == 5) {
+    /*
+    5: Reject I/1/l from words where there is no strong contextual confirmation;
+    the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
+    and the whole of any words which are very small
+    */
+    // Words whose x-height maps to fewer than min_sane_x_ht_pixels image
+    // pixels are rejected outright as too small to trust.
+    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
+      word->reject_map.rej_word_small_xht();
+    } else {
+      one_ell_conflict(word, true);
+      /*
+      Originally the code here just used the done flag. Now I have duplicated
+      and unpacked the conditions for setting the done flag so that each
+      mechanism can be turned on or off independently. This works WITHOUT
+      affecting the done flag setting.
+      */
+      if (rej_use_tess_accepted && !word->tess_accepted)
+        word->reject_map.rej_word_not_tess_accepted ();
+
+      if (rej_use_tess_blanks &&
+        (strchr (word->best_choice->unichar_string().c_str(), ' ') != nullptr))
+        word->reject_map.rej_word_contains_blanks ();
+
+      WERD_CHOICE* best_choice = word->best_choice;
+      if (rej_use_good_perm) {
+        // Dictionary words pass if their character pattern is sensible
+        // (or that check is disabled); number-permuter words only get
+        // their stray alpha characters rejected; anything else is
+        // rejected wholesale as coming from a bad permuter.
+        if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
+             best_choice->permuter() == FREQ_DAWG_PERM ||
+             best_choice->permuter() == USER_DAWG_PERM) &&
+            (!rej_use_sensible_wd ||
+             acceptable_word_string(*word->uch_set,
+                                    best_choice->unichar_string().c_str(),
+                                    best_choice->unichar_lengths().c_str()) !=
+                 AC_UNACCEPTABLE)) {
+          // PASSED TEST
+        } else if (best_choice->permuter() == NUMBER_PERM) {
+          if (rej_alphas_in_number_perm) {
+            for (i = 0, offset = 0;
+                 best_choice->unichar_string()[offset] != '\0';
+                 offset += best_choice->unichar_lengths()[i++]) {
+              if (word->reject_map[i].accepted() &&
+                  word->uch_set->get_isalpha(
+                      best_choice->unichar_string().c_str() + offset,
+                      best_choice->unichar_lengths()[i]))
+                word->reject_map[i].setrej_bad_permuter();
+              // rej alpha
+            }
+          }
+        } else {
+          word->reject_map.rej_word_bad_permuter();
+        }
+      }
+      /* Ambig word rejection was here once !!*/
+    }
+  } else {
+    tprintf("BAD tessedit_reject_mode\n");
+    ASSERT_HOST("Fatal error encountered!" == nullptr);
+  }
+
+  if (tessedit_image_border > -1)
+    reject_edge_blobs(word);
+
+  check_debug_pt (word, 10);
+  if (tessedit_rejection_debug) {
+    tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
+    tprintf("Certainty: %f Rating: %f\n",
+      word->best_choice->certainty (), word->best_choice->rating ());
+    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
+  }
+
+  flip_hyphens(word);
+  check_debug_pt(word, 20);
+}
+
+// Marks every unrecognised blob - output as a blank (' ') - as a
+// Tesseract failure in the reject map.
+void reject_blanks(WERD_RES *word) {
+  const char* str = word->best_choice->unichar_string().c_str();
+  const char* lens = word->best_choice->unichar_lengths().c_str();
+  int offset = 0;
+  for (int i = 0; str[offset] != '\0'; offset += lens[i], ++i) {
+    if (str[offset] == ' ')
+      word->reject_map[i].setrej_tess_failure();  // rej unrecognised blobs
+  }
+}
+
+// Rejects every character that belongs to the I/l/1 conflict set.
+void Tesseract::reject_I_1_L(WERD_RES *word) {
+  const char* str = word->best_choice->unichar_string().c_str();
+  const char* lens = word->best_choice->unichar_lengths().c_str();
+  int offset = 0;
+  for (int i = 0; str[offset] != '\0'; offset += lens[i], ++i) {
+    if (STRING(conflict_set_I_l_1).contains(str[offset])) {
+      word->reject_map[i].setrej_1Il_conflict();  // rej 1Il conflict
+    }
+  }
+}
+
+// Rejects spaces as failures and any character whose certainty falls
+// below the threshold derived from the largest certainty gap.
+void reject_poor_matches(WERD_RES *word) {
+  const float threshold = compute_reject_threshold(word->best_choice);
+  const int len = word->best_choice->length();
+  for (int i = 0; i < len; ++i) {
+    if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {
+      word->reject_map[i].setrej_tess_failure();
+    } else if (word->best_choice->certainty(i) < threshold) {
+      word->reject_map[i].setrej_poor_match();
+    }
+  }
+}
+
+
+/**********************************************************************
+ * compute_reject_threshold
+ *
+ * Set a rejection threshold for this word.
+ * Initially this is a trivial function which looks for the largest
+ * gap in the certainty value.
+ **********************************************************************/
+
+// Returns the certainty below which characters should be rejected.  The
+// threshold sits in the middle of the biggest gap in the sorted
+// per-character certainties (only searched for words of >= 3 blobs);
+// when no gap is found it falls below the worst certainty so that
+// everything is rejected.
+float compute_reject_threshold(WERD_CHOICE* word) {
+  const int blob_count = word->length();
+  if (blob_count == 0)
+    return 0.0f;  // Empty word: avoid indexing an empty ratings vector.
+
+  GenericVector<float> ratings;
+  ratings.resize_no_init(blob_count);
+  for (int i = 0; i < blob_count; ++i) {
+    ratings[i] = word->certainty(i);
+  }
+  ratings.sort();
+
+  float bestgap = 0.0f;             // biggest gap
+  float gapstart = ratings[0] - 1;  // all reject if none better
+  if (blob_count >= 3) {
+    for (int index = 0; index < blob_count - 1; index++) {
+      if (ratings[index + 1] - ratings[index] > bestgap) {
+        bestgap = ratings[index + 1] - ratings[index];
+        gapstart = ratings[index];  // bottom of the biggest gap
+      }
+    }
+  }
+  return gapstart + bestgap / 2;
+}
+
+
+/*************************************************************************
+ * reject_edge_blobs()
+ *
+ * If the word is perilously close to the edge of the image, reject those blobs
+ * in the word which are too close to the edge as they could be clipped.
+ *************************************************************************/
+void Tesseract::reject_edge_blobs(WERD_RES *word) {
+  const int border = tessedit_image_border;
+  const int max_x = ImageWidth() - 1;
+  const int max_y = ImageHeight() - 1;
+  const TBOX word_box = word->word->bounding_box();
+  // Use the box_word as it is already denormed back to image coordinates.
+  const int blobcount = word->box_word->length();
+
+  // Fast exit when the whole word sits comfortably inside the border.
+  if (word_box.left() >= border && word_box.bottom() >= border &&
+      word_box.right() + border <= max_x && word_box.top() + border <= max_y)
+    return;
+
+  ASSERT_HOST(word->reject_map.length() == blobcount);
+  for (int b = 0; b < blobcount; ++b) {
+    const TBOX blob_box = word->box_word->BlobBox(b);
+    if (blob_box.left() < border || blob_box.bottom() < border ||
+        blob_box.right() + border > max_x || blob_box.top() + border > max_y) {
+      word->reject_map[b].setrej_edge_char();  // Close to edge
+    }
+  }
+}
+
+/**********************************************************************
+ * one_ell_conflict()
+ *
+ * Identify words where there is a potential I/l/1 error.
+ * - A bundle of contextual heuristics!
+ *
+ * Returns true if the word contains an unresolved I/l/1 confusion; when
+ * update_map is true the offending characters are also marked in
+ * word_res->reject_map.  NOTE: the routine temporarily edits
+ * best_choice's unichar_string in place to test the flipped reading
+ * against the dictionary, then restores the original character.
+ **********************************************************************/
+bool Tesseract::one_ell_conflict(WERD_RES* word_res, bool update_map) {
+  const char *word;
+  const char *lengths;
+  int16_t word_len;                //its length
+  int16_t first_alphanum_index_;
+  int16_t first_alphanum_offset_;
+  int16_t i;
+  int16_t offset;
+  bool non_conflict_set_char;      //non conf set a/n?
+  bool conflict = false;
+  bool allow_1s;
+  ACCEPTABLE_WERD_TYPE word_type;
+  bool dict_perm_type;
+  bool dict_word_ok;
+  int dict_word_type;
+
+  word = word_res->best_choice->unichar_string().c_str();
+  lengths = word_res->best_choice->unichar_lengths().c_str();
+  word_len = strlen(lengths);
+  /*
+  If there are no occurrences of the conflict set characters then the word
+  is OK.
+  */
+  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr)
+    return false;
+
+  /*
+  There is a conflict if there are NO other (confirmed) alphanumerics apart
+  from those in the conflict set.
+  */
+
+  for (i = 0, offset = 0, non_conflict_set_char = false;
+       (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
+    non_conflict_set_char =
+        (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
+         word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
+        !STRING (conflict_set_I_l_1).contains (word[offset]);
+  if (!non_conflict_set_char) {
+    if (update_map)
+      reject_I_1_L(word_res);
+    return true;
+  }
+
+  /*
+  If the word is accepted by a dawg permuter, and the first alpha character
+  is "I" or "l", check to see if the alternative is also a dawg word. If it
+  is, then there is a potential error otherwise the word is ok.
+  */
+
+  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
+    (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
+    (rej_trust_doc_dawg &&
+    (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
+    (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
+  dict_word_type = dict_word(*(word_res->best_choice));
+  dict_word_ok = (dict_word_type > 0) &&
+    (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
+
+  if ((rej_1Il_use_dict_word && dict_word_ok) ||
+      (rej_1Il_trust_permuter_type && dict_perm_type) ||
+      (dict_perm_type && dict_word_ok)) {
+    first_alphanum_index_ = first_alphanum_index (word, lengths);
+    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
+    // Try flipping a leading 'I' to 'l': if the flipped word is also a
+    // safe dictionary word, the reading is genuinely ambiguous.
+    if (lengths[first_alphanum_index_] == 1 &&
+        word[first_alphanum_offset_] == 'I') {
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+      if (safe_dict_word(word_res) > 0) {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+        if (update_map)
+          word_res->reject_map[first_alphanum_index_].
+              setrej_1Il_conflict();
+        return true;
+      }
+      else {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+        return false;
+      }
+    }
+
+    // Symmetric check: flip a leading 'l' to 'I'.
+    if (lengths[first_alphanum_index_] == 1 &&
+        word[first_alphanum_offset_] == 'l') {
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+      if (safe_dict_word(word_res) > 0) {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+        if (update_map)
+          word_res->reject_map[first_alphanum_index_].
+              setrej_1Il_conflict();
+        return true;
+      }
+      else {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+        return false;
+      }
+    }
+    return false;
+  }
+
+  /*
+  NEW 1Il code. The old code relied on permuter types too much. In fact,
+  tess will use TOP_CHOICE permute for good things like "palette".
+  In this code the string is examined independently to see if it looks like
+  a well formed word.
+  */
+
+  /*
+  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
+  dictionary word.
+  */
+  first_alphanum_index_ = first_alphanum_index (word, lengths);
+  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
+  if (lengths[first_alphanum_index_] == 1 &&
+      word[first_alphanum_offset_] == 'l') {
+    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+    if (safe_dict_word(word_res) > 0)
+      return false;          // The flipped reading is kept in place.
+    else
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+  }
+  else if (lengths[first_alphanum_index_] == 1 &&
+           word[first_alphanum_offset_] == 'I') {
+    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+    if (safe_dict_word(word_res) > 0)
+      return false;          // The flipped reading is kept in place.
+    else
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+  }
+  /*
+  For strings containing digits:
+  If there are no alphas OR the numeric permuter liked the word,
+  reject any non 1 conflict chs
+  Else reject all conflict chs
+  */
+  if (word_contains_non_1_digit (word, lengths)) {
+    allow_1s = (alpha_count (word, lengths) == 0) ||
+      (word_res->best_choice->permuter () == NUMBER_PERM);
+
+    int16_t offset;
+    conflict = false;
+    for (i = 0, offset = 0; word[offset] != '\0';
+         offset += word_res->best_choice->unichar_lengths()[i++]) {
+      if ((!allow_1s || (word[offset] != '1')) &&
+          STRING (conflict_set_I_l_1).contains (word[offset])) {
+        if (update_map)
+          word_res->reject_map[i].setrej_1Il_conflict ();
+        conflict = true;
+      }
+    }
+    return conflict;
+  }
+  /*
+  For anything else. See if it conforms to an acceptable word type. If so,
+  treat accordingly.
+  */
+  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
+  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
+    first_alphanum_index_ = first_alphanum_index (word, lengths);
+    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
+    // Well-formed word: only a leading conflict-set char is suspicious.
+    if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
+      if (update_map)
+        word_res->reject_map[first_alphanum_index_].
+            setrej_1Il_conflict ();
+      return true;
+    }
+    else
+      return false;
+  }
+  else if (word_type == AC_UPPER_CASE) {
+    return false;
+  }
+  else {
+    // Malformed word: reject every conflict-set character.
+    if (update_map)
+      reject_I_1_L(word_res);
+    return true;
+  }
+}
+
+
+// Returns the index (in unichar units) of the first alphanumeric
+// character in the word, or -1 if there is none.
+int16_t Tesseract::first_alphanum_index(const char *word,
+                                        const char *word_lengths) {
+  int16_t index = 0;
+  for (int16_t offset = 0; word[offset] != '\0';
+       offset += word_lengths[index++]) {
+    if (unicharset.get_isalpha(word + offset, word_lengths[index]) ||
+        unicharset.get_isdigit(word + offset, word_lengths[index]))
+      return index;
+  }
+  return -1;
+}
+
+// Returns the byte offset of the first alphanumeric character in the
+// word, or -1 if there is none.
+int16_t Tesseract::first_alphanum_offset(const char *word,
+                                         const char *word_lengths) {
+  int16_t index = 0;
+  for (int16_t offset = 0; word[offset] != '\0';
+       offset += word_lengths[index++]) {
+    if (unicharset.get_isalpha(word + offset, word_lengths[index]) ||
+        unicharset.get_isdigit(word + offset, word_lengths[index]))
+      return offset;
+  }
+  return -1;
+}
+
+// Counts the alphabetic characters in the word.
+int16_t Tesseract::alpha_count(const char *word,
+                               const char *word_lengths) {
+  int16_t count = 0;
+  int16_t idx = 0;
+  for (int16_t offset = 0; word[offset] != '\0';
+       offset += word_lengths[idx++]) {
+    if (unicharset.get_isalpha(word + offset, word_lengths[idx]))
+      ++count;
+  }
+  return count;
+}
+
+
+// Returns true if the word contains any digit other than '1'.
+bool Tesseract::word_contains_non_1_digit(const char* word,
+                                          const char* word_lengths) {
+  int16_t idx = 0;
+  for (int16_t offset = 0; word[offset] != '\0';
+       offset += word_lengths[idx++]) {
+    const bool is_digit =
+        unicharset.get_isdigit(word + offset, word_lengths[idx]);
+    if (is_digit && (word_lengths[idx] != 1 || word[offset] != '1'))
+      return true;
+  }
+  return false;
+}
+
+/*************************************************************************
+ * dont_allow_1Il()
+ * Don't unreject LONE accepted 1Il conflict set chars
+ *
+ * If the only accepted characters in the word come from the I/l/1
+ * conflict set (no other accepted alphanumeric confirms the word), then
+ * those accepted conflict characters are re-rejected.
+ *************************************************************************/
+void Tesseract::dont_allow_1Il(WERD_RES *word) {
+  int i = 0;
+  int offset;
+  int word_len = word->reject_map.length();
+  const char *s = word->best_choice->unichar_string().c_str();
+  const char *lengths = word->best_choice->unichar_lengths().c_str();
+  bool accepted_1Il = false;
+
+  // Pass 1: scan for an accepted non-conflict alphanumeric.  Finding one
+  // means the conflict-set characters have contextual support, so keep them.
+  for (i = 0, offset = 0; i < word_len;
+       offset += word->best_choice->unichar_lengths()[i++]) {
+    if (word->reject_map[i].accepted()) {
+      if (STRING(conflict_set_I_l_1).contains(s[offset])) {
+        accepted_1Il = true;
+      } else {
+        if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
+            word->uch_set->get_isdigit(s + offset, lengths[i]))
+          return; // >=1 non 1Il ch accepted
+      }
+    }
+  }
+  if (!accepted_1Il)
+    return;                      //Nothing to worry about
+
+  // Pass 2: no supporting alphanumeric found - reject every accepted
+  // conflict-set character.
+  for (i = 0, offset = 0; i < word_len;
+       offset += word->best_choice->unichar_lengths()[i++]) {
+    if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
+        word->reject_map[i].accepted())
+      word->reject_map[i].setrej_postNN_1Il();
+  }
+}
+
+
+// Counts the accepted alphanumeric characters in the word.
+int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
+  const WERD_CHOICE* choice = word_res->best_choice;
+  const int len = word_res->reject_map.length();
+  int16_t total = 0;
+  for (int i = 0; i < len; ++i) {
+    if (!word_res->reject_map[i].accepted())
+      continue;
+    if (word_res->uch_set->get_isalpha(choice->unichar_id(i)) ||
+        word_res->uch_set->get_isdigit(choice->unichar_id(i)))
+      ++total;
+  }
+  return total;
+}
+
+
+// Rejects the whole word when the fraction of rejected characters
+// reaches rej_whole_of_mostly_reject_word_fract.
+void Tesseract::reject_mostly_rejects(WERD_RES *word) {
+  const float rejected = word->reject_map.reject_count();
+  const float fraction = rejected / word->reject_map.length();
+  if (fraction >= rej_whole_of_mostly_reject_word_fract)
+    word->reject_map.rej_word_mostly_rej();
+}
+
+
+// Returns true for a word of more than one character that consists of a
+// single repeated unichar from ok_repeated_ch_non_alphanum_wds, where
+// every blob matched well and was accepted.
+bool Tesseract::repeated_nonalphanum_wd(WERD_RES* word, ROW* row) {
+  const WERD_CHOICE* choice = word->best_choice;
+
+  if (choice->unichar_lengths().length() <= 1)
+    return false;
+
+  if (!STRING(ok_repeated_ch_non_alphanum_wds).
+      contains(choice->unichar_string()[0]))
+    return false;
+
+  // Every character must be the same unichar as the first.
+  const UNICHAR_ID uch_id = choice->unichar_id(0);
+  for (int i = 1; i < choice->length(); ++i) {
+    if (choice->unichar_id(i) != uch_id)
+      return false;
+  }
+
+  int16_t char_quality;
+  int16_t accepted_char_quality;
+  word_char_quality(word, &char_quality, &accepted_char_quality);
+
+  // All characters must be well-formed and all of those accepted.
+  return choice->unichar_lengths().length() == char_quality &&
+         char_quality == accepted_char_quality;
+}
+
+// Returns the dictionary permuter type of the best choice, except that
+// document-dawg-only matches are treated as unsafe and reported as 0.
+int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
+  const int perm = werd_res->tesseract->dict_word(*werd_res->best_choice);
+  if (perm == DOC_DAWG_PERM)
+    return 0;
+  return perm;
+}
+
+// Note: After running this function word_res->ratings
+// might not contain the right BLOB_CHOICE corresponding to each character
+// in word_res->best_choice.
+//
+// Flips '.' characters whose blobs are wide enough to be hyphens into '-'
+// and adjusts the reject map accordingly.  The aspect-ratio thresholds
+// tessedit_lower_flip_hyphen (suspected) and tessedit_upper_flip_hyphen
+// (certain) drive the decisions.
+void Tesseract::flip_hyphens(WERD_RES *word_res) {
+  WERD_CHOICE *best_choice = word_res->best_choice;
+  int i;
+  int prev_right = -9999;
+  int next_left;
+  TBOX out_box;
+  float aspect_ratio;
+
+  // A threshold <= 1 disables hyphen flipping entirely.
+  if (tessedit_lower_flip_hyphen <= 1)
+    return;
+
+  int num_blobs = word_res->rebuild_word->NumBlobs();
+  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
+  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
+    TBLOB* blob = word_res->rebuild_word->blobs[i];
+    out_box = blob->bounding_box();
+    if (i + 1 == num_blobs)
+      next_left = 9999;
+    else
+      next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
+    // Don't touch small or touching blobs - it is too dangerous.
+    if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
+        (out_box.left() > prev_right) && (out_box.right() < next_left)) {
+      aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
+      if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
+        if (aspect_ratio >= tessedit_upper_flip_hyphen &&
+            word_res->uch_set->contains_unichar_id(unichar_dash) &&
+            word_res->uch_set->get_enabled(unichar_dash)) {
+          /* Certain HYPHEN */
+          best_choice->set_unichar_id(unichar_dash, i);
+          if (word_res->reject_map[i].rejected())
+            word_res->reject_map[i].setrej_hyphen_accept();
+        }
+        if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
+            word_res->reject_map[i].accepted())
+          //Suspected HYPHEN
+          word_res->reject_map[i].setrej_hyphen ();
+      }
+      else if (best_choice->unichar_id(i) == unichar_dash) {
+        if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
+            (word_res->reject_map[i].rejected()))
+          word_res->reject_map[i].setrej_hyphen_accept();
+        //Certain HYPHEN
+
+        if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
+            (word_res->reject_map[i].accepted()))
+          //Suspected HYPHEN
+          word_res->reject_map[i].setrej_hyphen();
+      }
+    }
+    prev_right = out_box.right();
+  }
+}
+
+// Note: After running this function word_res->ratings
+// might not contain the right BLOB_CHOICE corresponding to each character
+// in word_res->best_choice.
+//
+// Applies positional heuristics to flip '0' <-> 'O' so that letters appear
+// between letters and digits between digits (e.g. A0A -> AOA, 9O9 -> 909).
+// Each /* pattern */ comment below names the context being repaired.
+void Tesseract::flip_0O(WERD_RES *word_res) {
+  WERD_CHOICE *best_choice = word_res->best_choice;
+  int i;
+  TBOX out_box;
+
+  if (!tessedit_flip_0O)
+    return;
+
+  // Bail out if any upper/digit blob lies outside the normal x-height band:
+  // sub/superscripts make the positional heuristics unreliable.
+  int num_blobs = word_res->rebuild_word->NumBlobs();
+  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
+    TBLOB* blob = word_res->rebuild_word->blobs[i];
+    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
+        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
+      out_box = blob->bounding_box();
+      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
+          (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
+        return;                  //Beware words with sub/superscripts
+    }
+  }
+  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
+  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
+  if (unichar_0 == INVALID_UNICHAR_ID ||
+      !word_res->uch_set->get_enabled(unichar_0) ||
+      unichar_O == INVALID_UNICHAR_ID ||
+      !word_res->uch_set->get_enabled(unichar_O)) {
+    return;  // 0 or O are not present/enabled in unicharset
+  }
+  // Starting at i = 1 makes the i-1 accesses below always valid.
+  for (i = 1; i < best_choice->length(); ++i) {
+    if (best_choice->unichar_id(i) == unichar_0 ||
+        best_choice->unichar_id(i) == unichar_O) {
+      /* A0A */
+      if ((i+1) < best_choice->length() &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
+        best_choice->set_unichar_id(unichar_O, i);
+      }
+      /* A00A */
+      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+1) < best_choice->length() &&
+          (best_choice->unichar_id(i+1) == unichar_0 ||
+           best_choice->unichar_id(i+1) == unichar_O) &&
+          (i+2) < best_choice->length() &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
+        best_choice->set_unichar_id(unichar_O, i);
+        i++;
+      }
+      /* AA0<non digit or end of word> */
+      if ((i > 1) &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (((i+1) < best_choice->length() &&
+            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
+            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
+            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
+           (i == best_choice->length() - 1))) {
+        best_choice->set_unichar_id(unichar_O, i);
+      }
+      /* 9O9 */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+1) < best_choice->length() &&
+          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
+        best_choice->set_unichar_id(unichar_0, i);
+      }
+      /* 9OOO */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+2) < best_choice->length() &&
+          (best_choice->unichar_id(i+1) == unichar_0 ||
+           best_choice->unichar_id(i+1) == unichar_O) &&
+          (best_choice->unichar_id(i+2) == unichar_0 ||
+           best_choice->unichar_id(i+2) == unichar_O)) {
+        best_choice->set_unichar_id(unichar_0, i);
+        best_choice->set_unichar_id(unichar_0, i+1);
+        best_choice->set_unichar_id(unichar_0, i+2);
+        i += 2;
+      }
+      /* 9OO<non upper> */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+2) < best_choice->length() &&
+          (best_choice->unichar_id(i+1) == unichar_0 ||
+           best_choice->unichar_id(i+1) == unichar_O) &&
+          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
+        best_choice->set_unichar_id(unichar_0, i);
+        best_choice->set_unichar_id(unichar_0, i+1);
+        i++;
+      }
+      /* 9O<non upper> */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+1) < best_choice->length() &&
+          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
+        best_choice->set_unichar_id(unichar_0, i);
+      }
+      /* 9[.,]OOO.. */
+      if ((i > 1) &&
+          (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
+           word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
+          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
+           best_choice->unichar_id(i-2) == unichar_O)) {
+        if (best_choice->unichar_id(i-2) == unichar_O) {
+          best_choice->set_unichar_id(unichar_0, i-2);
+        }
+        // Convert the whole run of 0/O after the separator to digits.
+        while (i < best_choice->length() &&
+               (best_choice->unichar_id(i) == unichar_O ||
+                best_choice->unichar_id(i) == unichar_0)) {
+          best_choice->set_unichar_id(unichar_0, i);
+          i++;
+        }
+        i--;
+      }
+    }
+  }
+}
+
+// True for an upper-case character that is not 'O'.
+bool Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
+  if (!ch_set.get_isupper(unichar_id))
+    return false;
+  return !ch_set.eq(unichar_id, "O");
+}
+
+// True for a digit that is not '0'.
+bool Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
+  if (!ch_set.get_isdigit(unichar_id))
+    return false;
+  return !ch_set.eq(unichar_id, "0");
+}
+} // namespace tesseract
+
+#endif // def DISABLED_LEGACY_ENGINE
diff --git a/tesseract/src/ccmain/reject.h b/tesseract/src/ccmain/reject.h
new file mode 100644
index 00000000..e144813a
--- /dev/null
+++ b/tesseract/src/ccmain/reject.h
@@ -0,0 +1,39 @@
+/**********************************************************************
+ * File: reject.h
+ * Description: Rejection functions used in tessedit
+ * Author: Phil Cheatle
+ * Created: Wed Sep 23 16:50:21 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef REJECT_H
+#define REJECT_H
+
+namespace tesseract {
+
+class WERD_CHOICE;
+class WERD_RES;
+
+void reject_blanks(WERD_RES *word);
+void reject_poor_matches(WERD_RES *word);
+float compute_reject_threshold(WERD_CHOICE* word);
+bool word_contains_non_1_digit(const char* word, const char* word_lengths);
+void dont_allow_1Il(WERD_RES *word);
+void flip_hyphens(WERD_RES *word);
+void flip_0O(WERD_RES *word);
+bool non_0_digit(const char* str, int length);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/ccmain/resultiterator.cpp b/tesseract/src/ccmain/resultiterator.cpp
new file mode 100644
index 00000000..d8f537f2
--- /dev/null
+++ b/tesseract/src/ccmain/resultiterator.cpp
@@ -0,0 +1,752 @@
+///////////////////////////////////////////////////////////////////////
+// File: resultiterator.cpp
+// Description: Iterator for tesseract results that is capable of
+// iterating in proper reading order over Bi Directional
+// (e.g. mixed Hebrew and English) text.
+// Author: David Eger
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/resultiterator.h>
+
+#include "pageres.h"
+#include "tesseractclass.h"
+#include "unicharset.h"
+
+#include "allheaders.h"
+
+#include <set>
+#include <vector>
+
+static const char * const kLRM = "\u200E"; // Left-to-Right Mark
+static const char * const kRLM = "\u200F"; // Right-to-Left Mark
+
+namespace tesseract {
+
+ResultIterator::ResultIterator(const LTRResultIterator& resit)
+ : LTRResultIterator(resit) {
+ in_minor_direction_ = false;
+ at_beginning_of_minor_run_ = false;
+ preserve_interword_spaces_ = false;
+
+ auto* p = ParamUtils::FindParam<BoolParam>("preserve_interword_spaces",
+ GlobalParams()->bool_params,
+ tesseract_->params()->bool_params);
+ if (p != nullptr)
+ preserve_interword_spaces_ = (bool)(*p);
+
+ current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+ MoveToLogicalStartOfTextline();
+}
+
+ResultIterator* ResultIterator::StartOfParagraph(
+ const LTRResultIterator& resit) {
+ return new ResultIterator(resit);
+}
+
+bool ResultIterator::ParagraphIsLtr() const {
+ return current_paragraph_is_ltr_;
+}
+
+bool ResultIterator::CurrentParagraphIsLtr() const {
+ if (!it_->word())
+ return true; // doesn't matter.
+ LTRResultIterator it(*this);
+ it.RestartParagraph();
+ // Try to figure out the ltr-ness of the paragraph. The rules below
+ // make more sense in the context of a difficult paragraph example.
+ // Here we denote {ltr characters, RTL CHARACTERS}:
+ //
+ // "don't go in there!" DAIS EH
+ // EHT OTNI DEPMUJ FELSMIH NEHT DNA
+ // .GNIDLIUB GNINRUB
+ //
+ // On the first line, the left-most word is LTR and the rightmost word
+ // is RTL. Thus, we are better off taking the majority direction for
+ // the whole paragraph contents. So instead of "the leftmost word is LTR"
+ // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
+ // would not do: Typically an RTL paragraph would *not* start with an LTR
+ // word. So our heuristics are as follows:
+ //
+ // (1) If the first text line has an RTL word in the left-most position
+ // it is RTL.
+ // (2) If the first text line has an LTR word in the right-most position
+ // it is LTR.
+ // (3) If neither of the above is true, take the majority count for the
+ // paragraph -- if there are more rtl words, it is RTL. If there
+ // are more LTR words, it's LTR.
+ bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
+ bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
+ int num_ltr, num_rtl;
+ num_rtl = leftmost_rtl ? 1 : 0;
+ num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
+ for (it.Next(RIL_WORD);
+ !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
+ it.Next(RIL_WORD)) {
+ StrongScriptDirection dir = it.WordDirection();
+ rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
+ num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
+ num_ltr += rightmost_ltr ? 1 : 0;
+ }
+ if (leftmost_rtl)
+ return false;
+ if (rightmost_ltr)
+ return true;
+ // First line is ambiguous. Take statistics on the whole paragraph.
+ if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA))
+ do {
+ StrongScriptDirection dir = it.WordDirection();
+ num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
+ num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
+ } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
+ return num_ltr >= num_rtl;
+}
+
+const int ResultIterator::kMinorRunStart = -1;
+const int ResultIterator::kMinorRunEnd = -2;
+const int ResultIterator::kComplexWord = -3;
+
+void ResultIterator::CalculateBlobOrder(
+ std::vector<int>* blob_indices) const {
+ bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
+ blob_indices->clear();
+ if (Empty(RIL_WORD))
+ return;
+ if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
+ // Easy! just return the blobs in order;
+ for (int i = 0; i < word_length_; i++) blob_indices->push_back(i);
+ return;
+ }
+
+ // The blobs are in left-to-right order, but the current reading context
+ // is right-to-left.
+ const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
+ const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
+ const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
+ const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
+ const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
+ const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
+ const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
+
+ // Step 1: Scan for and mark European Number sequences
+ // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
+ GenericVector<int> letter_types;
+ for (int i = 0; i < word_length_; i++) {
+ letter_types.push_back(it_->word()->SymbolDirection(i));
+ }
+  // Convert a single separator sandwiched between two EN's into an EN.
+ for (int i = 0; i + 2 < word_length_; i++) {
+ if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
+ (letter_types[i + 1] == U_EURO_NUM_SEP ||
+ letter_types[i + 1] == U_COMMON_NUM_SEP)) {
+ letter_types[i + 1] = U_EURO_NUM;
+ }
+ }
+ // Scan for sequences of European Number Terminators around ENs and convert
+ // them to ENs.
+ for (int i = 0; i < word_length_; i++) {
+ if (letter_types[i] == U_EURO_NUM_TERM) {
+ int j = i + 1;
+ while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
+ j++;
+ }
+ if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
+ // The sequence [i..j] should be converted to all European Numbers.
+ for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
+ }
+ j = i - 1;
+ while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
+ j--;
+ }
+ if (j > -1 && letter_types[j] == U_EURO_NUM) {
+ // The sequence [j..i] should be converted to all European Numbers.
+ for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
+ }
+ }
+ }
+ // Step 2: Convert all remaining types to either L or R.
+ // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
+ // All other are R.
+ for (int i = 0; i < word_length_;) {
+ int ti = letter_types[i];
+ if (ti == U_LTR || ti == U_EURO_NUM) {
+ // Left to right sequence; scan to the end of it.
+ int last_good = i;
+ for (int j = i + 1; j < word_length_; j++) {
+ int tj = letter_types[j];
+ if (tj == U_LTR || tj == U_EURO_NUM) {
+ last_good = j;
+ } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
+ // do nothing.
+ } else {
+ break;
+ }
+ }
+ // [i..last_good] is the L sequence
+ for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
+ i = last_good + 1;
+ } else {
+ letter_types[i] = U_RTL;
+ i++;
+ }
+ }
+
+ // At this point, letter_types is entirely U_LTR or U_RTL.
+ for (int i = word_length_ - 1; i >= 0;) {
+ if (letter_types[i] == U_RTL) {
+ blob_indices->push_back(i);
+ i--;
+ } else {
+ // left to right sequence. scan to the beginning.
+ int j = i - 1;
+ for (; j >= 0 && letter_types[j] != U_RTL; j--) {
+ } // pass
+ // Now (j, i] is LTR
+ for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
+ i = j;
+ }
+ }
+ ASSERT_HOST(blob_indices->size() == word_length_);
+}
+
+static void PrintScriptDirs(const std::vector<StrongScriptDirection>& dirs) {
+ for (int i = 0; i < dirs.size(); i++) {
+ switch (dirs[i]) {
+ case DIR_NEUTRAL:
+ tprintf("N ");
+ break;
+ case DIR_LEFT_TO_RIGHT:
+ tprintf("L ");
+ break;
+ case DIR_RIGHT_TO_LEFT:
+ tprintf("R ");
+ break;
+ case DIR_MIX:
+ tprintf("Z ");
+ break;
+ default:
+ tprintf("? ");
+ break;
+ }
+ }
+ tprintf("\n");
+}
+
+void ResultIterator::CalculateTextlineOrder(
+ bool paragraph_is_ltr, const LTRResultIterator& resit,
+ std::vector<int>* word_indices) const {
+ std::vector<StrongScriptDirection> directions;
+ CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
+}
+
+void ResultIterator::CalculateTextlineOrder(
+ bool paragraph_is_ltr, const LTRResultIterator& resit,
+ std::vector<StrongScriptDirection>* dirs_arg,
+ std::vector<int>* word_indices) const {
+ std::vector<StrongScriptDirection> dirs;
+ std::vector<StrongScriptDirection>* directions;
+ directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
+ directions->clear();
+
+ // A LTRResultIterator goes strictly left-to-right word order.
+ LTRResultIterator ltr_it(resit);
+ ltr_it.RestartRow();
+ if (ltr_it.Empty(RIL_WORD))
+ return;
+ do {
+ directions->push_back(ltr_it.WordDirection());
+ } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
+
+ word_indices->clear();
+ CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
+}
+
+void ResultIterator::CalculateTextlineOrder(
+ bool paragraph_is_ltr,
+ const std::vector<StrongScriptDirection>& word_dirs,
+ std::vector<int>* reading_order) {
+ reading_order->clear();
+ if (word_dirs.size() == 0)
+ return;
+
+ // Take all of the runs of minor direction words and insert them
+ // in reverse order.
+ int minor_direction, major_direction, major_step, start, end;
+ if (paragraph_is_ltr) {
+ start = 0;
+ end = word_dirs.size();
+ major_step = 1;
+ major_direction = DIR_LEFT_TO_RIGHT;
+ minor_direction = DIR_RIGHT_TO_LEFT;
+ } else {
+ start = word_dirs.size() - 1;
+ end = -1;
+ major_step = -1;
+ major_direction = DIR_RIGHT_TO_LEFT;
+ minor_direction = DIR_LEFT_TO_RIGHT;
+ // Special rule: if there are neutral words at the right most side
+ // of a line adjacent to a left-to-right word in the middle of the
+ // line, we interpret the end of the line as a single LTR sequence.
+ if (word_dirs[start] == DIR_NEUTRAL) {
+ int neutral_end = start;
+ while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
+ neutral_end--;
+ }
+ if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
+ // LTR followed by neutrals.
+ // Scan for the beginning of the minor left-to-right run.
+ int left = neutral_end;
+ for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
+ if (word_dirs[i] == DIR_LEFT_TO_RIGHT)
+ left = i;
+ }
+ reading_order->push_back(kMinorRunStart);
+ for (int i = left; i < word_dirs.size(); i++) {
+ reading_order->push_back(i);
+ if (word_dirs[i] == DIR_MIX)
+ reading_order->push_back(kComplexWord);
+ }
+ reading_order->push_back(kMinorRunEnd);
+ start = left - 1;
+ }
+ }
+ }
+ for (int i = start; i != end;) {
+ if (word_dirs[i] == minor_direction) {
+ int j = i;
+ while (j != end && word_dirs[j] != major_direction) j += major_step;
+ if (j == end)
+ j -= major_step;
+ while (j != i && word_dirs[j] != minor_direction) j -= major_step;
+ // [j..i] is a minor direction run.
+ reading_order->push_back(kMinorRunStart);
+ for (int k = j; k != i; k -= major_step) {
+ reading_order->push_back(k);
+ }
+ reading_order->push_back(i);
+ reading_order->push_back(kMinorRunEnd);
+ i = j + major_step;
+ } else {
+ reading_order->push_back(i);
+ if (word_dirs[i] == DIR_MIX)
+ reading_order->push_back(kComplexWord);
+ i += major_step;
+ }
+ }
+}
+
+int ResultIterator::LTRWordIndex() const {
+ int this_word_index = 0;
+ LTRResultIterator textline(*this);
+ textline.RestartRow();
+ while (!textline.PositionedAtSameWord(it_)) {
+ this_word_index++;
+ textline.Next(RIL_WORD);
+ }
+ return this_word_index;
+}
+
+void ResultIterator::MoveToLogicalStartOfWord() {
+ if (word_length_ == 0) {
+ BeginWord(0);
+ return;
+ }
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ if (blob_order.size() == 0 || blob_order[0] == 0)
+ return;
+ BeginWord(blob_order[0]);
+}
+
+bool ResultIterator::IsAtFinalSymbolOfWord() const {
+ if (!it_->word())
+ return true;
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ return blob_order.size() == 0 || blob_order.back() == blob_index_;
+}
+
+bool ResultIterator::IsAtFirstSymbolOfWord() const {
+ if (!it_->word())
+ return true;
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ return blob_order.size() == 0 || blob_order[0] == blob_index_;
+}
+
+void ResultIterator::AppendSuffixMarks(std::string* text) const {
+ if (!it_->word())
+ return;
+ bool reading_direction_is_ltr =
+ current_paragraph_is_ltr_ ^ in_minor_direction_;
+ // scan forward to see what meta-information the word ordering algorithm
+ // left us.
+ // If this word is at the *end* of a minor run, insert the other
+ // direction's mark; else if this was a complex word, insert the
+ // current reading order's mark.
+ std::vector<int> textline_order;
+ CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order);
+ int this_word_index = LTRWordIndex();
+ size_t i = 0;
+ for (const auto word_index : textline_order) {
+ if (word_index == this_word_index) {
+ break;
+ }
+ i++;
+ }
+ if (i == textline_order.size()) {
+ return;
+ }
+
+ int last_non_word_mark = 0;
+ for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
+ last_non_word_mark = textline_order[i];
+ }
+ if (last_non_word_mark == kComplexWord) {
+ *text += reading_direction_is_ltr ? kLRM : kRLM;
+ } else if (last_non_word_mark == kMinorRunEnd) {
+ if (current_paragraph_is_ltr_) {
+ *text += kLRM;
+ } else {
+ *text += kRLM;
+ }
+ }
+}
+
+void ResultIterator::MoveToLogicalStartOfTextline() {
+ std::vector<int> word_indices;
+ RestartRow();
+ CalculateTextlineOrder(current_paragraph_is_ltr_,
+ dynamic_cast<const LTRResultIterator&>(*this),
+ &word_indices);
+ int i = 0;
+ for (; i < word_indices.size() && word_indices[i] < 0; i++) {
+ if (word_indices[i] == kMinorRunStart)
+ in_minor_direction_ = true;
+ else if (word_indices[i] == kMinorRunEnd)
+ in_minor_direction_ = false;
+ }
+ if (in_minor_direction_)
+ at_beginning_of_minor_run_ = true;
+ if (i >= word_indices.size())
+ return;
+ int first_word_index = word_indices[i];
+ for (int j = 0; j < first_word_index; j++) {
+ PageIterator::Next(RIL_WORD);
+ }
+ MoveToLogicalStartOfWord();
+}
+
+void ResultIterator::Begin() {
+ LTRResultIterator::Begin();
+ current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+ in_minor_direction_ = false;
+ at_beginning_of_minor_run_ = false;
+ MoveToLogicalStartOfTextline();
+}
+
+bool ResultIterator::Next(PageIteratorLevel level) {
+ if (it_->block() == nullptr)
+ return false; // already at end!
+ switch (level) {
+ case RIL_BLOCK: // explicit fall-through
+ case RIL_PARA: // explicit fall-through
+ case RIL_TEXTLINE:
+ if (!PageIterator::Next(level))
+ return false;
+ if (IsWithinFirstTextlineOfParagraph()) {
+ // if we've advanced to a new paragraph,
+ // recalculate current_paragraph_is_ltr_
+ current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+ }
+ in_minor_direction_ = false;
+ MoveToLogicalStartOfTextline();
+ return it_->block() != nullptr;
+ case RIL_SYMBOL: {
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ int next_blob = 0;
+ while (next_blob < blob_order.size() &&
+ blob_index_ != blob_order[next_blob])
+ next_blob++;
+ next_blob++;
+ if (next_blob < blob_order.size()) {
+ // we're in the same word; simply advance one blob.
+ BeginWord(blob_order[next_blob]);
+ at_beginning_of_minor_run_ = false;
+ return true;
+ }
+ level = RIL_WORD; // we've fallen through to the next word.
+ }
+ // Fall through.
+ case RIL_WORD: // explicit fall-through.
+ {
+ if (it_->word() == nullptr)
+ return Next(RIL_BLOCK);
+ std::vector<int> word_indices;
+ int this_word_index = LTRWordIndex();
+ CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
+ int final_real_index = word_indices.size() - 1;
+ while (final_real_index > 0 && word_indices[final_real_index] < 0)
+ final_real_index--;
+ for (int i = 0; i < final_real_index; i++) {
+ if (word_indices[i] == this_word_index) {
+ int j = i + 1;
+ for (; j < final_real_index && word_indices[j] < 0; j++) {
+ if (word_indices[j] == kMinorRunStart)
+ in_minor_direction_ = true;
+ if (word_indices[j] == kMinorRunEnd)
+ in_minor_direction_ = false;
+ }
+ at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
+ // awesome, we move to word_indices[j]
+ if (BidiDebug(3)) {
+ tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index,
+ word_indices[j]);
+ }
+ PageIterator::RestartRow();
+ for (int k = 0; k < word_indices[j]; k++) {
+ PageIterator::Next(RIL_WORD);
+ }
+ MoveToLogicalStartOfWord();
+ return true;
+ }
+ }
+ if (BidiDebug(3)) {
+ tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
+ }
+ // we're going off the end of the text line.
+ return Next(RIL_TEXTLINE);
+ }
+ }
+ ASSERT_HOST(false); // shouldn't happen.
+ return false;
+}
+
+bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
+ if (it_->block() == nullptr)
+ return false; // Already at the end!
+ if (it_->word() == nullptr)
+ return true; // In an image block.
+ if (level == RIL_SYMBOL)
+ return true; // Always at beginning of a symbol.
+
+ bool at_word_start = IsAtFirstSymbolOfWord();
+ if (level == RIL_WORD)
+ return at_word_start;
+
+ ResultIterator line_start(*this);
+ // move to the first word in the line...
+ line_start.MoveToLogicalStartOfTextline();
+
+ bool at_textline_start = at_word_start && *line_start.it_ == *it_;
+ if (level == RIL_TEXTLINE)
+ return at_textline_start;
+
+ // now we move to the left-most word...
+ line_start.RestartRow();
+ bool at_block_start = at_textline_start &&
+ line_start.it_->block() != line_start.it_->prev_block();
+ if (level == RIL_BLOCK)
+ return at_block_start;
+
+ bool at_para_start =
+ at_block_start ||
+ (at_textline_start && line_start.it_->row()->row->para() !=
+ line_start.it_->prev_row()->row->para());
+ if (level == RIL_PARA)
+ return at_para_start;
+
+ ASSERT_HOST(false); // shouldn't happen.
+ return false;
+}
+
+/**
+ * NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the
+ * change that the variable next is now a ResultIterator instead of a
+ * PageIterator.
+ */
+bool ResultIterator::IsAtFinalElement(PageIteratorLevel level,
+ PageIteratorLevel element) const {
+ if (Empty(element))
+ return true; // Already at the end!
+ // The result is true if we step forward by element and find we are
+  // at the end of the page or at beginning of *all* levels in:
+ // [level, element).
+ // When there is more than one level difference between element and level,
+ // we could for instance move forward one symbol and still be at the first
+ // word on a line, so we also have to be at the first symbol in a word.
+ ResultIterator next(*this);
+ next.Next(element);
+ if (next.Empty(element))
+ return true; // Reached the end of the page.
+ while (element > level) {
+ element = static_cast<PageIteratorLevel>(element - 1);
+ if (!next.IsAtBeginningOf(element))
+ return false;
+ }
+ return true;
+}
+
+// Returns the number of blanks before the current word.
+int ResultIterator::BlanksBeforeWord() const {
+ if (CurrentParagraphIsLtr())
+ return LTRResultIterator::BlanksBeforeWord();
+ return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
+}
+
+/**
+ * Returns the null terminated UTF-8 encoded text string for the current
+ * object at the given level. Use delete [] to free after use.
+ */
+char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+ if (it_->word() == nullptr)
+ return nullptr; // Already at the end!
+ std::string text;
+ switch (level) {
+ case RIL_BLOCK: {
+ ResultIterator pp(*this);
+ do {
+ pp.AppendUTF8ParagraphText(&text);
+ } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
+ } break;
+ case RIL_PARA:
+ AppendUTF8ParagraphText(&text);
+ break;
+ case RIL_TEXTLINE: {
+ ResultIterator it(*this);
+ it.MoveToLogicalStartOfTextline();
+ it.IterateAndAppendUTF8TextlineText(&text);
+ } break;
+ case RIL_WORD:
+ AppendUTF8WordText(&text);
+ break;
+ case RIL_SYMBOL: {
+ bool reading_direction_is_ltr =
+ current_paragraph_is_ltr_ ^ in_minor_direction_;
+ if (at_beginning_of_minor_run_) {
+ text += reading_direction_is_ltr ? kLRM : kRLM;
+ }
+ text = it_->word()->BestUTF8(blob_index_, false);
+ if (IsAtFinalSymbolOfWord())
+ AppendSuffixMarks(&text);
+ } break;
+ }
+ int length = text.length() + 1;
+ char* result = new char[length];
+ strncpy(result, text.c_str(), length);
+ return result;
+}
+std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
+ResultIterator::GetRawLSTMTimesteps() const {
+ if (it_->word() != nullptr) {
+ return &it_->word()->segmented_timesteps;
+ } else {
+ return nullptr;
+ }
+}
+
+std::vector<std::vector<std::pair<const char*, float>>>*
+ResultIterator::GetBestLSTMSymbolChoices() const {
+ if (it_->word() != nullptr) {
+ return &it_->word()->CTC_symbol_choices;
+ } else {
+ return nullptr;
+ }
+}
+
+void ResultIterator::AppendUTF8WordText(std::string* text) const {
+ if (!it_->word())
+ return;
+ ASSERT_HOST(it_->word()->best_choice != nullptr);
+ bool reading_direction_is_ltr =
+ current_paragraph_is_ltr_ ^ in_minor_direction_;
+ if (at_beginning_of_minor_run_) {
+ *text += reading_direction_is_ltr ? kLRM : kRLM;
+ }
+
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ for (int i = 0; i < blob_order.size(); i++) {
+ *text += it_->word()->BestUTF8(blob_order[i], false);
+ }
+ AppendSuffixMarks(text);
+}
+
+void ResultIterator::IterateAndAppendUTF8TextlineText(std::string* text) {
+ if (Empty(RIL_WORD)) {
+ Next(RIL_WORD);
+ return;
+ }
+ if (BidiDebug(1)) {
+ std::vector<int> textline_order;
+ std::vector<StrongScriptDirection> dirs;
+ CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs,
+ &textline_order);
+ tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
+ current_paragraph_is_ltr_ ? "ltr" : "rtl");
+ PrintScriptDirs(dirs);
+ tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
+ current_paragraph_is_ltr_ ? "ltr" : "rtl");
+ for (int i = 0; i < textline_order.size(); i++) {
+ tprintf("%d ", textline_order[i]);
+ }
+ tprintf("\n");
+ }
+
+ int words_appended = 0;
+ do {
+ int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
+ : (words_appended > 0);
+ for (int i = 0; i < numSpaces; ++i) {
+ *text += " ";
+ }
+ AppendUTF8WordText(text);
+ words_appended++;
+ if (BidiDebug(2)) {
+ tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
+ }
+ } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
+ if (BidiDebug(1)) {
+ tprintf("%d words printed\n", words_appended);
+ }
+ *text += line_separator_;
+ // If we just finished a paragraph, add an extra newline.
+ if (IsAtBeginningOf(RIL_PARA)) {
+ *text += paragraph_separator_;
+ }
+}
+
+void ResultIterator::AppendUTF8ParagraphText(std::string* text) const {
+ ResultIterator it(*this);
+ it.RestartParagraph();
+ it.MoveToLogicalStartOfTextline();
+ if (it.Empty(RIL_WORD))
+ return;
+ do {
+ it.IterateAndAppendUTF8TextlineText(text);
+ } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
+}
+
+bool ResultIterator::BidiDebug(int min_level) const {
+ int debug_level = 1;
+ auto* p =
+ ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
+ tesseract_->params()->int_params);
+ if (p != nullptr)
+ debug_level = (int32_t)(*p);
+ return debug_level >= min_level;
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/superscript.cpp b/tesseract/src/ccmain/superscript.cpp
new file mode 100644
index 00000000..02d22451
--- /dev/null
+++ b/tesseract/src/ccmain/superscript.cpp
@@ -0,0 +1,610 @@
+/******************************************************************
+ * File: superscript.cpp
+ * Description: Correction pass to fix superscripts and subscripts.
+ * Author: David Eger
+ *
+ * (C) Copyright 2012, Google, Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "normalis.h"
+#include "tesseractclass.h"
+
+namespace tesseract {
+
+static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
+ int num_chopped = 0;
+ for (int i = 0; i < num_unichars; i++)
+ num_chopped += word->best_state[i];
+ return num_chopped;
+}
+
+static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
+ int num_chopped = 0;
+ for (int i = 0; i < num_unichars; i++)
+ num_chopped += word->best_state[word->best_state.size() - 1 - i];
+ return num_chopped;
+}
+
+/**
+ * Given a recognized blob, see if a contiguous collection of sub-pieces
+ * (chopped blobs) starting at its left might qualify as being a subscript
+ * or superscript letter based only on y position. Also do this for the
+ * right side.
+ */
static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index,
                           int super_y_bottom, int sub_y_top,
                           ScriptPos *leading_pos, int *num_leading_outliers,
                           ScriptPos *trailing_pos,
                           int *num_trailing_outliers) {
  // Callers may pass nullptr for outputs they don't need; redirect those to
  // local scratch so the loop below can assign unconditionally.
  ScriptPos sp_unused1, sp_unused2;
  int unused1, unused2;
  if (!leading_pos) leading_pos = &sp_unused1;
  if (!num_leading_outliers) num_leading_outliers = &unused1;
  if (!trailing_pos) trailing_pos = &sp_unused2;
  if (!num_trailing_outliers) num_trailing_outliers = &unused2;

  *num_leading_outliers = *num_trailing_outliers = 0;
  *leading_pos = *trailing_pos = SP_NORMAL;

  // Map the rebuilt blob to its span of chopped sub-pieces: best_state
  // records how many chopped pieces each rebuilt blob is made of.
  int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
  int num_chopped_pieces = word->best_state[rebuilt_blob_index];
  // Track the current run of same-direction (super or sub) outlier pieces.
  ScriptPos last_pos = SP_NORMAL;
  int trailing_outliers = 0;
  for (int i = 0; i < num_chopped_pieces; i++) {
    TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
    ScriptPos pos = SP_NORMAL;
    if (box.bottom() >= super_y_bottom) {
      pos = SP_SUPERSCRIPT;
    } else if (box.top() <= sub_y_top) {
      pos = SP_SUBSCRIPT;
    }
    if (pos == SP_NORMAL) {
      // A normally-placed piece ends any run. If the run covered every
      // piece so far, it was a leading run of outliers.
      if (trailing_outliers == i) {
        *num_leading_outliers = trailing_outliers;
        *leading_pos = last_pos;
      }
      trailing_outliers = 0;
    } else {
      // Extend the run only while the outlier direction stays the same;
      // a direction flip starts a fresh run of length 1.
      if (pos == last_pos) {
        trailing_outliers++;
      } else {
        trailing_outliers = 1;
      }
    }
    last_pos = pos;
  }
  // Whatever run is still open at the end is the trailing outlier run.
  *num_trailing_outliers = trailing_outliers;
  *trailing_pos = last_pos;
}
+
+/**
+ * Attempt to split off any high (or low) bits at the ends of the word with poor
+ * certainty and recognize them separately. If the certainty gets much better
+ * and other sanity checks pass, accept.
+ *
+ * This superscript fix is meant to be called in the second pass of recognition
+ * when we have tried once and already have a preliminary answer for word.
+ *
+ * @return Whether we modified the given word.
+ */
+bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
+ if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
+ !word->best_choice) {
+ return false;
+ }
+ int num_leading, num_trailing;
+ ScriptPos sp_leading, sp_trailing;
+ float leading_certainty, trailing_certainty;
+ float avg_certainty, unlikely_threshold;
+
+ // Calculate the number of whole suspicious characters at the edges.
+ GetSubAndSuperscriptCandidates(
+ word, &num_leading, &sp_leading, &leading_certainty,
+ &num_trailing, &sp_trailing, &trailing_certainty,
+ &avg_certainty, &unlikely_threshold);
+
+ const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
+ const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
+
+ int num_blobs = word->best_choice->length();
+
+ // Calculate the remainder (partial characters) at the edges.
+ // This accounts for us having classified the best version of
+ // a word as [speaker?'] when it was instead [speaker.^{21}]
+ // (that is we accidentally thought the 2 was attached to the period).
+ int num_remainder_leading = 0, num_remainder_trailing = 0;
+ if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
+ int super_y_bottom =
+ kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
+ int sub_y_top =
+ kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
+ int last_word_char = num_blobs - 1 - num_trailing;
+ float last_char_certainty = word->best_choice->certainty(last_word_char);
+ if (word->best_choice->unichar_id(last_word_char) != 0 &&
+ last_char_certainty <= unlikely_threshold) {
+ ScriptPos rpos;
+ YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
+ nullptr, nullptr, &rpos, &num_remainder_trailing);
+ if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
+ if (num_remainder_trailing > 0 &&
+ last_char_certainty < trailing_certainty) {
+ trailing_certainty = last_char_certainty;
+ }
+ }
+ bool another_blob_available = (num_remainder_trailing == 0) ||
+ num_leading + num_trailing + 1 < num_blobs;
+ int first_char_certainty = word->best_choice->certainty(num_leading);
+ if (another_blob_available &&
+ word->best_choice->unichar_id(num_leading) != 0 &&
+ first_char_certainty <= unlikely_threshold) {
+ ScriptPos lpos;
+ YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
+ &lpos, &num_remainder_leading, nullptr, nullptr);
+ if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
+ if (num_remainder_leading > 0 &&
+ first_char_certainty < leading_certainty) {
+ leading_certainty = first_char_certainty;
+ }
+ }
+ }
+
+ // If nothing to do, bail now.
+ if (num_leading + num_trailing +
+ num_remainder_leading + num_remainder_trailing == 0) {
+ return false;
+ }
+
+ if (superscript_debug >= 1) {
+ tprintf("Candidate for superscript detection: %s (",
+ word->best_choice->unichar_string().c_str());
+ if (num_leading || num_remainder_leading) {
+ tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
+ leading_pos);
+ }
+ if (num_trailing || num_remainder_trailing) {
+ tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
+ trailing_pos);
+ }
+ tprintf(")\n");
+ }
+ if (superscript_debug >= 3) {
+ word->best_choice->print();
+ }
+ if (superscript_debug >= 2) {
+ tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
+ avg_certainty, unlikely_threshold);
+ if (num_leading)
+ tprintf("Orig. leading (min): %.2f ", leading_certainty);
+ if (num_trailing)
+ tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
+ tprintf("\n");
+ }
+
+ // We've now calculated the number of rebuilt blobs we want to carve off.
+ // However, split_word() works from TBLOBs in chopped_word, so we need to
+ // convert to those.
+ int num_chopped_leading =
+ LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
+ int num_chopped_trailing =
+ TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
+
+ int retry_leading = 0;
+ int retry_trailing = 0;
+ bool is_good = false;
+ WERD_RES *revised = TrySuperscriptSplits(
+ num_chopped_leading, leading_certainty, sp_leading,
+ num_chopped_trailing, trailing_certainty, sp_trailing,
+ word, &is_good, &retry_leading, &retry_trailing);
+ if (is_good) {
+ word->ConsumeWordResults(revised);
+ } else if (retry_leading || retry_trailing) {
+ int retry_chopped_leading =
+ LeadingUnicharsToChopped(revised, retry_leading);
+ int retry_chopped_trailing =
+ TrailingUnicharsToChopped(revised, retry_trailing);
+ WERD_RES *revised2 = TrySuperscriptSplits(
+ retry_chopped_leading, leading_certainty, sp_leading,
+ retry_chopped_trailing, trailing_certainty, sp_trailing,
+ revised, &is_good, &retry_leading, &retry_trailing);
+ if (is_good) {
+ word->ConsumeWordResults(revised2);
+ }
+ delete revised2;
+ }
+ delete revised;
+ return is_good;
+}
+
+/**
+ * Determine how many characters (rebuilt blobs) on each end of a given word
+ * might plausibly be superscripts so SubAndSuperscriptFix can try to
+ * re-recognize them. Even if we find no whole blobs at either end,
+ * we will set *unlikely_threshold to a certainty that might be used to
+ * select "bad enough" outlier characters. If *unlikely_threshold is set to 0,
+ * though, there's really no hope.
+ *
+ * @param[in] word The word to examine.
+ * @param[out] num_rebuilt_leading the number of rebuilt blobs at the start
+ * of the word which are all up or down and
+ * seem badly classified.
+ * @param[out] leading_pos "super" or "sub" (for debugging)
+ * @param[out] leading_certainty the worst certainty in the leading blobs.
+ * @param[out] num_rebuilt_trailing the number of rebuilt blobs at the end
+ * of the word which are all up or down and
+ * seem badly classified.
+ * @param[out] trailing_pos "super" or "sub" (for debugging)
+ * @param[out] trailing_certainty the worst certainty in the trailing blobs.
+ * @param[out] avg_certainty the average certainty of "normal" blobs in
+ * the word.
+ * @param[out] unlikely_threshold the threshold (on certainty) we used to
+ * select "bad enough" outlier characters.
+ */
void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word,
                                               int *num_rebuilt_leading,
                                               ScriptPos *leading_pos,
                                               float *leading_certainty,
                                               int *num_rebuilt_trailing,
                                               ScriptPos *trailing_pos,
                                               float *trailing_certainty,
                                               float *avg_certainty,
                                               float *unlikely_threshold) {
  *avg_certainty = *unlikely_threshold = 0.0f;
  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
  *leading_certainty = *trailing_certainty = 0.0f;

  // y-thresholds (in baseline-normalized coords): a blob whose bottom is at
  // or above super_y_bottom looks super; one whose top is at or below
  // sub_y_top looks sub.
  int super_y_bottom =
      kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
  int sub_y_top =
      kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;

  // Step one: Get an average certainty for "normally placed" characters.

  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
  *leading_pos = *trailing_pos = SP_NORMAL;
  int leading_outliers = 0;
  int trailing_outliers = 0;
  int num_normal = 0;
  float normal_certainty_total = 0.0f;
  float worst_normal_certainty = 0.0f;
  // Run-tracking mirrors YOutlierPieces: trailing_outliers counts the
  // current run of same-direction outliers; a run that reaches back to
  // blob 0 is recorded as the leading run.
  ScriptPos last_pos = SP_NORMAL;
  int num_blobs = word->rebuild_word->NumBlobs();
  for (int b = 0; b < num_blobs; ++b) {
    TBOX box = word->rebuild_word->blobs[b]->bounding_box();
    ScriptPos pos = SP_NORMAL;
    if (box.bottom() >= super_y_bottom) {
      pos = SP_SUPERSCRIPT;
    } else if (box.top() <= sub_y_top) {
      pos = SP_SUBSCRIPT;
    }
    if (pos == SP_NORMAL) {
      // unichar_id 0 is skipped for certainty accounting.
      if (word->best_choice->unichar_id(b) != 0) {
        float char_certainty = word->best_choice->certainty(b);
        if (char_certainty < worst_normal_certainty) {
          worst_normal_certainty = char_certainty;
        }
        num_normal++;
        normal_certainty_total += char_certainty;
      }
      if (trailing_outliers == b) {
        leading_outliers = trailing_outliers;
        *leading_pos = last_pos;
      }
      trailing_outliers = 0;
    } else {
      if (last_pos == pos) {
        trailing_outliers++;
      } else {
        trailing_outliers = 1;
      }
    }
    last_pos = pos;
  }
  *trailing_pos = last_pos;
  if (num_normal >= 3) {  // throw out the worst as an outlier.
    num_normal--;
    normal_certainty_total -= worst_normal_certainty;
  }
  if (num_normal > 0) {
    *avg_certainty = normal_certainty_total / num_normal;
    // Certainties are negative, so this threshold is below (worse than)
    // the average by the superscript_worse_certainty factor.
    *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
  }
  if (num_normal == 0 ||
      (leading_outliers == 0 && trailing_outliers == 0)) {
    return;
  }

  // Step two: Try to split off bits of the word that are both outliers
  //           and have much lower certainty than average
  // Calculate num_leading and leading_certainty.
  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
       *num_rebuilt_leading < leading_outliers;
       (*num_rebuilt_leading)++) {
    float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
    if (char_certainty > *unlikely_threshold) {
      break;
    }
    // Track the worst (minimum) certainty over the accepted leading run.
    if (char_certainty < *leading_certainty) {
      *leading_certainty = char_certainty;
    }
  }

  // Calculate num_trailing and trailing_certainty.
  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
       *num_rebuilt_trailing < trailing_outliers;
       (*num_rebuilt_trailing)++) {
    int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
    float char_certainty = word->best_choice->certainty(blob_idx);
    if (char_certainty > *unlikely_threshold) {
      break;
    }
    if (char_certainty < *trailing_certainty) {
      *trailing_certainty = char_certainty;
    }
  }
}
+
+
+/**
+ * Try splitting off the given number of (chopped) blobs from the front and
+ * back of the given word and recognizing the pieces.
+ *
+ * @param[in] num_chopped_leading how many chopped blobs from the left
+ * end of the word to chop off and try recognizing as a
+ * superscript (or subscript)
+ * @param[in] leading_certainty the (minimum) certainty had by the
+ * characters in the original leading section.
+ * @param[in] leading_pos "super" or "sub" (for debugging)
+ * @param[in] num_chopped_trailing how many chopped blobs from the right
+ * end of the word to chop off and try recognizing as a
+ * superscript (or subscript)
+ * @param[in] trailing_certainty the (minimum) certainty had by the
+ * characters in the original trailing section.
+ * @param[in] trailing_pos "super" or "sub" (for debugging)
+ * @param[in] word the word to try to chop up.
+ * @param[out] is_good do we believe our result?
+ * @param[out] retry_rebuild_leading, retry_rebuild_trailing
+ * If non-zero, and !is_good, then the caller may have luck trying
+ * to split the returned word with this number of (rebuilt) leading
+ * and trailing blobs / unichars.
+ * @return A word which is the result of re-recognizing as asked.
+ */
WERD_RES *Tesseract::TrySuperscriptSplits(
    int num_chopped_leading, float leading_certainty, ScriptPos leading_pos,
    int num_chopped_trailing, float trailing_certainty,
    ScriptPos trailing_pos,
    WERD_RES *word,
    bool *is_good,
    int *retry_rebuild_leading, int *retry_rebuild_trailing) {
  int num_chopped = word->chopped_word->NumBlobs();

  *retry_rebuild_leading = *retry_rebuild_trailing = 0;

  // Chop apart the word into up to three pieces.
  // bb0/bb1 hold blamer bundles saved by split_word() so join_words() can
  // restore them when the pieces are reassembled below.

  BlamerBundle *bb0 = nullptr;
  BlamerBundle *bb1 = nullptr;
  WERD_RES *prefix = nullptr;
  WERD_RES *core = nullptr;
  WERD_RES *suffix = nullptr;
  if (num_chopped_leading > 0) {
    prefix = new WERD_RES(*word);
    split_word(prefix, num_chopped_leading, &core, &bb0);
  } else {
    core = new WERD_RES(*word);
  }

  if (num_chopped_trailing > 0) {
    int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
    split_word(core, split_pt, &suffix, &bb1);
  }

  // Recognize the pieces in turn.
  // Save the pruner/matcher multipliers so the y-position penalties can be
  // disabled around the prefix/suffix recognitions and restored afterwards.
  int saved_cp_multiplier = classify_class_pruner_multiplier;
  int saved_im_multiplier = classify_integer_matcher_multiplier;
  if (prefix) {
    // Turn off Tesseract's y-position penalties for the leading superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    // Adjust our expectations about the baseline for this prefix.
    if (superscript_debug >= 3) {
      tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
    }
    recog_word_recursive(prefix);
    if (superscript_debug >= 2) {
      tprintf(" The leading bits look like %s %s\n",
              ScriptPosToString(leading_pos),
              prefix->best_choice->unichar_string().c_str());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  if (superscript_debug >= 3) {
    tprintf(" recognizing middle %d chopped blobs\n",
            num_chopped - num_chopped_leading - num_chopped_trailing);
  }

  if (suffix) {
    // Turn off Tesseract's y-position penalties for the trailing superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    if (superscript_debug >= 3) {
      tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
    }
    recog_word_recursive(suffix);
    if (superscript_debug >= 2) {
      tprintf(" The trailing bits look like %s %s\n",
              ScriptPosToString(trailing_pos),
              suffix->best_choice->unichar_string().c_str());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  // Evaluate whether we think the results are believably better
  // than what we already had.
  bool good_prefix = !prefix || BelievableSuperscript(
      superscript_debug >= 1, *prefix,
      superscript_bettered_certainty * leading_certainty,
      retry_rebuild_leading, nullptr);
  bool good_suffix = !suffix || BelievableSuperscript(
      superscript_debug >= 1, *suffix,
      superscript_bettered_certainty * trailing_certainty,
      nullptr, retry_rebuild_trailing);

  *is_good = good_prefix && good_suffix;
  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
    // None of it is any good. Quit now.
    // NOTE(review): bb0 is not freed on this path (only bb1 is) — looks
    // like a possible leak when a prefix split was made; confirm against
    // split_word()'s ownership contract before changing.
    delete core;
    delete prefix;
    delete suffix;
    delete bb1;
    return nullptr;
  }
  // The core is only recognized once the edge pieces look worth keeping,
  // so a rejected split skips this (relatively expensive) call.
  recog_word_recursive(core);

  // Now paste the results together into core.
  if (suffix) {
    suffix->SetAllScriptPositions(trailing_pos);
    join_words(core, suffix, bb1);
  }
  if (prefix) {
    prefix->SetAllScriptPositions(leading_pos);
    join_words(prefix, core, bb0);
    core = prefix;
    prefix = nullptr;
  }

  if (superscript_debug >= 1) {
    tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
            core->best_choice->unichar_string().c_str());
  }
  return core;
}
+
+
+/**
+ * Return whether this is believable superscript or subscript text.
+ *
+ * We insist that:
+ * + there are no punctuation marks.
+ * + there are no italics.
+ * + no normal-sized character is smaller than superscript_scaledown_ratio
+ * of what it ought to be, and
+ * + each character is at least as certain as certainty_threshold.
+ *
+ * @param[in] debug If true, spew debug output
+ * @param[in] word The word whose best_choice we're evaluating
+ * @param[in] certainty_threshold If any of the characters have less
+ * certainty than this, reject.
+ * @param[out] left_ok How many left-side characters were ok?
+ * @param[out] right_ok How many right-side characters were ok?
+ * @return Whether the complete best choice is believable as a superscript.
+ */
bool Tesseract::BelievableSuperscript(bool debug,
                                      const WERD_RES &word,
                                      float certainty_threshold,
                                      int *left_ok,
                                      int *right_ok) const {
  // Run accounting: ok_run_count is the current run of acceptable chars;
  // a run reaching back to char 0 is remembered as initial_ok_run_count.
  int initial_ok_run_count = 0;
  int ok_run_count = 0;
  float worst_certainty = 0.0f;
  const WERD_CHOICE &wc = *word.best_choice;

  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
  for (int i = 0; i < wc.length(); i++) {
    TBLOB *blob = word.rebuild_word->blobs[i];
    UNICHAR_ID unichar_id = wc.unichar_id(i);
    float char_certainty = wc.certainty(i);
    bool bad_certainty = char_certainty < certainty_threshold;
    bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
    // Word-level font info is the fallback for the italic test.
    bool is_italic = word.fontinfo && word.fontinfo->is_italic();
    BLOB_CHOICE *choice = word.GetBlobChoice(i);
    if (choice && fontinfo_table.size() > 0) {
      // Get better information from the specific choice, if available.
      // Italic only if the primary font is italic and the secondary font
      // (when present) agrees.
      int font_id1 = choice->fontinfo_id();
      bool font1_is_italic = font_id1 >= 0
          ? fontinfo_table.get(font_id1).is_italic() : false;
      int font_id2 = choice->fontinfo_id2();
      is_italic = font1_is_italic &&
          (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
    }

    float height_fraction = 1.0f;
    float char_height = blob->bounding_box().height();
    float normal_height = char_height;
    if (wc.unicharset()->top_bottom_useful()) {
      // Expected height is the mean of the unichar's min and max extents
      // from the unicharset's top/bottom statistics.
      int min_bot, max_bot, min_top, max_top;
      wc.unicharset()->get_top_bottom(unichar_id,
                                      &min_bot, &max_bot,
                                      &min_top, &max_top);
      float hi_height = max_top - max_bot;
      float lo_height = min_top - min_bot;
      normal_height = (hi_height + lo_height) / 2;
      if (normal_height >= kBlnXHeight) {
        // Only ding characters that we have decent information for because
        // they're supposed to be normal sized, not tiny specks or dashes.
        height_fraction = char_height / normal_height;
      }
    }
    bool bad_height = height_fraction < superscript_scaledown_ratio;

    if (debug) {
      if (is_italic) {
        tprintf(" Rejecting: superscript is italic.\n");
      }
      if (is_punc) {
        tprintf(" Rejecting: punctuation present.\n");
      }
      const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
      if (bad_certainty) {
        tprintf(" Rejecting: don't believe character %s with certainty %.2f "
                "which is less than threshold %.2f\n", char_str,
                char_certainty, certainty_threshold);
      }
      if (bad_height) {
        tprintf(" Rejecting: character %s seems too small @ %.2f versus "
                "expected %.2f\n", char_str, char_height, normal_height);
      }
    }
    if (bad_certainty || bad_height || is_punc || is_italic) {
      // A bad char ends the run; if it covered every char so far, it was
      // the initial (left-side) run.
      if (ok_run_count == i) {
        initial_ok_run_count = ok_run_count;
      }
      ok_run_count = 0;
    } else {
      ok_run_count++;
    }
    if (char_certainty < worst_certainty) {
      worst_certainty = char_certainty;
    }
  }
  // All chars were acceptable only if the final run spans the whole word.
  bool all_ok = ok_run_count == wc.length();
  if (all_ok && debug) {
    tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
  }
  if (!all_ok) {
    if (left_ok) *left_ok = initial_ok_run_count;
    if (right_ok) *right_ok = ok_run_count;
  }
  return all_ok;
}
+
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/tessbox.cpp b/tesseract/src/ccmain/tessbox.cpp
new file mode 100644
index 00000000..80c5a9ad
--- /dev/null
+++ b/tesseract/src/ccmain/tessbox.cpp
@@ -0,0 +1,75 @@
+/**********************************************************************
+ * File: tessbox.cpp (Formerly tessbox.c)
+ * Description: Black boxed Tess for developing a resaljet.
+ * Author: Ray Smith
+ * Created: Thu Apr 23 11:03:36 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "mfoutline.h"
+#include "tesseractclass.h"
+
+/**
+ * @name tess_segment_pass_n
+ *
+ * Segment a word using the pass_n conditions of the tess segmenter.
+ * @param pass_n pass number
+ * @param word word to do
+ */
+
+namespace tesseract {
+void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
+ int saved_enable_assoc = 0;
+ int saved_chop_enable = 0;
+
+ if (word->word->flag(W_DONT_CHOP)) {
+ saved_enable_assoc = wordrec_enable_assoc;
+ saved_chop_enable = chop_enable;
+ wordrec_enable_assoc.set_value(0);
+ chop_enable.set_value(0);
+ }
+ if (pass_n == 1)
+ set_pass1();
+ else
+ set_pass2();
+ recog_word(word);
+ if (word->best_choice == nullptr)
+ word->SetupFake(*word->uch_set);
+ if (word->word->flag(W_DONT_CHOP)) {
+ wordrec_enable_assoc.set_value(saved_enable_assoc);
+ chop_enable.set_value(saved_chop_enable);
+ }
+}
+
/**
 * @name tess_acceptable_word
 *
 * @return true if the word is regarded as "good enough".
 * @param word the recognition result to judge; its best choice is tested
 *             against the dictionary's acceptability criteria.
 */
bool Tesseract::tess_acceptable_word(WERD_RES* word) {
  return getDict().AcceptableResult(word);
}
+
+
/**
 * @name tess_add_doc_word
 *
 * Add the given word to the document dictionary
 * @param word_choice the accepted word choice to remember for this document
 */
void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
  getDict().add_document_word(*word_choice);
}
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/tessedit.cpp b/tesseract/src/ccmain/tessedit.cpp
new file mode 100644
index 00000000..15b433f1
--- /dev/null
+++ b/tesseract/src/ccmain/tessedit.cpp
@@ -0,0 +1,474 @@
+/**********************************************************************
+ * File: tessedit.cpp (Formerly tessedit.c)
+ * Description: (Previously) Main program for merge of tess and editor.
+ * Now just code to load the language model and various
+ * engine-specific data files.
+ * Author: Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+# include "config_auto.h"
+#endif
+
+#include "control.h"
+# include "matchdefs.h"
+#include "pageres.h"
+#include "params.h"
+#include "stopper.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#include "tprintf.h"
+#ifndef DISABLED_LEGACY_ENGINE
+# include "chop.h"
+# include "intmatcher.h"
+# include "reject.h"
+#endif
+#include "lstmrecognizer.h"
+
+namespace tesseract {
+
+// Read a "config" file containing a set of variable, value pairs.
+// Searches the standard places: tessdata/configs, tessdata/tessconfigs
+// and also accepts a relative or absolute path name.
+void Tesseract::read_config_file(const char* filename,
+ SetParamConstraint constraint) {
+ std::string path = datadir;
+ path += "configs/";
+ path += filename;
+ FILE* fp;
+ if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
+ fclose(fp);
+ } else {
+ path = datadir;
+ path += "tessconfigs/";
+ path += filename;
+ if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
+ fclose(fp);
+ } else {
+ path = filename;
+ }
+ }
+ ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
+}
+
+// Returns false if a unicharset file for the specified language was not found
+// or was invalid.
+// This function initializes TessdataManager. After TessdataManager is
+// no longer needed, TessdataManager::End() should be called.
+//
+// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
+// it is OEM_DEFAULT, in which case the value of the variable will be obtained
+// from the language-specific config file (stored in [lang].traineddata), from
+// the config files specified on the command line or left as the default
+// OEM_TESSERACT_ONLY if none of the configs specify this variable.
bool Tesseract::init_tesseract_lang_data(
    const char* arg0, const char* textbase, const char* language,
    OcrEngineMode oem, char** configs, int configs_size,
    const std::vector<std::string>* vars_vec,
    const std::vector<std::string>* vars_values, bool set_only_non_debug_params,
    TessdataManager* mgr) {
  // Set the basename, compute the data directory.
  main_setup(arg0, textbase);

  // Set the language data path prefix; default to English when no language
  // was given.
  lang = language != nullptr ? language : "eng";
  language_data_path_prefix = datadir;
  language_data_path_prefix += lang;
  language_data_path_prefix += ".";

  // Initialize TessdataManager.
  std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
    tprintf("Error opening data file %s\n", tessdata_path.c_str());
    tprintf(
        "Please make sure the TESSDATA_PREFIX environment variable is set"
        " to your \"tessdata\" directory.\n");
    return false;
  }
#ifdef DISABLED_LEGACY_ENGINE
  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
#else
  if (oem == OEM_DEFAULT) {
    // Set the engine mode from availability, which can then be overridden by
    // the config file when we read it below.
    if (!mgr->IsLSTMAvailable()) {
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
    } else if (!mgr->IsBaseAvailable()) {
      tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
    } else {
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  // If a language specific config file (lang.config) exists, load it in.
  TFile fp;
  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
    ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
                                 this->params());
  }

  SetParamConstraint set_params_constraint =
      set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
                                : SET_PARAM_CONSTRAINT_NONE;
  // Load tesseract variables from config files. This is done after loading
  // language-specific variables from [lang].traineddata file, so that custom
  // config files can override values in [lang].traineddata file.
  for (int i = 0; i < configs_size; ++i) {
    read_config_file(configs[i], set_params_constraint);
  }

  // Set params specified in vars_vec (done after setting params from config
  // files, so that params in vars_vec can override those from files).
  if (vars_vec != nullptr && vars_values != nullptr) {
    for (int i = 0; i < vars_vec->size(); ++i) {
      if (!ParamUtils::SetParam((*vars_vec)[i].c_str(),
                                (*vars_values)[i].c_str(),
                                set_params_constraint, this->params())) {
        tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
      }
    }
  }

  // Optionally dump the final parameter values for debugging/reproduction.
  if (!tessedit_write_params_to_file.empty()) {
    FILE* params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
    if (params_file != nullptr) {
      ParamUtils::PrintParams(params_file, this->params());
      fclose(params_file);
    } else {
      tprintf("Failed to open %s for writing params.\n",
              tessedit_write_params_to_file.c_str());
    }
  }

#ifndef DISABLED_LEGACY_ENGINE
  // Determine which ocr engine(s) should be loaded and used for recognition.
  // An explicit oem always wins over whatever the config files chose.
  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
#endif

  // If we are only loading the config file (and so not planning on doing any
  // recognition) then there's nothing else do here.
  if (tessedit_init_config_only) {
    return true;
  }

// The various OcrEngineMode settings (see tesseract/publictypes.h) determine which
// engine-specific data files need to be loaded.
// If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
#ifdef DISABLED_LEGACY_ENGINE
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif // ndef DISABLED_LEGACY_ENGINE
    if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
      lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
      // Load() failure on a present component is fatal by design.
      ASSERT_HOST(lstm_recognizer_->Load(
          this->params(), lstm_use_matrix ? language : nullptr, mgr));
    } else {
      // Fall back to the legacy engine when the LSTM component is missing.
      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
    }
  }

  // Load the unicharset
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
    // Avoid requiring a unicharset when we aren't running base tesseract.
    unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
  }
#ifndef DISABLED_LEGACY_ENGINE
  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
           !unicharset.load_from_file(&fp, false)) {
    tprintf("Error: Tesseract (legacy) engine requested, but components are "
            "not present in %s!!\n", tessdata_path.c_str());
    return false;
  }
#endif // ndef DISABLED_LEGACY_ENGINE
  if (unicharset.size() > MAX_NUM_CLASSES) {
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
    return false;
  }
  right_to_left_ = unicharset.major_right_to_left();

#ifndef DISABLED_LEGACY_ENGINE

  // Setup initial unichar ambigs table and read universal ambigs.
  // encoder_unicharset is copied before the ambig loaders are given a
  // mutable pointer to unicharset (they receive &unicharset below).
  UNICHARSET encoder_unicharset;
  encoder_unicharset.CopyFrom(unicharset);
  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);

  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
    unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
                                     ambigs_debug_level,
                                     use_ambigs_for_adaption, &unicharset);
  }

  // Init ParamsModel.
  // Load pass1 and pass2 weights (for now these two sets are the same, but in
  // the future separate sets of weights can be generated).
  for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES;
       ++p) {
    language_model_->getParamsModel().SetPass(
        static_cast<ParamsModel::PassEnum>(p));
    if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
      if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
        return false;
      }
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  return true;
}
+
// Helper returns true if the given string is in the vector of strings.
static bool IsStrInList(const std::string& str,
                        const std::vector<std::string>& str_list) {
  // Range-for avoids the signed/unsigned mismatch of comparing an int
  // index against the unsigned value returned by size().
  for (const auto& s : str_list) {
    if (s == str) return true;
  }
  return false;
}
+
+// Parse a string of the form [~]<lang>[+[~]<lang>]*.
+// Langs with no prefix get appended to to_load, provided they
+// are not in there already.
+// Langs with ~ prefix get appended to not_to_load, provided they are not in
+// there already.
+void Tesseract::ParseLanguageString(const char* lang_str,
+ std::vector<std::string>* to_load,
+ std::vector<std::string>* not_to_load) {
+ std::string remains(lang_str);
+ while (!remains.empty()) {
+ // Find the start of the lang code and which vector to add to.
+ const char* start = remains.c_str();
+ while (*start == '+') ++start;
+ std::vector<std::string>* target = to_load;
+ if (*start == '~') {
+ target = not_to_load;
+ ++start;
+ }
+ // Find the index of the end of the lang code in string start.
+ int end = strlen(start);
+ const char* plus = strchr(start, '+');
+ if (plus != nullptr && plus - start < end) end = plus - start;
+ std::string lang_code(start);
+ lang_code.resize(end);
+ std::string next(start + end);
+ remains = next;
+ // Check whether lang_code is already in the target vector and add.
+ if (!IsStrInList(lang_code, *target)) {
+ target->push_back(lang_code);
+ }
+ }
+}
+
// Initialize for potentially a set of languages defined by the language
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
// Returns 0 on success, -1 if no language at all could be loaded.
int Tesseract::init_tesseract(const char* arg0, const char* textbase,
                              const char* language, OcrEngineMode oem,
                              char** configs, int configs_size,
                              const std::vector<std::string>* vars_vec,
                              const std::vector<std::string>* vars_values,
                              bool set_only_non_debug_params,
                              TessdataManager* mgr) {
  std::vector<std::string> langs_to_load;
  std::vector<std::string> langs_not_to_load;
  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);

  // Discard sub-language instances from any previous initialization;
  // sub_langs_ owns its elements.
  for (auto* lang : sub_langs_) {
    delete lang;
  }
  sub_langs_.clear();
  // Find the first loadable lang and load into this.
  // Add any languages that this language requires
  bool loaded_primary = false;
  // Load the rest into sub_langs_.
  // NOTE: langs_to_load can grow inside this loop (ParseLanguageString below
  // appends each language's required sub-languages), so an index-based loop
  // that re-reads size() every iteration is required here.
  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
    if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
      const char* lang_str = langs_to_load[lang_index].c_str();
      // The first language that loads successfully initializes *this;
      // every later one is loaded into a freshly allocated sub-Tesseract.
      Tesseract* tess_to_init;
      if (!loaded_primary) {
        tess_to_init = this;
      } else {
        tess_to_init = new Tesseract;
      }

      int result = tess_to_init->init_tesseract_internal(
          arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
          vars_values, set_only_non_debug_params, mgr);
      // Forget that language, but keep any reader we were given.
      mgr->Clear();

      if (!loaded_primary) {
        if (result < 0) {
          tprintf("Failed loading language '%s'\n", lang_str);
        } else {
          // Queue any sub-languages requested by this language's config.
          ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(),
                              &langs_to_load, &langs_not_to_load);
          loaded_primary = true;
        }
      } else {
        if (result < 0) {
          tprintf("Failed loading language '%s'\n", lang_str);
          // Failed sub-language: free the instance we allocated above.
          delete tess_to_init;
        } else {
          // sub_langs_ takes ownership of the new instance.
          sub_langs_.push_back(tess_to_init);
          // Add any languages that this language requires
          ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(),
                              &langs_to_load, &langs_not_to_load);
        }
      }
    }
  }
  if (!loaded_primary) {
    tprintf("Tesseract couldn't load any languages!\n");
    return -1;  // Couldn't load any language!
  }
#ifndef DISABLED_LEGACY_ENGINE
  if (!sub_langs_.empty()) {
    // In multilingual mode word ratings have to be directly comparable,
    // so use the same language model weights for all languages:
    // use the primary language's params model if
    // tessedit_use_primary_params_model is set,
    // otherwise use default language model weights.
    if (tessedit_use_primary_params_model) {
      for (int s = 0; s < sub_langs_.size(); ++s) {
        sub_langs_[s]->language_model_->getParamsModel().Copy(
            this->language_model_->getParamsModel());
      }
      tprintf("Using params model of the primary language\n");
    } else {
      this->language_model_->getParamsModel().Clear();
      for (int s = 0; s < sub_langs_.size(); ++s) {
        sub_langs_[s]->language_model_->getParamsModel().Clear();
      }
    }
  }

  SetupUniversalFontIds();
#endif  // ndef DISABLED_LEGACY_ENGINE
  return 0;
}
+
+// Common initialization for a single language.
+// arg0 is the datapath for the tessdata directory, which could be the
+// path of the tessdata directory with no trailing /, or (if tessdata
+// lives in the same directory as the executable, the path of the executable,
+// hence the name arg0.
+// textbase is an optional output file basename (used only for training)
+// language is the language code to load.
+// oem controls which engine(s) will operate on the image
+// configs (argv) is an array of config filenames to load variables from.
+// May be nullptr.
+// configs_size (argc) is the number of elements in configs.
+// vars_vec is an optional vector of variables to set.
+// vars_values is an optional corresponding vector of values for the variables
+// in vars_vec.
+// If set_only_init_params is true, then only the initialization variables
+// will be set.
+int Tesseract::init_tesseract_internal(const char* arg0, const char* textbase,
+ const char* language, OcrEngineMode oem,
+ char** configs, int configs_size,
+ const std::vector<std::string>* vars_vec,
+ const std::vector<std::string>* vars_values,
+ bool set_only_non_debug_params,
+ TessdataManager* mgr) {
+ if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
+ configs_size, vars_vec, vars_values,
+ set_only_non_debug_params, mgr)) {
+ return -1;
+ }
+ if (tessedit_init_config_only) {
+ return 0;
+ }
+ // If only LSTM will be used, skip loading Tesseract classifier's
+ // pre-trained templates and dictionary.
+ bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
+ program_editup(textbase, init_tesseract ? mgr : nullptr,
+ init_tesseract ? mgr : nullptr);
+ return 0; // Normal exit
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Helper builds the all_fonts table by adding new fonts from new_fonts.
+static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
+ UnicityTable<FontInfo>* all_fonts) {
+ for (int i = 0; i < new_fonts.size(); ++i) {
+ // UnicityTable uniques as we go.
+ all_fonts->push_back(new_fonts.get(i));
+ }
+}
+
+// Helper assigns an id to lang_fonts using the index in all_fonts table.
+static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
+ UnicityTable<FontInfo>* lang_fonts) {
+ for (int i = 0; i < lang_fonts->size(); ++i) {
+ int index = all_fonts.get_id(lang_fonts->get(i));
+ lang_fonts->get_mutable(i)->universal_id = index;
+ }
+}
+
+// Set the universal_id member of each font to be unique among all
+// instances of the same font loaded.
+void Tesseract::SetupUniversalFontIds() {
+ // Note that we can get away with bitwise copying FontInfo in
+ // all_fonts, as it is a temporary structure and we avoid setting the
+ // delete callback.
+ UnicityTable<FontInfo> all_fonts;
+
+ // Create the universal ID table.
+ CollectFonts(get_fontinfo_table(), &all_fonts);
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
+ }
+ // Assign ids from the table to each font table.
+ AssignIds(all_fonts, &get_fontinfo_table());
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
+ }
+ font_table_size_ = all_fonts.size();
+}
+
+// init the LM component
+int Tesseract::init_tesseract_lm(const char* arg0, const char* textbase,
+ const char* language, TessdataManager* mgr) {
+ if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
+ nullptr, 0, nullptr, nullptr, false, mgr))
+ return -1;
+ getDict().SetupForLoad(Dict::GlobalDawgCache());
+ getDict().Load(lang, mgr);
+ getDict().FinishLoad();
+ return 0;
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+void Tesseract::end_tesseract() { end_recog(); }
+
/* Define command type identifiers */

// Command identifiers for menu/command dispatch.
// NOTE(review): presumably consumed by the interactive editor (pgedit)
// command loop — confirm against the consumer of these values.
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT,
  RECOG_WERDS,
  RECOG_PSEUDO,
  ACTION_2_CMD_EVENT
};
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/tesseractclass.cpp b/tesseract/src/ccmain/tesseractclass.cpp
new file mode 100644
index 00000000..fdd88c52
--- /dev/null
+++ b/tesseract/src/ccmain/tesseractclass.cpp
@@ -0,0 +1,707 @@
+///////////////////////////////////////////////////////////////////////
+// File: tesseractclass.cpp
+// Description: The Tesseract class. It holds/owns everything needed
+// to run Tesseract on a single language, and also a set of
+// sub-Tesseracts to run sub-languages. For thread safety, *every*
+// variable that was previously global or static (except for
+// constant data, and some visual debugging flags) has been moved
+// in here, directly, or indirectly.
+// This makes it safe to run multiple Tesseracts in different
+// threads in parallel, and keeps the different language
+// instances separate.
+// Some global functions remain, but they are isolated re-entrant
+// functions that operate on their arguments. Functions that work
+// on variable data have been moved to an appropriate class based
+// mostly on the directory hierarchy. For more information see
+// slide 6 of "2ArchitectureAndDataStructures" in
+// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
+// Some global data and related functions still exist in the
+// training-related code, but they don't interfere with normal
+// recognition operation.
+// Author: Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "tesseractclass.h"
+
+#include "allheaders.h"
+#include "edgblob.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "equationdetect.h"
+#endif
+#include "lstmrecognizer.h"
+
+namespace tesseract {
+
+Tesseract::Tesseract()
+ : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
+ "Take segmentation and labeling from box file",
+ this->params()),
+ BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
+ "Conversion of word/line box file to char box file",
+ this->params()),
+ BOOL_MEMBER(tessedit_train_from_boxes, false,
+ "Generate training data from boxed chars", this->params()),
+ BOOL_MEMBER(tessedit_make_boxes_from_boxes, false,
+ "Generate more boxes from boxed chars", this->params()),
+ BOOL_MEMBER(tessedit_train_line_recognizer, false,
+ "Break input into lines and remap boxes if present",
+ this->params()),
+ BOOL_MEMBER(tessedit_dump_pageseg_images, false,
+ "Dump intermediate images made during page segmentation",
+ this->params()),
+ BOOL_MEMBER(tessedit_do_invert, true,
+ "Try inverting the image in `LSTMRecognizeWord`", this->params()),
+ // The default for pageseg_mode is the old behaviour, so as not to
+ // upset anything that relies on that.
+ INT_MEMBER(
+ tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
+ "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, 4=column,"
+ " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
+ "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
+ " (Values from PageSegMode enum in tesseract/publictypes.h)",
+ this->params()),
+ INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
+ "Which OCR engine(s) to run (Tesseract, LSTM, both)."
+ " Defaults to loading and running the most accurate"
+ " available.",
+ this->params()),
+ STRING_MEMBER(tessedit_char_blacklist, "",
+ "Blacklist of chars not to recognize", this->params()),
+ STRING_MEMBER(tessedit_char_whitelist, "",
+ "Whitelist of chars to recognize", this->params()),
+ STRING_MEMBER(tessedit_char_unblacklist, "",
+ "List of chars to override tessedit_char_blacklist",
+ this->params()),
+ BOOL_MEMBER(tessedit_ambigs_training, false,
+ "Perform training for ambiguities", this->params()),
+ INT_MEMBER(pageseg_devanagari_split_strategy,
+ tesseract::ShiroRekhaSplitter::NO_SPLIT,
+ "Whether to use the top-line splitting process for Devanagari "
+ "documents while performing page-segmentation.",
+ this->params()),
+ INT_MEMBER(ocr_devanagari_split_strategy,
+ tesseract::ShiroRekhaSplitter::NO_SPLIT,
+ "Whether to use the top-line splitting process for Devanagari "
+ "documents while performing ocr.",
+ this->params()),
+ STRING_MEMBER(tessedit_write_params_to_file, "",
+ "Write all parameters to the given file.", this->params()),
+ BOOL_MEMBER(tessedit_adaption_debug, false,
+ "Generate and print debug"
+ " information for adaption",
+ this->params()),
+ INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
+ INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
+ INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
+ this->params()),
+ STRING_MEMBER(applybox_exposure_pattern, ".exp",
+ "Exposure value follows"
+ " this pattern in the image filename. The name of the image"
+ " files are expected to be in the form"
+ " [lang].[fontname].exp[num].tif",
+ this->params()),
+ BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
+ "Learn both character fragments (as is done in the"
+ " special low exposure mode) as well as unfragmented"
+ " characters.",
+ this->params()),
+ BOOL_MEMBER(applybox_learn_ngrams_mode, false,
+ "Each bounding box"
+ " is assumed to contain ngrams. Only learn the ngrams"
+ " whose outlines overlap horizontally.",
+ this->params()),
+ BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
+ this->params()),
+ BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
+ this->params()),
+ BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
+ this->params()),
+ BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
+ "Try to improve fuzzy spaces", this->params()),
+ BOOL_MEMBER(tessedit_unrej_any_wd, false,
+ "Don't bother with word plausibility", this->params()),
+ BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
+ this->params()),
+ BOOL_MEMBER(tessedit_enable_doc_dict, true,
+ "Add words to the document dictionary", this->params()),
+ BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
+ this->params()),
+ BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
+ this->params()),
+ BOOL_MEMBER(tessedit_enable_bigram_correction, true,
+ "Enable correction based on the word bigram dictionary.",
+ this->params()),
+ BOOL_MEMBER(tessedit_enable_dict_correction, false,
+ "Enable single word correction based on the dictionary.",
+ this->params()),
+ INT_MEMBER(tessedit_bigram_debug, 0,
+ "Amount of debug output for bigram correction.",
+ this->params()),
+ BOOL_MEMBER(enable_noise_removal, true,
+ "Remove and conditionally reassign small outlines when they"
+ " confuse layout analysis, determining diacritics vs noise",
+ this->params()),
+ INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
+ this->params()),
+ // Worst (min) certainty, for which a diacritic is allowed to make the
+ // base
+ // character worse and still be included.
+ double_MEMBER(noise_cert_basechar, -8.0,
+ "Hingepoint for base char certainty", this->params()),
+ // Worst (min) certainty, for which a non-overlapping diacritic is allowed
+ // to make the base character worse and still be included.
+ double_MEMBER(noise_cert_disjoint, -1.0,
+ "Hingepoint for disjoint certainty", this->params()),
+ // Worst (min) certainty, for which a diacritic is allowed to make a new
+ // stand-alone blob.
+ double_MEMBER(noise_cert_punc, -3.0,
+ "Threshold for new punc char certainty", this->params()),
+ // Factor of certainty margin for adding diacritics to not count as worse.
+ double_MEMBER(noise_cert_factor, 0.375,
+ "Scaling on certainty diff from Hingepoint",
+ this->params()),
+ INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
+ this->params()),
+ INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
+ this->params()),
+ INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
+ STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
+ this->params()),
+ STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
+ this->params()),
+ STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
+ this->params()),
+ double_MEMBER(quality_rej_pc, 0.08,
+ "good_quality_doc lte rejection limit", this->params()),
+ double_MEMBER(quality_blob_pc, 0.0,
+ "good_quality_doc gte good blobs limit", this->params()),
+ double_MEMBER(quality_outline_pc, 1.0,
+ "good_quality_doc lte outline error limit", this->params()),
+ double_MEMBER(quality_char_pc, 0.95,
+ "good_quality_doc gte good char limit", this->params()),
+ INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
+ this->params()),
+ INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
+ "Adaptation decision algorithm for tess", this->params()),
+ BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
+ "Do minimal rejection on pass 1 output", this->params()),
+ BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
+ this->params()),
+ BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
+ double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
+ double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
+ INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
+ this->params()),
+ INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
+ this->params()),
+ BOOL_MEMBER(paragraph_text_based, true,
+ "Run paragraph detection on the post-text-recognition "
+ "(more accurate)",
+ this->params()),
+ BOOL_MEMBER(lstm_use_matrix, 1,
+ "Use ratings matrix/beam search with lstm", this->params()),
+ STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
+ this->params()),
+ STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
+ this->params()),
+ BOOL_MEMBER(tessedit_good_quality_unrej, true,
+ "Reduce rejection on good docs", this->params()),
+ BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
+ this->params()),
+ double_MEMBER(tessedit_reject_doc_percent, 65.00,
+ "%rej allowed before rej whole doc", this->params()),
+ double_MEMBER(tessedit_reject_block_percent, 45.00,
+ "%rej allowed before rej whole block", this->params()),
+ double_MEMBER(tessedit_reject_row_percent, 40.00,
+ "%rej allowed before rej whole row", this->params()),
+ double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
+ "Number of row rejects in whole word rejects"
+ " which prevents whole row rejection",
+ this->params()),
+ BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
+ "Only rej partially rejected words in block rejection",
+ this->params()),
+ BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
+ "Only rej partially rejected words in row rejection",
+ this->params()),
+ BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false,
+ "Use word segmentation quality metric", this->params()),
+ BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false,
+ "Use word segmentation quality metric", this->params()),
+ INT_MEMBER(tessedit_preserve_min_wd_len, 2,
+ "Only preserve wds longer than this", this->params()),
+ BOOL_MEMBER(tessedit_row_rej_good_docs, true,
+ "Apply row rejection to good docs", this->params()),
+ double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
+ "rej good doc wd if more than this fraction rejected",
+ this->params()),
+ BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
+ "Reject all bad quality wds", this->params()),
+ BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
+ this->params()),
+ BOOL_MEMBER(tessedit_debug_quality_metrics, false,
+ "Output data to debug file", this->params()),
+ BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
+ this->params()),
+ double_MEMBER(quality_rowrej_pc, 1.1,
+ "good_quality_doc gte good char limit", this->params()),
+ BOOL_MEMBER(unlv_tilde_crunching, false,
+ "Mark v.bad words for tilde crunch", this->params()),
+ BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
+ this->params()),
+ BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
+ this->params()),
+ BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
+ this->params()),
+ BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
+ "Take out ~^ early?", this->params()),
+ double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
+ this->params()),
+ BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
+ double_MEMBER(crunch_poor_garbage_cert, -9.0,
+ "crunch garbage cert lt this", this->params()),
+ double_MEMBER(crunch_poor_garbage_rate, 60,
+ "crunch garbage rating lt this", this->params()),
+ double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
+ this->params()),
+ double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
+ this->params()),
+ double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
+ this->params()),
+ double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
+ this->params()),
+ double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
+ this->params()),
+ double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
+ this->params()),
+ double_MEMBER(crunch_del_min_width, 3.0,
+ "Del if word width lt xht x this", this->params()),
+ double_MEMBER(crunch_del_high_word, 1.5,
+ "Del if word gt xht x this above bl", this->params()),
+ double_MEMBER(crunch_del_low_word, 0.5,
+ "Del if word gt xht x this below bl", this->params()),
+ double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
+ this->params()),
+ INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
+ this->params()),
+ INT_MEMBER(crunch_pot_indicators, 1,
+ "How many potential indicators needed", this->params()),
+ BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
+ this->params()),
+ BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
+ this->params()),
+ BOOL_MEMBER(crunch_leave_accept_strings, false,
+ "Don't pot crunch sensible strings", this->params()),
+ BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
+ this->params()),
+ INT_MEMBER(crunch_leave_lc_strings, 4,
+ "Don't crunch words with long lower case strings",
+ this->params()),
+ INT_MEMBER(crunch_leave_uc_strings, 4,
+ "Don't crunch words with long lower case strings",
+ this->params()),
+ INT_MEMBER(crunch_long_repetitions, 3,
+ "Crunch words with long repetitions", this->params()),
+ INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
+ INT_MEMBER(fixsp_non_noise_limit, 1,
+ "How many non-noise blbs either side?", this->params()),
+ double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
+ this->params()),
+ BOOL_MEMBER(tessedit_prefer_joined_punct, false,
+ "Reward punctuation joins", this->params()),
+ INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
+ this->params()),
+ INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
+ this->params()),
+ STRING_MEMBER(numeric_punctuation, ".,",
+ "Punct. chs expected WITHIN numbers", this->params()),
+ INT_MEMBER(x_ht_acceptance_tolerance, 8,
+ "Max allowed deviation of blob top outside of font data",
+ this->params()),
+ INT_MEMBER(x_ht_min_change, 8,
+ "Min change in xht before actually trying it", this->params()),
+ INT_MEMBER(superscript_debug, 0,
+ "Debug level for sub & superscript fixer", this->params()),
+ double_MEMBER(
+ superscript_worse_certainty, 2.0,
+ "How many times worse "
+ "certainty does a superscript position glyph need to be for "
+ "us to try classifying it as a char with a different "
+ "baseline?",
+ this->params()),
+ double_MEMBER(
+ superscript_bettered_certainty, 0.97,
+ "What reduction in "
+ "badness do we think sufficient to choose a superscript "
+ "over what we'd thought. For example, a value of 0.6 means "
+ "we want to reduce badness of certainty by at least 40%",
+ this->params()),
+ double_MEMBER(superscript_scaledown_ratio, 0.4,
+ "A superscript scaled down more than this is unbelievably "
+ "small. For example, 0.3 means we expect the font size to "
+ "be no smaller than 30% of the text line font size.",
+ this->params()),
+ double_MEMBER(subscript_max_y_top, 0.5,
+ "Maximum top of a character measured as a multiple of "
+ "x-height above the baseline for us to reconsider whether "
+ "it's a subscript.",
+ this->params()),
+ double_MEMBER(superscript_min_y_bottom, 0.3,
+ "Minimum bottom of a character measured as a multiple of "
+ "x-height above the baseline for us to reconsider whether "
+ "it's a superscript.",
+ this->params()),
+ BOOL_MEMBER(tessedit_write_block_separators, false,
+ "Write block separators in output", this->params()),
+ BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
+ this->params()),
+ BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
+ this->params()),
+ BOOL_MEMBER(textonly_pdf, false,
+ "Create PDF with only one invisible text layer",
+ this->params()),
+ INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
+ INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
+ this->params()),
+ INT_MEMBER(min_characters_to_try, 50,
+ "Specify minimum characters to try during OSD",
+ this->params()),
+ STRING_MEMBER(unrecognised_char, "|",
+ "Output char for unidentified blobs", this->params()),
+ INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
+ INT_MEMBER(suspect_short_words, 2,
+ "Don't suspect dict wds longer than this", this->params()),
+ BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
+ this->params()),
+ double_MEMBER(suspect_rating_per_ch, 999.9,
+ "Don't touch bad rating limit", this->params()),
+ double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
+ this->params()),
+ BOOL_MEMBER(tessedit_minimal_rejection, false,
+ "Only reject tess failures", this->params()),
+ BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
+ this->params()),
+ BOOL_MEMBER(tessedit_word_for_word, false,
+ "Make output have exactly one word per WERD", this->params()),
+ BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
+ "Don't reject ANYTHING AT ALL", this->params()),
+ INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
+ this->params()),
+ BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
+ this->params()),
+ BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
+ this->params()),
+ double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
+ "Aspect ratio dot/hyphen test", this->params()),
+ double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
+ "Aspect ratio dot/hyphen test", this->params()),
+ BOOL_MEMBER(rej_trust_doc_dawg, false,
+ "Use DOC dawg in 11l conf. detector", this->params()),
+ BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
+ this->params()),
+ BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
+ this->params()),
+ BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
+ this->params()),
+ BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
+ this->params()),
+ BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
+ this->params()),
+ BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
+ this->params()),
+ BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
+ this->params()),
+ double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
+ "if >this fract", this->params()),
+ INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
+ this->params()),
+ STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
+ "Allow NN to unrej", this->params()),
+ STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
+ this->params()),
+ INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
+ this->params()),
+ INT_MEMBER(tessedit_page_number, -1,
+ "-1 -> All pages, else specific page to process",
+ this->params()),
+ BOOL_MEMBER(tessedit_write_images, false,
+ "Capture the image from the IPE", this->params()),
+ BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
+ this->params()),
+ STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
+ BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
+ this->params()),
+ STRING_MEMBER(tessedit_load_sublangs, "",
+ "List of languages to load with this one", this->params()),
+ BOOL_MEMBER(tessedit_use_primary_params_model, false,
+ "In multilingual mode use params model of the"
+ " primary language",
+ this->params()),
+ double_MEMBER(min_orientation_margin, 7.0,
+ "Min acceptable orientation margin", this->params()),
+ BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
+ this->params()),
+ BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model",
+ this->params()),
+ BOOL_MEMBER(poly_allow_detailed_fx, false,
+ "Allow feature extractors to see the original outline",
+ this->params()),
+ BOOL_INIT_MEMBER(tessedit_init_config_only, false,
+ "Only initialize with the config file. Useful if the "
+ "instance is not going to be used for OCR but say only "
+ "for layout analysis.",
+ this->params()),
+ BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
+ this->params()),
+ BOOL_MEMBER(textord_tabfind_vertical_text, true,
+ "Enable vertical detection", this->params()),
+ BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
+ "Force using vertical text page mode", this->params()),
+ double_MEMBER(
+ textord_tabfind_vertical_text_ratio, 0.5,
+ "Fraction of textlines deemed vertical to use vertical page "
+ "mode",
+ this->params()),
+ double_MEMBER(
+ textord_tabfind_aligned_gap_fraction, 0.75,
+ "Fraction of height used as a minimum gap for aligned blobs.",
+ this->params()),
+ INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
+ this->params()),
+ BOOL_MEMBER(preserve_interword_spaces, false,
+ "Preserve multiple interword spaces", this->params()),
+ STRING_MEMBER(page_separator, "\f",
+ "Page separator (default is form feed control character)",
+ this->params()),
+ INT_MEMBER(lstm_choice_mode, 0,
+ "Allows to include alternative symbols choices in the hOCR output. "
+ "Valid input values are 0, 1 and 2. 0 is the default value. "
+ "With 1 the alternative symbol choices per timestep are included. "
+ "With 2 alternative symbol choices are extracted from the CTC "
+ "process instead of the lattice. The choices are mapped per "
+ "character.",
+ this->params()),
+ INT_MEMBER(
+ lstm_choice_iterations, 5,
+ "Sets the number of cascading iterations for the Beamsearch in "
+ "lstm_choice_mode. Note that lstm_choice_mode must be set to a "
+ "value greater than 0 to produce results.",
+ this->params()),
+ double_MEMBER(
+ lstm_rating_coefficient, 5,
+ "Sets the rating coefficient for the lstm choices. The smaller the "
+ "coefficient, the better are the ratings for each choice and less "
+ "information is lost due to the cut off at 0. The standard value is "
+ "5", this->params()),
+ BOOL_MEMBER(pageseg_apply_music_mask, true,
+ "Detect music staff and remove intersecting components", this->params()),
+
+ backup_config_file_(nullptr),
+ pix_binary_(nullptr),
+ pix_grey_(nullptr),
+ pix_original_(nullptr),
+ pix_thresholds_(nullptr),
+ source_resolution_(0),
+ textord_(this),
+ right_to_left_(false),
+ scaled_color_(nullptr),
+ scaled_factor_(-1),
+ deskew_(1.0f, 0.0f),
+ reskew_(1.0f, 0.0f),
+ most_recently_used_(this),
+ font_table_size_(0),
+ equ_detect_(nullptr),
+ lstm_recognizer_(nullptr),
+ train_line_page_num_(0) {
+}
+
+// Destructor. Releases everything owned by this instance.
+// Order matters: Clear() drops the per-page images and splitter state first,
+// then the original image is destroyed, then end_tesseract() shuts down the
+// language data, and only then are the owned sub-language instances and the
+// LSTM recognizer deleted.
+Tesseract::~Tesseract() {
+  Clear();
+  pixDestroy(&pix_original_);
+  end_tesseract();
+  // Sub-language Tesseracts are owned by this instance.
+  for (auto* lang : sub_langs_) {
+    delete lang;
+  }
+  delete lstm_recognizer_;
+  lstm_recognizer_ = nullptr;
+}
+
+// Returns the dictionary to use for this instance.
+// Falls back to the LSTM recognizer's dictionary when the classic
+// (Classify-level) dictionary has no dawgs loaded and at least one loaded
+// language runs the LSTM engine; otherwise the classic dictionary is used.
+Dict& Tesseract::getDict() {
+  Dict& classic_dict = Classify::getDict();
+  if (classic_dict.NumDawgs() == 0 && AnyLSTMLang() &&
+      lstm_recognizer_ != nullptr && lstm_recognizer_->GetDict() != nullptr) {
+    return *lstm_recognizer_->GetDict();
+  }
+  return classic_dict;
+}
+
+
+// Releases as much per-page state as possible (debug PDF is flushed first,
+// then the working images), and resets geometry and splitter state, for this
+// instance and every sub-language instance. Adaptive-classifier and other
+// trained data are deliberately left intact.
+void Tesseract::Clear() {
+  STRING pdf_debug_name = imagebasename + "_debug.pdf";
+  pixa_debug_.WritePDF(pdf_debug_name.c_str());
+  // Drop all working images owned by this instance.
+  pixDestroy(&pix_binary_);
+  pixDestroy(&pix_grey_);
+  pixDestroy(&pix_thresholds_);
+  pixDestroy(&scaled_color_);
+  // Reset geometry to the identity rotation and clear splitter state.
+  deskew_ = FCOORD(1.0f, 0.0f);
+  reskew_ = FCOORD(1.0f, 0.0f);
+  splitter_.Clear();
+  scaled_factor_ = -1;
+  for (auto* sub_lang : sub_langs_) {
+    sub_lang->Clear();
+  }
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Sets the equation detector and registers this instance as the detector's
+// language Tesseract. Ownership of the detector stays with the caller (the
+// destructor does not delete equ_detect_).
+// NOTE(review): detector is dereferenced unconditionally — callers are
+// presumably required to pass a non-null detector; confirm at call sites.
+void Tesseract::SetEquationDetect(EquationDetect* detector) {
+  equ_detect_ = detector;
+  equ_detect_->SetLangTesseract(this);
+}
+
+// Clear all memory of adaption for this and all subclassifiers.
+void Tesseract::ResetAdaptiveClassifier() {
+  ResetAdaptiveClassifierInternal();
+  for (auto* sub_lang : sub_langs_) {
+    sub_lang->ResetAdaptiveClassifierInternal();
+  }
+}
+
+#endif //ndef DISABLED_LEGACY_ENGINE
+
+// Clear the document dictionary for this and all subclassifiers.
+void Tesseract::ResetDocumentDictionary() {
+  getDict().ResetDocumentDictionary();
+  for (auto* sub_lang : sub_langs_) {
+    sub_lang->getDict().ResetDocumentDictionary();
+  }
+}
+
+// Propagates the character blacklist/whitelist/unblacklist to every loaded
+// classifier: this language's classic and LSTM unicharsets, and those of all
+// sub-languages. All classifiers use this instance's (top-level) lists.
+void Tesseract::SetBlackAndWhitelist() {
+  // Applies the three lists from this instance to one unicharset.
+  auto apply_lists = [this](UNICHARSET& charset) {
+    charset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
+                                    tessedit_char_whitelist.c_str(),
+                                    tessedit_char_unblacklist.c_str());
+  };
+  apply_lists(unicharset);
+  if (lstm_recognizer_ != nullptr) {
+    apply_lists(lstm_recognizer_->GetUnicharset());
+  }
+  // Black and white lists should apply to all loaded classifiers.
+  for (auto* sub_lang : sub_langs_) {
+    apply_lists(sub_lang->unicharset);
+    if (sub_lang->lstm_recognizer_ != nullptr) {
+      apply_lists(sub_lang->lstm_recognizer_->GetUnicharset());
+    }
+  }
+}
+
+// Perform steps to prepare underlying binary image/other data structures for
+// page segmentation. Picks the strongest devanagari split strategy across all
+// loaded languages, shares the binary image with every sub-language (via
+// pixClone, so they hold references to the same pix), then runs shiro-rekha
+// splitting and, if it produced a split image, swaps it in as pix_binary_.
+void Tesseract::PrepareForPageseg() {
+  textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
+  // Find the max splitter strategy over all langs.
+  auto max_pageseg_strategy =
+      static_cast<ShiroRekhaSplitter::SplitStrategy>(
+          static_cast<int32_t>(pageseg_devanagari_split_strategy));
+  for (int i = 0; i < sub_langs_.size(); ++i) {
+    auto pageseg_strategy =
+        static_cast<ShiroRekhaSplitter::SplitStrategy>(
+            static_cast<int32_t>(sub_langs_[i]->pageseg_devanagari_split_strategy));
+    if (pageseg_strategy > max_pageseg_strategy)
+      max_pageseg_strategy = pageseg_strategy;
+    // Replace each sub-language's binary image with a clone (shared
+    // reference) of this instance's current binary image.
+    pixDestroy(&sub_langs_[i]->pix_binary_);
+    sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
+  }
+  // Perform shiro-rekha (top-line) splitting and replace the current image by
+  // the newly split image.
+  splitter_.set_orig_pix(pix_binary());
+  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
+  if (splitter_.Split(true, &pixa_debug_)) {
+    ASSERT_HOST(splitter_.splitted_image());
+    pixDestroy(&pix_binary_);
+    pix_binary_ = pixClone(splitter_.splitted_image());
+  }
+}
+
+// Perform steps to prepare underlying binary image/other data structures for
+// OCR. The current segmentation is required by this method.
+// Note that this method resets pix_binary_ to the original binarized image,
+// which may be different from the image actually used for OCR depending on the
+// value of devanagari_ocr_split_strategy.
+// osd_tess and osr are currently unused here; block_list supplies the
+// segmentation that the splitter refreshes when the OCR-time split strategy
+// differs from the pageseg-time one.
+void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
+                                  Tesseract* osd_tess, OSResults* osr) {
+  // Find the max splitter strategy over all langs.
+  auto max_ocr_strategy =
+      static_cast<ShiroRekhaSplitter::SplitStrategy>(
+          static_cast<int32_t>(ocr_devanagari_split_strategy));
+  for (int i = 0; i < sub_langs_.size(); ++i) {
+    auto ocr_strategy =
+        static_cast<ShiroRekhaSplitter::SplitStrategy>(
+            static_cast<int32_t>(sub_langs_[i]->ocr_devanagari_split_strategy));
+    if (ocr_strategy > max_ocr_strategy)
+      max_ocr_strategy = ocr_strategy;
+  }
+  // Utilize the segmentation information available.
+  splitter_.set_segmentation_block_list(block_list);
+  splitter_.set_ocr_split_strategy(max_ocr_strategy);
+  // Run the splitter for OCR
+  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
+  // Restore pix_binary to the binarized original pix for future reference.
+  ASSERT_HOST(splitter_.orig_pix());
+  pixDestroy(&pix_binary_);
+  pix_binary_ = pixClone(splitter_.orig_pix());
+  // If the pageseg and ocr strategies are different, refresh the block list
+  // (from the last SegmentImage call) with blobs from the real image to be used
+  // for OCR.
+  if (splitter_.HasDifferentSplitStrategies()) {
+    // Temporary block spanning the whole image, used only to extract blobs.
+    BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_),
+                pixGetHeight(pix_binary_));
+    Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
+                                       splitter_.orig_pix();
+    extract_edges(pix_for_ocr, &block);
+    splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
+  }
+  // The splitter isn't needed any more after this, so save memory by clearing.
+  splitter_.Clear();
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/tesseractclass.h b/tesseract/src/ccmain/tesseractclass.h
new file mode 100644
index 00000000..159b0ea7
--- /dev/null
+++ b/tesseract/src/ccmain/tesseractclass.h
@@ -0,0 +1,1163 @@
+///////////////////////////////////////////////////////////////////////
+// File: tesseractclass.h
+// Description: The Tesseract class. It holds/owns everything needed
+// to run Tesseract on a single language, and also a set of
+// sub-Tesseracts to run sub-languages. For thread safety, *every*
+// global variable goes in here, directly, or indirectly.
+// This makes it safe to run multiple Tesseracts in different
+// threads in parallel, and keeps the different language
+// instances separate.
+// Author: Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_
+#define TESSERACT_CCMAIN_TESSERACTCLASS_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+
+#include "control.h" // for ACCEPTABLE_WERD_TYPE
+#include "debugpixa.h" // for DebugPixa
+#include "devanagari_processing.h" // for ShiroRekhaSplitter
+#ifndef DISABLED_LEGACY_ENGINE
+#include "docqual.h" // for GARBAGE_LEVEL
+#endif
+#include "pageres.h" // for WERD_RES (ptr only), PAGE_RES (pt...
+#include "params.h" // for BOOL_VAR_H, BoolParam, DoubleParam
+#include "points.h" // for FCOORD
+#include "ratngs.h" // for ScriptPos, WERD_CHOICE (ptr only)
+#include "tessdatamanager.h" // for TessdataManager
+#include "textord.h" // for Textord
+#include "wordrec.h" // for Wordrec
+
+#include "genericvector.h" // for GenericVector, PointerVector
+#include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...
+#include "strngs.h" // for STRING
+#include <tesseract/unichar.h> // for UNICHAR_ID
+
+#include "allheaders.h" // for pixDestroy, pixGetWidth, pixGetHe...
+
+#include <cstdint> // for int16_t, int32_t, uint16_t
+#include <cstdio> // for FILE
+
+namespace tesseract {
+
+class BLOCK_LIST;
+class ETEXT_DESC;
+struct OSResults;
+class PAGE_RES;
+class PAGE_RES_IT;
+class ROW;
+class SVMenuNode;
+class TBOX;
+class TO_BLOCK_LIST;
+class WERD;
+class WERD_CHOICE;
+class WERD_RES;
+
+class ColumnFinder;
+class DocumentData;
+class EquationDetect;
+class ImageData;
+class LSTMRecognizer;
+class Tesseract;
+
+// Top-level class for all tesseract global instance data.
+// This class either holds or points to all data used by an instance
+// of Tesseract, including the memory allocator. When this is
+// complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
+//
+// NOTE to developers: Do not create cyclic dependencies through this class!
+// The directory dependency tree must remain a tree! To keep this clean,
+// lower-level code (eg in ccutil, the bottom level) must never need to
+// know about the content of a higher-level directory.
+// The following scheme will grant the easiest access to lower-level
+// global members without creating a cyclic dependency:
+//
+// Class Hierarchy (^ = inheritance):
+//
+// CCUtil (ccutil/ccutil.h)
+// ^ Members include: UNICHARSET
+// CCStruct (ccstruct/ccstruct.h)
+// ^ Members include: Image
+// Classify (classify/classify.h)
+// ^ Members include: Dict
+// WordRec (wordrec/wordrec.h)
+// ^ Members include: WERD*, DENORM*
+// Tesseract (ccmain/tesseractclass.h)
+// Members include: Pix*
+//
+// Other important classes:
+//
+// TessBaseAPI (tesseract/baseapi.h)
+// Members include: BLOCK_LIST*, PAGE_RES*,
+// Tesseract*, ImageThresholder*
+// Dict (dict/dict.h)
+// Members include: Image* (private)
+//
+// NOTE: that each level contains members that correspond to global
+// data that is defined (and used) at that level, not necessarily where
+// the type is defined so for instance:
+// BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
+// goes inside the Textord class, not the cc_util class.
+
+// A collection of various variables for statistics and debugging.
+// All counters start at zero; the write_results() flags start in the state
+// expected at the beginning of a page.
+struct TesseractStats {
+  TesseractStats()
+      : adaption_word_number(0),
+        doc_blob_quality(0),
+        doc_outline_errs(0),
+        doc_char_quality(0),
+        good_char_count(0),
+        doc_good_char_quality(0),
+        word_count(0),
+        dict_words(0),
+        tilde_crunch_written(false),
+        last_char_was_newline(true),
+        last_char_was_tilde(false),
+        write_results_empty_block(true) {}
+
+  int32_t adaption_word_number;   // sequence number used during adaption
+  int16_t doc_blob_quality;       // document-level blob quality accumulator
+  int16_t doc_outline_errs;       // document-level outline error count
+  int16_t doc_char_quality;       // document-level character quality
+  int16_t good_char_count;        // count of good characters
+  int16_t doc_good_char_quality;  // quality over the good characters only
+  int32_t word_count;       // count of word in the document
+  int32_t dict_words;       // number of dictionary words in the document
+  STRING dump_words_str;    // accumulator used by dump_words()
+  // Flags used by write_results()
+  bool tilde_crunch_written;
+  bool last_char_was_newline;
+  bool last_char_was_tilde;
+  bool write_results_empty_block;
+};
+
+// Struct to hold all the pointers to relevant data for processing a word.
+// None of the pointers are owned: they refer into the PAGE_RES/BLOCK/ROW
+// structures that the caller manages.
+struct WordData {
+  WordData()
+      : word(nullptr), row(nullptr), block(nullptr), prev_word(nullptr) {}
+  // Captures the current word/row/block from a page-results iterator.
+  explicit WordData(const PAGE_RES_IT& page_res_it)
+      : word(page_res_it.word()),
+        row(page_res_it.row()->row),
+        block(page_res_it.block()->block),
+        prev_word(nullptr) {}
+  WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res)
+      : word(word_res), row(row_in), block(block_in), prev_word(nullptr) {}
+
+  WERD_RES* word;       // the word being processed
+  ROW* row;             // row containing the word
+  BLOCK* block;         // block containing the row
+  WordData* prev_word;  // previous word in reading order, if any
+  PointerVector<WERD_RES> lang_words;  // per-language recognition results
+};
+
+// Definition of a Tesseract WordRecognizer. The WordData provides the context
+// of row/block, in_word holds an initialized, possibly pre-classified word,
+// that the recognizer may or may not consume (but if so it sets
+// *in_word=nullptr) and produces one or more output words in out_words, which
+// may be the consumed in_word, or may be generated independently. This api
+// allows both a conventional tesseract classifier to work, or a line-level
+// classifier that generates multiple words from a merged input.
+using WordRecognizer = void (Tesseract::*)(const WordData&, WERD_RES**,
+ PointerVector<WERD_RES>*);
+
+class TESS_API Tesseract : public Wordrec {
+ public:
+ Tesseract();
+ ~Tesseract() override;
+
+ // Return appropriate dictionary
+ Dict& getDict() override;
+
+ // Clear as much used memory as possible without resetting the adaptive
+ // classifier or losing any other classifier data.
+ void Clear();
+ // Clear all memory of adaption for this and all subclassifiers.
+ void ResetAdaptiveClassifier();
+ // Clear the document dictionary for this and all subclassifiers.
+ void ResetDocumentDictionary();
+
+ // Set the equation detector.
+ void SetEquationDetect(EquationDetect* detector);
+
+ // Simple accessors.
+ const FCOORD& reskew() const {
+ return reskew_;
+ }
+ // Destroy any existing pix and return a pointer to the pointer.
+ Pix** mutable_pix_binary() {
+ pixDestroy(&pix_binary_);
+ return &pix_binary_;
+ }
+ Pix* pix_binary() const {
+ return pix_binary_;
+ }
+ Pix* pix_grey() const {
+ return pix_grey_;
+ }
+ void set_pix_grey(Pix* grey_pix) {
+ pixDestroy(&pix_grey_);
+ pix_grey_ = grey_pix;
+ }
+ Pix* pix_original() const {
+ return pix_original_;
+ }
+ // Takes ownership of the given original_pix.
+ void set_pix_original(Pix* original_pix) {
+ pixDestroy(&pix_original_);
+ pix_original_ = original_pix;
+ // Clone to sublangs as well.
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
+ : nullptr);
+ }
+ }
+ // Returns a pointer to a Pix representing the best available resolution image
+ // of the page, with best available bit depth as second priority. Result can
+ // be of any bit depth, but never color-mapped, as that has always been
+ // removed. Note that in grey and color, 0 is black and 255 is
+ // white. If the input was binary, then black is 1 and white is 0.
+ // To tell the difference pixGetDepth() will return 32, 8 or 1.
+ // In any case, the return value is a borrowed Pix, and should not be
+ // deleted or pixDestroyed.
+ Pix* BestPix() const {
+ if (pixGetWidth(pix_original_) == ImageWidth()) {
+ return pix_original_;
+ } else if (pix_grey_ != nullptr) {
+ return pix_grey_;
+ } else {
+ return pix_binary_;
+ }
+ }
+ void set_pix_thresholds(Pix* thresholds) {
+ pixDestroy(&pix_thresholds_);
+ pix_thresholds_ = thresholds;
+ }
+ int source_resolution() const {
+ return source_resolution_;
+ }
+ void set_source_resolution(int ppi) {
+ source_resolution_ = ppi;
+ }
+ int ImageWidth() const {
+ return pixGetWidth(pix_binary_);
+ }
+ int ImageHeight() const {
+ return pixGetHeight(pix_binary_);
+ }
+ Pix* scaled_color() const {
+ return scaled_color_;
+ }
+ int scaled_factor() const {
+ return scaled_factor_;
+ }
+ void SetScaledColor(int factor, Pix* color) {
+ scaled_factor_ = factor;
+ scaled_color_ = color;
+ }
+ const Textord& textord() const {
+ return textord_;
+ }
+ Textord* mutable_textord() {
+ return &textord_;
+ }
+
+ bool right_to_left() const {
+ return right_to_left_;
+ }
+ int num_sub_langs() const {
+ return sub_langs_.size();
+ }
+ Tesseract* get_sub_lang(int index) const {
+ return sub_langs_[index];
+ }
+ // Returns true if any language uses Tesseract (as opposed to LSTM).
+ bool AnyTessLang() const {
+ if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
+ return true;
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
+ return true;
+ }
+ return false;
+ }
+ // Returns true if any language uses the LSTM.
+ bool AnyLSTMLang() const {
+ if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
+ return true;
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void SetBlackAndWhitelist();
+
+ // Perform steps to prepare underlying binary image/other data structures for
+ // page segmentation. Uses the strategy specified in the global variable
+  // pageseg_devanagari_split_strategy to perform splitting while preparing for
+ // page segmentation.
+ void PrepareForPageseg();
+
+ // Perform steps to prepare underlying binary image/other data structures for
+ // Tesseract OCR. The current segmentation is required by this method.
+ // Uses the strategy specified in the global variable
+ // ocr_devanagari_split_strategy for performing splitting while preparing for
+ // Tesseract ocr.
+ void PrepareForTessOCR(BLOCK_LIST* block_list, Tesseract* osd_tess,
+ OSResults* osr);
+
+ int SegmentPage(const char* input_file, BLOCK_LIST* blocks,
+ Tesseract* osd_tess, OSResults* osr);
+ void SetupWordScripts(BLOCK_LIST* blocks);
+ int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+ TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
+ Tesseract* osd_tess, OSResults* osr);
+ ColumnFinder* SetupPageSegAndDetectOrientation(
+ PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
+ OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
+ Pix** music_mask_pix);
+ // par_control.cpp
+ void PrerecAllWordsPar(const std::vector<WordData>& words);
+
+ //// linerec.cpp
+ // Generates training data for training a line recognizer, eg LSTM.
+ // Breaks the page into lines, according to the boxes, and writes them to a
+ // serialized DocumentData based on output_basename.
+ // Return true if successful, false if an error occurred.
+ bool TrainLineRecognizer(const char* input_imagename,
+ const STRING& output_basename,
+ BLOCK_LIST* block_list);
+ // Generates training data for training a line recognizer, eg LSTM.
+ // Breaks the boxes into lines, normalizes them, converts to ImageData and
+ // appends them to the given training_data.
+ void TrainFromBoxes(const std::vector<TBOX>& boxes,
+ const std::vector<STRING>& texts,
+ BLOCK_LIST* block_list, DocumentData* training_data);
+
+ // Returns an Imagedata containing the image of the given textline,
+ // and ground truth boxes/truth text if available in the input.
+ // The image is not normalized in any way.
+ ImageData* GetLineData(const TBOX& line_box, const std::vector<TBOX>& boxes,
+ const std::vector<STRING>& texts, int start_box,
+ int end_box, const BLOCK& block);
+ // Helper gets the image of a rectangle, using the block.re_rotation() if
+ // needed to get to the image, and rotating the result back to horizontal
+ // layout. (CJK characters will be on their left sides) The vertical text flag
+ // is set in the returned ImageData if the text was originally vertical, which
+ // can be used to invoke a different CJK recognition engine. The revised_box
+ // is also returned to enable calculation of output bounding boxes.
+ ImageData* GetRectImage(const TBOX& box, const BLOCK& block, int padding,
+ TBOX* revised_box) const;
+ // Recognizes a word or group of words, converting to WERD_RES in *words.
+ // Analogous to classify_word_pass1, but can handle a group of words as well.
+ void LSTMRecognizeWord(const BLOCK& block, ROW* row, WERD_RES* word,
+ PointerVector<WERD_RES>* words);
+ // Apply segmentation search to the given set of words, within the constraints
+ // of the existing ratings matrix. If there is already a best_choice on a word
+ // leaves it untouched and just sets the done/accepted etc flags.
+ void SearchWords(PointerVector<WERD_RES>* words);
+
+ //// control.h /////////////////////////////////////////////////////////
+ bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
+ const char* word_config, int pass);
+ // Sets up the words ready for whichever engine is to be run
+ void SetupAllWordsPassN(int pass_n, const TBOX* target_word_box,
+ const char* word_config, PAGE_RES* page_res,
+ std::vector<WordData>* words);
+ // Sets up the single word ready for whichever engine is to be run.
+ void SetupWordPassN(int pass_n, WordData* word);
+ // Runs word recognition on all the words.
+ bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, PAGE_RES_IT* pr_it,
+ std::vector<WordData>* words);
+ bool recog_all_words(PAGE_RES* page_res, ETEXT_DESC* monitor,
+ const TBOX* target_word_box, const char* word_config,
+ int dopasses);
+ void rejection_passes(PAGE_RES* page_res, ETEXT_DESC* monitor,
+ const TBOX* target_word_box, const char* word_config);
+ void bigram_correction_pass(PAGE_RES* page_res);
+ void blamer_pass(PAGE_RES* page_res);
+ // Sets script positions and detects smallcaps on all output words.
+ void script_pos_pass(PAGE_RES* page_res);
+ // Helper to recognize the word using the given (language-specific) tesseract.
+ // Returns positive if this recognizer found more new best words than the
+ // number kept from best_words.
+ int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
+ bool debug, WERD_RES** in_word,
+ PointerVector<WERD_RES>* best_words);
+ // Moves good-looking "noise"/diacritics from the reject list to the main
+ // blob list on the current word. Returns true if anything was done, and
+ // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+ bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+ bool* make_next_word_fuzzy);
+ // Attempts to put noise/diacritic outlines into the blobs that they overlap.
+ // Input: a set of noisy outlines that probably belong to the real_word.
+ // Output: outlines that overlapped blobs are set to nullptr and put back into
+ // the word, either in the blobs or in the reject list.
+ void AssignDiacriticsToOverlappingBlobs(
+ const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+ PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+ GenericVector<bool>* overlapped_any_blob,
+ GenericVector<C_BLOB*>* target_blobs);
+ // Attempts to assign non-overlapping outlines to their nearest blobs or
+ // make new blobs out of them.
+ void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines,
+ int pass, WERD* real_word, PAGE_RES_IT* pr_it,
+ GenericVector<bool>* word_wanted,
+ GenericVector<C_BLOB*>* target_blobs);
+ // Starting with ok_outlines set to indicate which outlines overlap the blob,
+ // chooses the optimal set (approximately) and returns true if any outlines
+ // are desired, in which case ok_outlines indicates which ones.
+ bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
+ PAGE_RES_IT* pr_it, C_BLOB* blob,
+ const GenericVector<C_OUTLINE*>& outlines,
+ int num_outlines,
+ std::vector<bool>* ok_outlines);
+ // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
+ // the inclusion of the outlines, and returns the certainty of the raw choice.
+ float ClassifyBlobPlusOutlines(const std::vector<bool>& ok_outlines,
+ const GenericVector<C_OUTLINE*>& outlines,
+ int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
+ STRING* best_str);
+ // Classifies the given blob (part of word_data->word->word) as an individual
+ // word, using languages, chopper etc, returning only the certainty of the
+ // best raw choice, and undoing all the work done to fake out the word.
+ float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
+ STRING* best_str, float* c2);
+ void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
+ WordData* word_data);
+ void classify_word_pass1(const WordData& word_data, WERD_RES** in_word,
+ PointerVector<WERD_RES>* out_words);
+ void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
+ TBOX& selection_box);
+
+ void fix_rep_char(PAGE_RES_IT* page_res_it);
+
+ ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
+ const char* s,
+ const char* lengths);
+ void match_word_pass_n(int pass_n, WERD_RES* word, ROW* row, BLOCK* block);
+ void classify_word_pass2(const WordData& word_data, WERD_RES** in_word,
+ PointerVector<WERD_RES>* out_words);
+ void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES* word,
+ WERD_RES* new_word);
+ bool RunOldFixXht(WERD_RES* word, BLOCK* block, ROW* row);
+ bool TrainedXheightFix(WERD_RES* word, BLOCK* block, ROW* row);
+ // Runs recognition with the test baseline shift and x-height and returns true
+ // if there was an improvement in recognition result.
+ bool TestNewNormalization(int original_misfits, float baseline_shift,
+ float new_x_ht, WERD_RES* word, BLOCK* block,
+ ROW* row);
+ bool recog_interactive(PAGE_RES_IT* pr_it);
+
+ // Set fonts of this word.
+ void set_word_fonts(WERD_RES* word);
+ void font_recognition_pass(PAGE_RES* page_res);
+ void dictionary_correction_pass(PAGE_RES* page_res);
+ bool check_debug_pt(WERD_RES* word, int location);
+
+ //// superscript.cpp ////////////////////////////////////////////////////
+ bool SubAndSuperscriptFix(WERD_RES* word_res);
+ void GetSubAndSuperscriptCandidates(
+ const WERD_RES* word, int* num_rebuilt_leading, ScriptPos* leading_pos,
+ float* leading_certainty, int* num_rebuilt_trailing,
+ ScriptPos* trailing_pos, float* trailing_certainty, float* avg_certainty,
+ float* unlikely_threshold);
+ WERD_RES* TrySuperscriptSplits(int num_chopped_leading,
+ float leading_certainty, ScriptPos leading_pos,
+ int num_chopped_trailing,
+ float trailing_certainty,
+ ScriptPos trailing_pos, WERD_RES* word,
+ bool* is_good, int* retry_leading,
+ int* retry_trailing);
+ bool BelievableSuperscript(bool debug, const WERD_RES& word,
+ float certainty_threshold, int* left_ok,
+ int* right_ok) const;
+
+ //// output.h //////////////////////////////////////////////////////////
+
+ void output_pass(PAGE_RES_IT& page_res_it, const TBOX* target_word_box);
+ void write_results(PAGE_RES_IT& page_res_it, // full info
+ char newline_type, // type of newline
+ bool force_eol // override tilde crunch?
+ );
+ void set_unlv_suspects(WERD_RES* word);
+ UNICHAR_ID get_rep_char(WERD_RES* word); // what char is repeated?
+ bool acceptable_number_string(const char* s, const char* lengths);
+ int16_t count_alphanums(const WERD_CHOICE& word);
+ int16_t count_alphas(const WERD_CHOICE& word);
+
+ void read_config_file(const char* filename, SetParamConstraint constraint);
+ // Initialize for potentially a set of languages defined by the language
+ // string and recursively any additional languages required by any language
+ // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
+ // See init_tesseract_internal for args.
+ int init_tesseract(const char* arg0, const char* textbase,
+ const char* language, OcrEngineMode oem, char** configs,
+ int configs_size, const std::vector<std::string>* vars_vec,
+ const std::vector<std::string>* vars_values,
+ bool set_only_init_params, TessdataManager* mgr);
+ int init_tesseract(const char* datapath, const char* language,
+ OcrEngineMode oem) {
+ TessdataManager mgr;
+ return init_tesseract(datapath, nullptr, language, oem, nullptr, 0, nullptr,
+ nullptr, false, &mgr);
+ }
+ // Common initialization for a single language.
+ // arg0 is the datapath for the tessdata directory, which could be the
+ // path of the tessdata directory with no trailing /, or (if tessdata
+ // lives in the same directory as the executable, the path of the executable,
+ // hence the name arg0.
+ // textbase is an optional output file basename (used only for training)
+ // language is the language code to load.
+ // oem controls which engine(s) will operate on the image
+ // configs (argv) is an array of config filenames to load variables from.
+ // May be nullptr.
+ // configs_size (argc) is the number of elements in configs.
+ // vars_vec is an optional vector of variables to set.
+ // vars_values is an optional corresponding vector of values for the variables
+ // in vars_vec.
+ // If set_only_init_params is true, then only the initialization variables
+ // will be set.
+ int init_tesseract_internal(const char* arg0, const char* textbase,
+ const char* language, OcrEngineMode oem,
+ char** configs, int configs_size,
+ const std::vector<std::string>* vars_vec,
+ const std::vector<std::string>* vars_values,
+ bool set_only_init_params, TessdataManager* mgr);
+
+ // Set the universal_id member of each font to be unique among all
+ // instances of the same font loaded.
+ void SetupUniversalFontIds();
+
+ int init_tesseract_lm(const char* arg0, const char* textbase,
+ const char* language, TessdataManager* mgr);
+
+ void recognize_page(STRING& image_name);
+ void end_tesseract();
+
+ bool init_tesseract_lang_data(const char* arg0, const char* textbase,
+ const char* language, OcrEngineMode oem,
+ char** configs, int configs_size,
+ const std::vector<std::string>* vars_vec,
+ const std::vector<std::string>* vars_values,
+ bool set_only_init_params,
+ TessdataManager* mgr);
+
+ void ParseLanguageString(const char* lang_str, std::vector<std::string>* to_load,
+ std::vector<std::string>* not_to_load);
+
+ //// pgedit.h //////////////////////////////////////////////////////////
+ SVMenuNode* build_menu_new();
+#ifndef GRAPHICS_DISABLED
+ void pgeditor_main(int width, int height, PAGE_RES* page_res);
+#endif // !GRAPHICS_DISABLED
+ void process_image_event( // action in image win
+ const SVEvent& event);
+ bool process_cmd_win_event( // UI command semantics
+ int32_t cmd_event, // which menu item?
+ char* new_value // any prompt data
+ );
+ void debug_word(PAGE_RES* page_res, const TBOX& selection_box);
+ void do_re_display(
+ bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
+ bool word_display(PAGE_RES_IT* pr_it);
+ bool word_bln_display(PAGE_RES_IT* pr_it);
+ bool word_blank_and_set_display(PAGE_RES_IT* pr_its);
+ bool word_set_display(PAGE_RES_IT* pr_it);
+ // #ifndef GRAPHICS_DISABLED
+ bool word_dumper(PAGE_RES_IT* pr_it);
+ // #endif // !GRAPHICS_DISABLED
+ void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
+ //// reject.h //////////////////////////////////////////////////////////
+ // make rej map for word
+ void make_reject_map(WERD_RES* word, ROW* row, int16_t pass);
+ bool one_ell_conflict(WERD_RES* word_res, bool update_map);
+ int16_t first_alphanum_index(const char* word, const char* word_lengths);
+ int16_t first_alphanum_offset(const char* word, const char* word_lengths);
+ int16_t alpha_count(const char* word, const char* word_lengths);
+ bool word_contains_non_1_digit(const char* word, const char* word_lengths);
+ void dont_allow_1Il(WERD_RES* word);
+ int16_t count_alphanums( // how many alphanums
+ WERD_RES* word);
+ void flip_0O(WERD_RES* word);
+ bool non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
+ bool non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
+ bool repeated_nonalphanum_wd(WERD_RES* word, ROW* row);
+ void nn_match_word( // Match a word
+ WERD_RES* word, ROW* row);
+ void nn_recover_rejects(WERD_RES* word, ROW* row);
+ void set_done( // set done flag
+ WERD_RES* word, int16_t pass);
+ int16_t safe_dict_word(const WERD_RES* werd_res); // is best_choice in dict?
+ void flip_hyphens(WERD_RES* word);
+ void reject_I_1_L(WERD_RES* word);
+ void reject_edge_blobs(WERD_RES* word);
+ void reject_mostly_rejects(WERD_RES* word);
+ //// adaptions.h ///////////////////////////////////////////////////////
+ bool word_adaptable( // should we adapt?
+ WERD_RES* word, uint16_t mode);
+
+ //// tfacepp.cpp ///////////////////////////////////////////////////////
+ void recog_word_recursive(WERD_RES* word);
+ void recog_word(WERD_RES* word);
+ void split_and_recog_word(WERD_RES* word);
+ void split_word(WERD_RES* word, int split_pt, WERD_RES** right_piece,
+ BlamerBundle** orig_blamer_bundle) const;
+ void join_words(WERD_RES* word, WERD_RES* word2, BlamerBundle* orig_bb) const;
+ //// fixspace.cpp ///////////////////////////////////////////////////////
+ bool digit_or_numeric_punct(WERD_RES* word, int char_position);
+ int16_t eval_word_spacing(WERD_RES_LIST& word_res_list);
+ void match_current_words(WERD_RES_LIST& words, ROW* row, BLOCK* block);
+ int16_t fp_eval_word_spacing(WERD_RES_LIST& word_res_list);
+ void fix_noisy_space_list(WERD_RES_LIST& best_perm, ROW* row, BLOCK* block);
+ void fix_fuzzy_space_list(WERD_RES_LIST& best_perm, ROW* row, BLOCK* block);
+ void fix_sp_fp_word(WERD_RES_IT& word_res_it, ROW* row, BLOCK* block);
+ void fix_fuzzy_spaces( // find fuzzy words
+ ETEXT_DESC* monitor, // progress monitor
+ int32_t word_count, // count of words in doc
+ PAGE_RES* page_res);
+ void dump_words(WERD_RES_LIST& perm, int16_t score, int16_t mode,
+ bool improved);
+ bool fixspace_thinks_word_done(WERD_RES* word);
+ int16_t worst_noise_blob(WERD_RES* word_res, float* worst_noise_score);
+ float blob_noise_score(TBLOB* blob);
+ void break_noisiest_blob_word(WERD_RES_LIST& words);
+ //// docqual.cpp ////////////////////////////////////////////////////////
+#ifndef DISABLED_LEGACY_ENGINE
+ GARBAGE_LEVEL garbage_word(WERD_RES* word, bool ok_dict_word);
+ bool potential_word_crunch(WERD_RES* word, GARBAGE_LEVEL garbage_level,
+ bool ok_dict_word);
+#endif
+ void tilde_crunch(PAGE_RES_IT& page_res_it);
+ void unrej_good_quality_words( // unreject potential
+ PAGE_RES_IT& page_res_it);
+ void doc_and_block_rejection( // reject big chunks
+ PAGE_RES_IT& page_res_it, bool good_quality_doc);
+ void quality_based_rejection(PAGE_RES_IT& page_res_it, bool good_quality_doc);
+ void convert_bad_unlv_chs(WERD_RES* word_res);
+ void tilde_delete(PAGE_RES_IT& page_res_it);
+ int16_t word_blob_quality(WERD_RES* word);
+ void word_char_quality(WERD_RES* word, int16_t* match_count,
+ int16_t* accepted_match_count);
+ void unrej_good_chs(WERD_RES* word);
+ int16_t count_outline_errs(char c, int16_t outline_count);
+ int16_t word_outline_errs(WERD_RES* word);
+#ifndef DISABLED_LEGACY_ENGINE
+ bool terrible_word_crunch(WERD_RES* word, GARBAGE_LEVEL garbage_level);
+#endif
+ CRUNCH_MODE word_deletable(WERD_RES* word, int16_t& delete_mode);
+ int16_t failure_count(WERD_RES* word);
+ bool noise_outlines(TWERD* word);
+ //// pagewalk.cpp ///////////////////////////////////////////////////////
+ void process_selected_words(
+ PAGE_RES* page_res, // blocks to check
+ // function to call
+ TBOX& selection_box,
+ bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
+ //// tessbox.cpp ///////////////////////////////////////////////////////
+ void tess_add_doc_word( // test acceptability
+ WERD_CHOICE* word_choice // after context
+ );
+ void tess_segment_pass_n(int pass_n, WERD_RES* word);
+ bool tess_acceptable_word(WERD_RES* word);
+
+ //// applybox.cpp //////////////////////////////////////////////////////
+ // Applies the box file based on the image name filename, and resegments
+ // the words in the block_list (page), with:
+ // blob-mode: one blob per line in the box file, words as input.
+ // word/line-mode: one blob per space-delimited unit after the #, and one word
+ // per line in the box file. (See comment above for box file format.)
+ // If find_segmentation is true, (word/line mode) then the classifier is used
+ // to re-segment words/lines to match the space-delimited truth string for
+ // each box. In this case, the input box may be for a word or even a whole
+ // text line, and the output words will contain multiple blobs corresponding
+ // to the space-delimited input string.
+ // With find_segmentation false, no classifier is needed, but the chopper
+ // can still be used to correctly segment touching characters with the help
+ // of the input boxes.
+ // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
+ // from normal classification, ie. with a word, chopped_word, rebuild_word,
+ // seam_array, denorm, box_word, and best_state, but NO best_choice or
+ // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
+ // Instead, the correct_text member of WERD_RES is set, and this may be later
+ // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
+ // is not required before calling ApplyBoxTraining.
+ PAGE_RES* ApplyBoxes(const char* filename, bool find_segmentation,
+ BLOCK_LIST* block_list);
+
+ // Any row xheight that is significantly different from the median is set
+ // to the median.
+ void PreenXHeights(BLOCK_LIST* block_list);
+
+ // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
+ // All fuzzy spaces are removed, and all the words are maximally chopped.
+ PAGE_RES* SetupApplyBoxes(const std::vector<TBOX>& boxes,
+ BLOCK_LIST* block_list);
+ // Tests the chopper by exhaustively running chop_one_blob.
+ // The word_res will contain filled chopped_word, seam_array, denorm,
+ // box_word and best_state for the maximally chopped word.
+ void MaximallyChopWord(const std::vector<TBOX>& boxes, BLOCK* block,
+ ROW* row, WERD_RES* word_res);
+ // Gather consecutive blobs that match the given box into the best_state
+ // and corresponding correct_text.
+ // Fights over which box owns which blobs are settled by pre-chopping and
+ // applying the blobs to box or next_box with the least non-overlap.
+ // Returns false if the box was in error, which can only be caused by
+ // failing to find an appropriate blob for a box.
+ // This means that occasionally, blobs may be incorrectly segmented if the
+ // chopper fails to find a suitable chop point.
+ bool ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
+ const TBOX& box, const TBOX* next_box,
+ const char* correct_text);
+ // Consume all source blobs that strongly overlap the given box,
+ // putting them into a new word, with the correct_text label.
+ // Fights over which box owns which blobs are settled by
+ // applying the blobs to box or next_box with the least non-overlap.
+ // Returns false if the box was in error, which can only be caused by
+ // failing to find an overlapping blob for a box.
+ bool ResegmentWordBox(BLOCK_LIST* block_list, const TBOX& box,
+ const TBOX* next_box, const char* correct_text);
+ // Resegments the words by running the classifier in an attempt to find the
+ // correct segmentation that produces the required string.
+ void ReSegmentByClassification(PAGE_RES* page_res);
+ // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
+ // Returns false if an invalid UNICHAR_ID is encountered.
+ bool ConvertStringToUnichars(const char* utf8,
+ GenericVector<UNICHAR_ID>* class_ids);
+ // Resegments the word to achieve the target_text from the classifier.
+ // Returns false if the re-segmentation fails.
+ // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
+ // applies a full search on the classifier results to find the best classified
+ // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
+ // substitutions ARE used.
+ bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
+ WERD_RES* word_res);
+ // Recursive helper to find a match to the target_text (from text_index
+ // position) in the choices (from choices_pos position).
+ // Choices is an array of GenericVectors, of length choices_length, with each
+ // element representing a starting position in the word, and the
+ // GenericVector holding classification results for a sequence of consecutive
+ // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
+ void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
+ int choices_pos, int choices_length,
+ const GenericVector<UNICHAR_ID>& target_text,
+ int text_index, float rating,
+ GenericVector<int>* segmentation, float* best_rating,
+ GenericVector<int>* best_segmentation);
+ // Counts up the labelled words and the blobs within.
+ // Deletes all unused or emptied words, counting the unused ones.
+ // Resets W_BOL and W_EOL flags correctly.
+ // Builds the rebuild_word and rebuilds the box_word.
+ void TidyUp(PAGE_RES* page_res);
+ // Logs a bad box by line in the box file and box coords.
+ void ReportFailedBox(int boxfile_lineno, TBOX box, const char* box_ch,
+ const char* err_msg);
+ // Creates a fake best_choice entry in each WERD_RES with the correct text.
+ void CorrectClassifyWords(PAGE_RES* page_res);
+ // Call LearnWord to extract features for labelled blobs within each word.
+ // Features are stored in an internal buffer.
+ void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res);
+
+ //// fixxht.cpp ///////////////////////////////////////////////////////
+ // Returns the number of misfit blob tops in this word.
+ int CountMisfitTops(WERD_RES* word_res);
+ // Returns a new x-height in pixels (original image coords) that is
+ // maximally compatible with the result in word_res.
+ // Returns 0.0f if no x-height is found that is better than the current
+ // estimate.
+ float ComputeCompatibleXheight(WERD_RES* word_res, float* baseline_shift);
+ //// Data members ///////////////////////////////////////////////////////
+ // TODO(ocr-team): Find and remove obsolete parameters.
+ BOOL_VAR_H(tessedit_resegment_from_boxes, false,
+ "Take segmentation and labeling from box file");
+ BOOL_VAR_H(tessedit_resegment_from_line_boxes, false,
+ "Conversion of word/line box file to char box file");
+ BOOL_VAR_H(tessedit_train_from_boxes, false,
+ "Generate training data from boxed chars");
+ BOOL_VAR_H(tessedit_make_boxes_from_boxes, false,
+ "Generate more boxes from boxed chars");
+ BOOL_VAR_H(tessedit_train_line_recognizer, false,
+ "Break input into lines and remap boxes if present");
+ BOOL_VAR_H(tessedit_dump_pageseg_images, false,
+ "Dump intermediate images made during page segmentation");
+ BOOL_VAR_H(tessedit_do_invert, true,
+ "Try inverting the image in `LSTMRecognizeWord`");
+ INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
+ "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
+ " 5=line, 6=word, 7=char"
+ " (Values from PageSegMode enum in tesseract/publictypes.h)");
+ INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
+ "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
+ " to loading and running the most accurate available.");
+ STRING_VAR_H(tessedit_char_blacklist, "",
+ "Blacklist of chars not to recognize");
+ STRING_VAR_H(tessedit_char_whitelist, "", "Whitelist of chars to recognize");
+ STRING_VAR_H(tessedit_char_unblacklist, "",
+ "List of chars to override tessedit_char_blacklist");
+ BOOL_VAR_H(tessedit_ambigs_training, false,
+ "Perform training for ambiguities");
+ INT_VAR_H(pageseg_devanagari_split_strategy,
+ tesseract::ShiroRekhaSplitter::NO_SPLIT,
+ "Whether to use the top-line splitting process for Devanagari "
+ "documents while performing page-segmentation.");
+ INT_VAR_H(ocr_devanagari_split_strategy,
+ tesseract::ShiroRekhaSplitter::NO_SPLIT,
+ "Whether to use the top-line splitting process for Devanagari "
+ "documents while performing ocr.");
+ STRING_VAR_H(tessedit_write_params_to_file, "",
+ "Write all parameters to the given file.");
+ BOOL_VAR_H(tessedit_adaption_debug, false,
+ "Generate and print debug information for adaption");
+ INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
+ INT_VAR_H(applybox_debug, 1, "Debug level");
+ INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
+ STRING_VAR_H(applybox_exposure_pattern, ".exp",
+ "Exposure value follows this pattern in the image"
+ " filename. The name of the image files are expected"
+ " to be in the form [lang].[fontname].exp[num].tif");
+ BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false,
+ "Learn both character fragments (as is done in the"
+ " special low exposure mode) as well as unfragmented"
+ " characters.");
+ BOOL_VAR_H(applybox_learn_ngrams_mode, false,
+ "Each bounding box is assumed to contain ngrams. Only"
+ " learn the ngrams whose outlines overlap horizontally.");
+ BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
+ BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
+ BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
+ BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces");
+ BOOL_VAR_H(tessedit_unrej_any_wd, false,
+ "Don't bother with word plausibility");
+ BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
+ BOOL_VAR_H(tessedit_enable_doc_dict, true,
+ "Add words to the document dictionary");
+ BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
+ BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
+ BOOL_VAR_H(tessedit_enable_bigram_correction, true,
+ "Enable correction based on the word bigram dictionary.");
+ BOOL_VAR_H(tessedit_enable_dict_correction, false,
+ "Enable single word correction based on the dictionary.");
+ INT_VAR_H(tessedit_bigram_debug, 0,
+ "Amount of debug output for bigram "
+ "correction.");
+ BOOL_VAR_H(enable_noise_removal, true,
+ "Remove and conditionally reassign small outlines when they"
+ " confuse layout analysis, determining diacritics vs noise");
+ INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
+ // Worst (min) certainty, for which a diacritic is allowed to make the base
+ // character worse and still be included.
+ double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
+ // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
+ // make the base character worse and still be included.
+ double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
+ // Worst (min) certainty, for which a diacritic is allowed to make a new
+ // stand-alone blob.
+ double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
+ // Factor of certainty margin for adding diacritics to not count as worse.
+ double_VAR_H(noise_cert_factor, 0.375,
+ "Scaling on certainty diff from Hingepoint");
+ INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
+ INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
+ INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
+ STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
+ STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
+ STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
+ double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
+ double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
+ double_VAR_H(quality_outline_pc, 1.0,
+ "good_quality_doc lte outline error limit");
+ double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
+ INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
+ INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
+ "Adaptation decision algorithm for tess");
+ BOOL_VAR_H(tessedit_minimal_rej_pass1, false,
+ "Do minimal rejection on pass 1 output");
+ BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
+ BOOL_VAR_H(test_pt, false, "Test for point");
+ double_VAR_H(test_pt_x, 99999.99, "xcoord");
+ double_VAR_H(test_pt_y, 99999.99, "ycoord");
+ INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
+ INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
+ BOOL_VAR_H(paragraph_text_based, true,
+ "Run paragraph detection on the post-text-recognition "
+ "(more accurate)");
+ BOOL_VAR_H(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm");
+ STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
+ STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
+ BOOL_VAR_H(tessedit_good_quality_unrej, true,
+ "Reduce rejection on good docs");
+ BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
+ double_VAR_H(tessedit_reject_doc_percent, 65.00,
+ "%rej allowed before rej whole doc");
+ double_VAR_H(tessedit_reject_block_percent, 45.00,
+ "%rej allowed before rej whole block");
+ double_VAR_H(tessedit_reject_row_percent, 40.00,
+ "%rej allowed before rej whole row");
+ double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00,
+ "Number of row rejects in whole word rejects "
+ "which prevents whole row rejection");
+ BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true,
+ "Only rej partially rejected words in block rejection");
+ BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true,
+ "Only rej partially rejected words in row rejection");
+ BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false,
+ "Use word segmentation quality metric");
+ BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false,
+ "Use word segmentation quality metric");
+ INT_VAR_H(tessedit_preserve_min_wd_len, 2,
+ "Only preserve wds longer than this");
+ BOOL_VAR_H(tessedit_row_rej_good_docs, true,
+ "Apply row rejection to good docs");
+ double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1,
+ "rej good doc wd if more than this fraction rejected");
+ BOOL_VAR_H(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds");
+ BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats");
+ BOOL_VAR_H(tessedit_debug_quality_metrics, false,
+ "Output data to debug file");
+ BOOL_VAR_H(bland_unrej, false, "unrej potential with no checks");
+ double_VAR_H(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit");
+ BOOL_VAR_H(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch");
+ BOOL_VAR_H(hocr_font_info, false, "Add font info to hocr output");
+ BOOL_VAR_H(hocr_char_boxes, false,
+ "Add coordinates for each character to hocr output");
+ BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
+ BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
+ double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
+ BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
+ double_VAR_H(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this");
+ double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
+ double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
+ double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
+ double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
+ double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
+ double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
+ double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
+ double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
+ double_VAR_H(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl");
+ double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
+ double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
+ INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
+ INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
+ BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
+ BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
+ BOOL_VAR_H(crunch_leave_accept_strings, false,
+ "Don't pot crunch sensible strings");
+ BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
+ INT_VAR_H(crunch_leave_lc_strings, 4,
+ "Don't crunch words with long lower case strings");
+ INT_VAR_H(crunch_leave_uc_strings, 4,
+ "Don't crunch words with long upper case strings");
+ INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
+ INT_VAR_H(crunch_debug, 0, "As it says");
+ INT_VAR_H(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?");
+ double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
+ BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctuation joins");
+ INT_VAR_H(fixsp_done_mode, 1, "What constitutes done for spacing");
+ INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
+ STRING_VAR_H(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers");
+ INT_VAR_H(x_ht_acceptance_tolerance, 8,
+ "Max allowed deviation of blob top outside of font data");
+ INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
+ INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer");
+ double_VAR_H(superscript_worse_certainty, 2.0,
+ "How many times worse "
+ "certainty does a superscript position glyph need to be for us "
+ "to try classifying it as a char with a different baseline?");
+ double_VAR_H(superscript_bettered_certainty, 0.97,
+ "What reduction in "
+ "badness do we think sufficient to choose a superscript over "
+ "what we'd thought. For example, a value of 0.6 means we want "
+ "to reduce badness of certainty by 40%");
+ double_VAR_H(superscript_scaledown_ratio, 0.4,
+ "A superscript scaled down more than this is unbelievably "
+ "small. For example, 0.3 means we expect the font size to "
+ "be no smaller than 30% of the text line font size.");
+ double_VAR_H(subscript_max_y_top, 0.5,
+ "Maximum top of a character measured as a multiple of x-height "
+ "above the baseline for us to reconsider whether it's a "
+ "subscript.");
+ double_VAR_H(superscript_min_y_bottom, 0.3,
+ "Minimum bottom of a character measured as a multiple of "
+ "x-height above the baseline for us to reconsider whether it's "
+ "a superscript.");
+ BOOL_VAR_H(tessedit_write_block_separators, false,
+ "Write block separators in output");
+ BOOL_VAR_H(tessedit_write_rep_codes, false, "Write repetition char code");
+ BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
+ BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
+ BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
+ BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
+ BOOL_VAR_H(tessedit_create_lstmbox, false,
+ "Write .box file for LSTM training");
+ BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
+ BOOL_VAR_H(tessedit_create_wordstrbox, false,
+ "Write WordStr format .box output file");
+ BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
+ BOOL_VAR_H(textonly_pdf, false,
+ "Create PDF with only one invisible text layer");
+ INT_VAR_H(jpg_quality, 85, "Set JPEG quality level");
+ INT_VAR_H(user_defined_dpi, 0, "Specify DPI for input image");
+ INT_VAR_H(min_characters_to_try, 50,
+ "Specify minimum characters to try during OSD");
+ STRING_VAR_H(unrecognised_char, "|", "Output char for unidentified blobs");
+ INT_VAR_H(suspect_level, 99, "Suspect marker level");
+ INT_VAR_H(suspect_short_words, 2, "Don't Suspect dict wds longer than this");
+ BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
+ double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
+ double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
+ BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
+ BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
+ BOOL_VAR_H(tessedit_word_for_word, false,
+ "Make output have exactly one word per WERD");
+ BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
+ "Don't reject ANYTHING AT ALL");
+ INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
+ BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
+ BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
+ double_VAR_H(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test");
+ double_VAR_H(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test");
+ BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
+ BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
+ BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
+ BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
+ BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
+ BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
+ BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
+ BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
+ double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract");
+ INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
+ STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej");
+ STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
+ INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
+ BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
+ INT_VAR_H(tessedit_page_number, -1,
+ "-1 -> All pages, else specific page to process");
+ BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
+ BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
+ STRING_VAR_H(file_type, ".tif", "Filename extension");
+ BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
+ STRING_VAR_H(tessedit_load_sublangs, "",
+ "List of languages to load with this one");
+ BOOL_VAR_H(tessedit_use_primary_params_model, false,
+ "In multilingual mode use params model of the primary language");
+ // Min acceptable orientation margin (difference in scores between top and 2nd
+ // choice in OSResults::orientations) to believe the page orientation.
+ double_VAR_H(min_orientation_margin, 7.0,
+ "Min acceptable orientation margin");
+ BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
+ BOOL_VAR_H(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model");
+ BOOL_VAR_H(poly_allow_detailed_fx, false,
+ "Allow feature extractors to see the original outline");
+ BOOL_VAR_H(tessedit_init_config_only, false,
+ "Only initialize with the config file. Useful if the instance is "
+ "not going to be used for OCR but say only for layout analysis.");
+ BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
+ BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
+ BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
+ "Force using vertical text page mode");
+ double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
+ "Fraction of textlines deemed vertical to use vertical page "
+ "mode");
+ double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
+ "Fraction of height used as a minimum gap for aligned blobs.");
+ INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
+ BOOL_VAR_H(preserve_interword_spaces, false,
+ "Preserve multiple interword spaces");
+ STRING_VAR_H(page_separator, "\f",
+ "Page separator (default is form feed control character)");
+ INT_VAR_H(lstm_choice_mode, 0,
+ "Allows to include alternative symbols choices in the hOCR "
+ "output. "
+ "Valid input values are 0, 1 and 2. 0 is the default value. "
+ "With 1 the alternative symbol choices per timestep are included. "
+ "With 2 the alternative symbol choices are extracted from the CTC "
+ "process instead of the lattice. The choices are mapped per "
+ "character.");
+ INT_VAR_H(lstm_choice_iterations, 5,
+ "Sets the number of cascading iterations for the Beamsearch in "
+ "lstm_choice_mode. Note that lstm_choice_mode must be set to "
+ "a value greater than 0 to produce results.");
+ double_VAR_H(lstm_rating_coefficient, 5,
+ "Sets the rating coefficient for the lstm choices. The smaller "
+ "the coefficient, the better are the ratings for each choice "
+ "and less information is lost due to the cut off at 0. The "
+ "standard value is 5.");
+ BOOL_VAR_H(pageseg_apply_music_mask, true,
+ "Detect music staff and remove intersecting components");
+
+ //// ambigsrecog.cpp /////////////////////////////////////////////////////////
+ FILE* init_recog_training(const char* filename);
+ void recog_training_segmented(const char* filename, PAGE_RES* page_res,
+ volatile ETEXT_DESC* monitor,
+ FILE* output_file);
+ void ambigs_classify_and_output(const char* label, PAGE_RES_IT* pr_it,
+ FILE* output_file);
+
+ private:
+ // The filename of a backup config file. If not null, then we currently
+ // have a temporary debug config file loaded, and backup_config_file_
+ // will be loaded, and set to null when debug is complete.
+ const char* backup_config_file_;
+ // The filename of a config file to read when processing a debug word.
+ STRING word_config_;
+ // Image used for input to layout analysis and tesseract recognition.
+ // May be modified by the ShiroRekhaSplitter to eliminate the top-line.
+ Pix* pix_binary_;
+ // Grey-level input image if the input was not binary, otherwise nullptr.
+ Pix* pix_grey_;
+ // Original input image. Color if the input was color.
+ Pix* pix_original_;
+ // Thresholds that were used to generate the thresholded image from grey.
+ Pix* pix_thresholds_;
+ // Debug images. If non-empty, will be written on destruction.
+ DebugPixa pixa_debug_;
+ // Input image resolution after any scaling. The resolution is not well
+ // transmitted by operations on Pix, so we keep an independent record here.
+ int source_resolution_;
+ // The shiro-rekha splitter object which is used to split top-lines in
+ // Devanagari words to provide a better word and grapheme segmentation.
+ ShiroRekhaSplitter splitter_;
+ // Page segmentation/layout
+ Textord textord_;
+ // True if the primary language uses right_to_left reading order.
+ bool right_to_left_;
+ Pix* scaled_color_;
+ int scaled_factor_;
+ FCOORD deskew_;
+ FCOORD reskew_;
+ TesseractStats stats_;
+ // Sub-languages to be tried in addition to this.
+ std::vector<Tesseract*> sub_langs_;
+ // Most recently used Tesseract out of this and sub_langs_. The default
+ // language for the next word.
+ Tesseract* most_recently_used_;
+ // The size of the font table, ie max possible font id + 1.
+ int font_table_size_;
+ // Equation detector. Note: this pointer is NOT owned by the class.
+ EquationDetect* equ_detect_;
+ // LSTM recognizer, if available.
+ LSTMRecognizer* lstm_recognizer_;
+ // Output "page" number (actually line number) using TrainLineRecognizer.
+ int train_line_page_num_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_
diff --git a/tesseract/src/ccmain/tessvars.cpp b/tesseract/src/ccmain/tessvars.cpp
new file mode 100644
index 00000000..f72b0c27
--- /dev/null
+++ b/tesseract/src/ccmain/tessvars.cpp
@@ -0,0 +1,24 @@
+/**********************************************************************
+ * File: tessvars.cpp (Formerly tessvars.c)
+ * Description: Variables and other globals for tessedit.
+ * Author: Ray Smith
+ * Created: Mon Apr 13 13:13:23 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cstdio>
+
+#include "tessvars.h"
+
+FILE *debug_fp = stderr;  // Global destination for debug output; defaults to stderr.
diff --git a/tesseract/src/ccmain/tessvars.h b/tesseract/src/ccmain/tessvars.h
new file mode 100644
index 00000000..8c063a11
--- /dev/null
+++ b/tesseract/src/ccmain/tessvars.h
@@ -0,0 +1,27 @@
+/**********************************************************************
+ * File: tessvars.h (Formerly tessvars.h)
+ * Description: Variables and other globals for tessedit.
+ * Author: Ray Smith
+ * Created: Mon Apr 13 13:13:23 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSVARS_H
+#define TESSVARS_H
+
+#include <cstdio>
+
+extern FILE *debug_fp; // write debug stuff here
+
+#endif
diff --git a/tesseract/src/ccmain/tfacepp.cpp b/tesseract/src/ccmain/tfacepp.cpp
new file mode 100644
index 00000000..e5bbb4e4
--- /dev/null
+++ b/tesseract/src/ccmain/tfacepp.cpp
@@ -0,0 +1,322 @@
+/**********************************************************************
+ * File: tfacepp.cpp (Formerly tface++.c)
+ * Description: C++ side of the C/C++ Tess/Editor interface.
+ * Author: Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cmath>
+
+#include "blamer.h"
+#include "errcode.h"
+#include "ratngs.h"
+#include "reject.h"
+#include "tesseractclass.h"
+#include "werd.h"
+
+#define MAX_UNDIVIDED_LENGTH 24
+
+
+
+/**********************************************************************
+ * recog_word
+ *
+ * Convert the word to tess form and pass it to the tess segmenter.
+ * Convert the output back to editor form.
+ **********************************************************************/
+namespace tesseract {
+// Recognizes a single word in place: runs the (possibly recursive) segmenter
+// on word->chopped_word, verifies that the resulting best_choice matches the
+// blob segmentation, optionally overrides the permuter type with a straight
+// dictionary check, and finally sets word->tess_failed if the result is
+// empty or all-spaces.
+void Tesseract::recog_word(WERD_RES *word) {
+  // Training-only shortcut: words without ground truth can be skipped.
+  if (wordrec_skip_no_truth_words && (word->blamer_bundle == nullptr ||
+      word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
+    if (classify_debug_level) tprintf("No truth for word - skipping\n");
+    word->tess_failed = true;
+    return;
+  }
+  ASSERT_HOST(!word->chopped_word->blobs.empty());
+  recog_word_recursive(word);
+  word->SetupBoxWord();
+  // The number of unichars in the best choice must equal the number of
+  // recognized blob boxes; print diagnostics before the hard assert below.
+  if (word->best_choice->length() != word->box_word->length()) {
+    tprintf("recog_word ASSERT FAIL String:\"%s\"; "
+            "Strlen=%d; #Blobs=%d\n",
+            word->best_choice->debug_string().c_str(),
+            word->best_choice->length(), word->box_word->length());
+  }
+  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
+  // Check that the ratings matrix size matches the sum of all the
+  // segmentation states.
+  if (!word->StatesAllValid()) {
+    tprintf("Not all words have valid states relative to ratings matrix!!");
+    word->DebugWordChoices(true, nullptr);
+    ASSERT_HOST(word->StatesAllValid());
+  }
+  if (tessedit_override_permuter) {
+    /* Override the permuter type if a straight dictionary check disagrees. */
+    uint8_t perm_type = word->best_choice->permuter();
+    // Only upgrade non-dictionary permuter results; an existing dictionary
+    // permuter type is left alone.
+    if ((perm_type != SYSTEM_DAWG_PERM) &&
+        (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
+      uint8_t real_dict_perm_type = dict_word(*word->best_choice);
+      if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
+           (real_dict_perm_type == FREQ_DAWG_PERM) ||
+           (real_dict_perm_type == USER_DAWG_PERM)) &&
+          (alpha_count(word->best_choice->unichar_string().c_str(),
+                       word->best_choice->unichar_lengths().c_str()) > 0)) {
+        word->best_choice->set_permuter(real_dict_perm_type);  // use dict perm
+      }
+    }
+    if (tessedit_rejection_debug &&
+        perm_type != word->best_choice->permuter()) {
+      tprintf("Permuter Type Flipped from %d to %d\n",
+              perm_type, word->best_choice->permuter());
+    }
+  }
+  // Factored out from control.cpp
+  // An empty or all-space best choice counts as a recognition failure and
+  // rejects the whole word.
+  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
+  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
+      static_cast<int>(strspn(word->best_choice->unichar_string().c_str(),
+                              " ")) == word->best_choice->length()) {
+    word->tess_failed = true;
+    word->reject_map.initialise(word->box_word->length());
+    word->reject_map.rej_word_tess_failure();
+  } else {
+    word->tess_failed = false;
+  }
+}
+
+
+/**********************************************************************
+ * recog_word_recursive
+ *
+ * Convert the word to tess form and pass it to the tess segmenter.
+ * Convert the output back to editor form.
+ **********************************************************************/
+// Runs the core segmenter on the word, splitting it first if it has too many
+// blobs to be handled in one piece, then fixes up any length mismatch
+// between the resulting best_choice and the rebuilt blobs.
+void Tesseract::recog_word_recursive(WERD_RES *word) {
+  // Oversized words are cut at the widest gap and processed in halves.
+  const int num_chopped_blobs = word->chopped_word->NumBlobs();
+  if (num_chopped_blobs > MAX_UNDIVIDED_LENGTH) {
+    split_and_recog_word(word);
+    return;
+  }
+  cc_recog(word);
+  const int num_output_blobs = word->rebuild_word->NumBlobs();
+
+  // Sanity check: a choice with more characters than blobs should never
+  // happen; discard it so downstream asserts don't fire.
+  if (word->best_choice->length() > num_output_blobs) {
+    word->best_choice->make_bad();  // should never happen
+    tprintf("recog_word: Discarded long string \"%s\""
+            " (%d characters vs %d blobs)\n",
+            word->best_choice->unichar_string().c_str(),
+            word->best_choice->length(), num_output_blobs);
+    tprintf("Word is at:");
+    word->word->bounding_box().print();
+  }
+  // Pad a too-short choice with spaces so the lengths always agree.
+  if (word->best_choice->length() < num_output_blobs) {
+    const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
+    do {
+      word->best_choice->append_unichar_id(space_id, 1, 0.0,
+                                           word->best_choice->certainty());
+    } while (word->best_choice->length() < num_output_blobs);
+  }
+}
+
+
+/**********************************************************************
+ * split_and_recog_word
+ *
+ * Split the word into 2 smaller pieces at the largest gap.
+ * Recognize the pieces and stick the results back together.
+ **********************************************************************/
+// Splits the word in two at the widest inter-blob gap, recognizes each half
+// recursively, and joins the results back into a single word.
+void Tesseract::split_and_recog_word(WERD_RES *word) {
+  // Locate the widest horizontal gap between adjacent blobs; that is the
+  // most plausible place to cut the word.
+  int widest_gap = -INT32_MAX;
+  int split_index = 0;
+  const int num_blobs = word->chopped_word->NumBlobs();
+  for (int b = 1; b < num_blobs; ++b) {
+    const TBOX left_box = word->chopped_word->blobs[b - 1]->bounding_box();
+    const TBOX right_box = word->chopped_word->blobs[b]->bounding_box();
+    const int gap = right_box.left() - left_box.right();
+    if (gap > widest_gap) {
+      widest_gap = gap;
+      split_index = b;
+    }
+  }
+  ASSERT_HOST(split_index > 0);
+
+  // Carve off the right-hand piece (and the original blamer bundle).
+  WERD_RES *right_word = nullptr;
+  BlamerBundle *saved_bundle = nullptr;
+  split_word(word, split_index, &right_word, &saved_bundle);
+
+  // Recognize each half independently, then stitch the results together.
+  recog_word_recursive(word);
+  recog_word_recursive(right_word);
+  join_words(word, right_word, saved_bundle);
+}
+
+
+/**********************************************************************
+ * split_word
+ *
+ * Split a given WERD_RES in place into two smaller words for recognition.
+ * split_pt is the index of the first blob to go in the second word.
+ * The underlying word is left alone, only the TWERD (and subsequent data)
+ * are split up. orig_blamer_bundle is set to the original blamer bundle,
+ * and will now be owned by the caller. New blamer bundles are forged for the
+ * two pieces.
+ **********************************************************************/
+/**********************************************************************
+ * split_word
+ *
+ * Split a given WERD_RES in place into two smaller words for recognition.
+ * split_pt is the index of the first blob to go in the second word.
+ * The underlying word is left alone, only the TWERD (and subsequent data)
+ * are split up. orig_blamer_bundle is set to the original blamer bundle,
+ * and will now be owned by the caller. New blamer bundles are forged for the
+ * two pieces.
+ **********************************************************************/
+void Tesseract::split_word(WERD_RES *word,
+                           int split_pt,
+                           WERD_RES **right_piece,
+                           BlamerBundle **orig_blamer_bundle) const {
+  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
+
+  // Save a copy of the blamer bundle so we can try to reconstruct it below.
+  BlamerBundle *orig_bb =
+      word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
+
+  auto *word2 = new WERD_RES(*word);
+
+  // blow away the copied chopped_word, as we want to work with
+  // the blobs from the input chopped_word so seam_arrays can be merged.
+  // The blobs at index >= split_pt are moved (by pointer, not copied) into
+  // chopped2; word keeps the first split_pt blobs.
+  TWERD *chopped = word->chopped_word;
+  auto *chopped2 = new TWERD;
+  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
+  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
+    chopped2->blobs.push_back(chopped->blobs[i]);
+  }
+  chopped->blobs.truncate(split_pt);
+  // Detach chopped_word from both words before ClearResults() below so the
+  // shared blobs are not freed; word2's copied TWERD is no longer needed.
+  word->chopped_word = nullptr;
+  delete word2->chopped_word;
+  word2->chopped_word = nullptr;
+
+  const UNICHARSET &unicharset = *word->uch_set;
+  word->ClearResults();
+  word2->ClearResults();
+  // Re-attach the split TWERDs and rebuild the derived data for each half.
+  word->chopped_word = chopped;
+  word2->chopped_word = chopped2;
+  word->SetupBasicsFromChoppedWord(unicharset);
+  word2->SetupBasicsFromChoppedWord(unicharset);
+
+  // Try to adjust the blamer bundle.
+  if (orig_bb != nullptr) {
+    // TODO(rays) Looks like a leak to me.
+    // orig_bb should take, rather than copy.
+    word->blamer_bundle = new BlamerBundle();
+    word2->blamer_bundle = new BlamerBundle();
+    // Split the truth data at the midpoint of the gap between the halves.
+    orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
+                         word2->chopped_word->blobs[0]->bounding_box().left(),
+                         wordrec_debug_blamer,
+                         word->blamer_bundle, word2->blamer_bundle);
+  }
+
+  // Ownership of word2 and orig_bb passes to the caller (see join_words).
+  *right_piece = word2;
+  *orig_blamer_bundle = orig_bb;
+}
+
+
+/**********************************************************************
+ * join_words
+ *
+ * The opposite of split_word():
+ * join word2 (including any recognized data / seam array / etc)
+ * onto the right of word and then delete word2.
+ * Also, if orig_bb is provided, stitch it back into word.
+ **********************************************************************/
+/**********************************************************************
+ * join_words
+ *
+ * The opposite of split_word():
+ * join word2 (including any recognized data / seam array / etc)
+ * onto the right of word and then delete word2.
+ * Also, if orig_bb is provided, stitch it back into word.
+ **********************************************************************/
+void Tesseract::join_words(WERD_RES *word,
+                           WERD_RES *word2,
+                           BlamerBundle *orig_bb) const {
+  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
+  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
+  // Tack the word2 outputs onto the end of the word outputs.
+  // Blob pointers are transferred, so word2's lists are cleared afterwards
+  // to prevent double-deletion when word2 is deleted below.
+  word->chopped_word->blobs += word2->chopped_word->blobs;
+  word->rebuild_word->blobs += word2->rebuild_word->blobs;
+  word2->chopped_word->blobs.clear();
+  word2->rebuild_word->blobs.clear();
+  // The artificial seam between the halves sits midway in the gap.
+  TPOINT split_pt;
+  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
+  split_pt.y = (prev_box.top() + prev_box.bottom() +
+                blob_box.top() + blob_box.bottom()) / 4;
+  // Move the word2 seams onto the end of the word1 seam_array.
+  // Since the seam list is one element short, an empty seam marking the
+  // end of the last blob in the first word is needed first.
+  word->seam_array.push_back(new SEAM(0.0f, split_pt));
+  word->seam_array += word2->seam_array;
+  word2->seam_array.truncate(0);
+  // Fix widths and gaps.
+  word->blob_widths += word2->blob_widths;
+  word->blob_gaps += word2->blob_gaps;
+  // Fix the ratings matrix.
+  int rat1 = word->ratings->dimension();
+  int rat2 = word2->ratings->dimension();
+  word->ratings->AttachOnCorner(word2->ratings);
+  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
+  word->best_state += word2->best_state;
+  // Append the word choices.
+  *word->raw_choice += *word2->raw_choice;
+
+  // How many alt choices from each should we try to get?
+  const int kAltsPerPiece = 2;
+  // When do we start throwing away extra alt choices?
+  const int kTooManyAltChoices = 100;
+
+  // Construct the cartesian product of the best_choices of word(1) and word2.
+  WERD_CHOICE_LIST joined_choices;
+  WERD_CHOICE_IT jc_it(&joined_choices);
+  WERD_CHOICE_IT bc1_it(&word->best_choices);
+  WERD_CHOICE_IT bc2_it(&word2->best_choices);
+  int num_word1_choices = word->best_choices.length();
+  int total_joined_choices = num_word1_choices;
+  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
+  // word2 choices, and put them in the joined_choices list. The 1st word2
+  // choice gets added to the original word1 choices in-place after we have
+  // finished with them.
+  int bc2_index = 1;
+  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
+    // Once there are too many alternatives, only combine the top few of each.
+    if (total_joined_choices >= kTooManyAltChoices &&
+        bc2_index > kAltsPerPiece)
+      break;
+    int bc1_index = 0;
+    for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
+         ++bc1_index, bc1_it.forward()) {
+      if (total_joined_choices >= kTooManyAltChoices &&
+          bc1_index > kAltsPerPiece)
+        break;
+      // Concatenate one word1 choice with one word2 choice.
+      auto *wc = new WERD_CHOICE(*bc1_it.data());
+      *wc += *bc2_it.data();
+      jc_it.add_after_then_move(wc);
+      ++total_joined_choices;
+    }
+  }
+  // Now that we've filled in as many alternates as we want, paste the best
+  // choice for word2 onto the original word alt_choices.
+  bc1_it.move_to_first();
+  bc2_it.move_to_first();
+  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
+    *bc1_it.data() += *bc2_it.data();
+  }
+  bc1_it.move_to_last();
+  bc1_it.add_list_after(&joined_choices);
+
+  // Restore the pointer to original blamer bundle and combine blamer
+  // information recorded in the splits.
+  if (orig_bb != nullptr) {
+    orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
+                        wordrec_debug_blamer);
+    delete word->blamer_bundle;
+    word->blamer_bundle = orig_bb;
+  }
+  // Rebuild the box_word and reject map for the now-combined word.
+  word->SetupBoxWord();
+  word->reject_map.initialise(word->box_word->length());
+  delete word2;
+}
+
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/thresholder.cpp b/tesseract/src/ccmain/thresholder.cpp
new file mode 100644
index 00000000..e3934ea6
--- /dev/null
+++ b/tesseract/src/ccmain/thresholder.cpp
@@ -0,0 +1,334 @@
+///////////////////////////////////////////////////////////////////////
+// File: thresholder.cpp
+// Description: Base API for thresholding images in tesseract.
+// Author: Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "allheaders.h"
+
+#include <tesseract/thresholder.h>
+
+#include <cstdint> // for uint32_t
+#include <cstring>
+
+#include "otsuthr.h"
+#include "tprintf.h" // for tprintf
+
+#if defined(USE_OPENCL)
+#include "openclwrapper.h" // for OpenclDevice
+#endif
+
+namespace tesseract {
+
+// Constructs an empty thresholder: no image, unit scale, and a default
+// resolution of 300 dpi until a real image supplies one.
+ImageThresholder::ImageThresholder()
+  : pix_(nullptr),
+    image_width_(0), image_height_(0),
+    pix_channels_(0), pix_wpl_(0),
+    scale_(1), yres_(300), estimated_res_(300) {
+  // Start with an empty processing rectangle; SetImage()/Init() set the
+  // real one.
+  SetRectangle(0, 0, 0, 0);
+}
+
+// Destructor releases the owned Pix (if any) via Clear().
+ImageThresholder::~ImageThresholder() {
+  Clear();
+}
+
+// Destroy the Pix if there is one, freeing memory.
+// Safe to call when no image is set (pixDestroy handles nullptr).
+void ImageThresholder::Clear() {
+  pixDestroy(&pix_);
+}
+
+// Return true if no image has been set.
+bool ImageThresholder::IsEmpty() const {
+  return pix_ == nullptr;
+}
+
+// SetImage makes a copy of all the image data, so it may be deleted
+// immediately after this call.
+// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
+// Palette color images will not work properly and must be converted to
+// 24 bit.
+// Binary images of 1 bit per pixel may also be given but they must be
+// byte packed with the MSB of the first byte being the first pixel, and a
+// one pixel is WHITE. For binary images set bytes_per_pixel=0.
+void ImageThresholder::SetImage(const unsigned char* imagedata,
+                                int width, int height,
+                                int bytes_per_pixel, int bytes_per_line) {
+  int bpp = bytes_per_pixel * 8;
+  if (bpp == 0) bpp = 1;
+  // 24bpp input is stored as a 32bpp Pix (Leptonica has no 24bpp format).
+  Pix* pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
+  l_uint32* data = pixGetData(pix);
+  int wpl = pixGetWpl(pix);
+  switch (bpp) {
+  case 1:
+    // Input uses 1 = WHITE; Pix uses 1 = black, so invert each bit.
+    for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
+      for (int x = 0; x < width; ++x) {
+        if (imagedata[x / 8] & (0x80 >> (x % 8)))
+          CLEAR_DATA_BIT(data, x);
+        else
+          SET_DATA_BIT(data, x);
+      }
+    }
+    break;
+
+  case 8:
+    // Greyscale just copies the bytes in the right order.
+    for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
+      for (int x = 0; x < width; ++x)
+        SET_DATA_BYTE(data, x, imagedata[x]);
+    }
+    break;
+
+  case 24:
+    // Put the colors in the correct places in the line buffer.
+    // data advances one word per pixel; for a 32bpp Pix wpl == width, so no
+    // separate per-row advance is needed.
+    for (int y = 0; y < height; ++y, imagedata += bytes_per_line) {
+      for (int x = 0; x < width; ++x, ++data) {
+        SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
+        SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
+        SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
+      }
+    }
+    break;
+
+  case 32:
+    // Maintain byte order consistency across different endianness.
+    for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) {
+      for (int x = 0; x < width; ++x) {
+        data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |
+                  (imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];
+      }
+    }
+    break;
+
+  default:
+    tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp);
+  }
+  // Delegate to the Pix overload, which copies/normalizes; the temporary
+  // Pix is then destroyed here.
+  SetImage(pix);
+  pixDestroy(&pix);
+}
+
+// Store the coordinates of the rectangle to process for later use.
+// Doesn't actually do any thresholding.
+// Coordinates are top-down, relative to the full image.
+void ImageThresholder::SetRectangle(int left, int top, int width, int height) {
+  rect_left_ = left;
+  rect_top_ = top;
+  rect_width_ = width;
+  rect_height_ = height;
+}
+
+// Get enough parameters to be able to rebuild bounding boxes in the
+// original image (not just within the rectangle).
+// Left and top are enough with top-down coordinates, but
+// the height of the rectangle and the image are needed for bottom-up.
+// All six output pointers must be non-null.
+void ImageThresholder::GetImageSizes(int* left, int* top,
+                                     int* width, int* height,
+                                     int* imagewidth, int* imageheight) {
+  *left = rect_left_;
+  *top = rect_top_;
+  *width = rect_width_;
+  *height = rect_height_;
+  *imagewidth = image_width_;
+  *imageheight = image_height_;
+}
+
+// Pix vs raw, which to use? Pix is the preferred input for efficiency,
+// since raw buffers are copied.
+// SetImage for Pix clones its input, so the source pix may be pixDestroyed
+// immediately after, but may not go away until after the Thresholder has
+// finished with it.
+void ImageThresholder::SetImage(const Pix* pix) {
+  // Release any previously held image first.
+  if (pix_ != nullptr)
+    pixDestroy(&pix_);
+  // Leptonica's API is not const-correct, hence the const_cast; src is
+  // never modified below, only read/copied.
+  Pix* src = const_cast<Pix*>(pix);
+  int depth;
+  pixGetDimensions(src, &image_width_, &image_height_, &depth);
+  // Convert the image as necessary so it is one of binary, plain RGB, or
+  // 8 bit with no colormap. Guarantee that we always end up with our own copy,
+  // not just a clone of the input.
+  if (pixGetColormap(src)) {
+    // pixRemoveColormap returns a new Pix, so ownership is ours either way.
+    Pix* tmp = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC);
+    depth = pixGetDepth(tmp);
+    if (depth > 1 && depth < 8) {
+      pix_ = pixConvertTo8(tmp, false);
+      pixDestroy(&tmp);
+    } else {
+      pix_ = tmp;
+    }
+  } else if (depth > 1 && depth < 8) {
+    pix_ = pixConvertTo8(src, false);
+  } else {
+    pix_ = pixCopy(nullptr, src);
+  }
+  // Cache the derived layout info used by the thresholding loops.
+  depth = pixGetDepth(pix_);
+  pix_channels_ = depth / 8;  // 0 for binary, 1 for grey, 4 for RGB(A).
+  pix_wpl_ = pixGetWpl(pix_);
+  scale_ = 1;
+  estimated_res_ = yres_ = pixGetYRes(pix_);
+  Init();
+}
+
+// Threshold the source image as efficiently as possible to the output Pix.
+// Creates a Pix and sets pix to point to the resulting pointer.
+// Caller must use pixDestroy to free the created Pix.
+/// Returns false on error.
+bool ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Pix** pix) {
+  // Downstream coordinates are stored in 16-bit fields, so reject images
+  // that would overflow them.
+  if (image_width_ > INT16_MAX || image_height_ > INT16_MAX) {
+    tprintf("Image too large: (%d, %d)\n", image_width_, image_height_);
+    return false;
+  }
+  if (pix_channels_ == 0) {
+    // We have a binary image, but it still has to be copied, as this API
+    // allows the caller to modify the output.
+    Pix* original = GetPixRect();
+    *pix = pixCopy(nullptr, original);
+    pixDestroy(&original);
+  } else {
+    // Grey or color: compute per-channel Otsu thresholds and binarize.
+    OtsuThresholdRectToPix(pix_, pix);
+  }
+  return true;
+}
+
+// Gets a pix that contains an 8 bit threshold value at each pixel. The
+// returned pix may be an integer reduction of the binary image such that
+// the scale factor may be inferred from the ratio of the sizes, even down
+// to the extreme of a 1x1 pixel thresholds image.
+// Ideally the 8 bit threshold should be the exact threshold used to generate
+// the binary image in ThresholdToPix, but this is not a hard constraint.
+// Returns nullptr if the input is binary. PixDestroy after use.
+Pix* ImageThresholder::GetPixRectThresholds() {
+  if (IsBinary()) return nullptr;
+  Pix* pix_grey = GetPixRectGrey();
+  int width = pixGetWidth(pix_grey);
+  int height = pixGetHeight(pix_grey);
+  // OtsuThreshold allocates the thresholds/hi_values arrays with new[];
+  // they are freed below with delete[].
+  int* thresholds;
+  int* hi_values;
+  OtsuThreshold(pix_grey, 0, 0, width, height, &thresholds, &hi_values);
+  pixDestroy(&pix_grey);
+  // Here a single global threshold is used for the whole rectangle,
+  // falling back to 128 if Otsu produced no usable value.
+  Pix* pix_thresholds = pixCreate(width, height, 8);
+  int threshold = thresholds[0] > 0 ? thresholds[0] : 128;
+  pixSetAllArbitrary(pix_thresholds, threshold);
+  delete [] thresholds;
+  delete [] hi_values;
+  return pix_thresholds;
+}
+
+// Common initialization shared between SetImage methods.
+// Resets the processing rectangle to cover the entire image.
+void ImageThresholder::Init() {
+  SetRectangle(0, 0, image_width_, image_height_);
+}
+
+// Get a clone/copy of the source image rectangle.
+// The returned Pix must be pixDestroyed.
+// This function will be used in the future by the page layout analysis, and
+// the layout analysis that uses it will only be available with Leptonica,
+// so there is no raw equivalent.
+Pix* ImageThresholder::GetPixRect() {
+  // A full-image rectangle needs no cropping - a cheap clone suffices.
+  if (IsFullImage()) {
+    return pixClone(pix_);
+  }
+  // Otherwise clip the stored rectangle out of the source image.
+  Box* clip_box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);
+  Pix* rect_pix = pixClipRectangle(pix_, clip_box, nullptr);
+  boxDestroy(&clip_box);
+  return rect_pix;
+}
+
+// Get a clone/copy of the source image rectangle, reduced to greyscale,
+// and at the same resolution as the output binary.
+// The returned Pix must be pixDestroyed.
+// Provided to the classifier to extract features from the greyscale image.
+Pix* ImageThresholder::GetPixRectGrey() {
+  auto pix = GetPixRect();  // May have to be reduced to grey.
+  int depth = pixGetDepth(pix);
+  if (depth != 8) {
+    // 24bpp must go through 32bpp before an 8-bit conversion is possible.
+    if (depth == 24) {
+      auto tmp = pixConvert24To32(pix);
+      pixDestroy(&pix);
+      pix = tmp;
+    }
+    auto result = pixConvertTo8(pix, false);
+    pixDestroy(&pix);
+    return result;
+  }
+  // Already 8-bit grey: return the rectangle as-is.
+  return pix;
+}
+
+// Otsu thresholds the rectangle, taking the rectangle from *this.
+// Computes per-channel thresholds and hi_values and then binarizes either
+// on the OpenCL device (when compiled in and selected) or on the CPU.
+void ImageThresholder::OtsuThresholdRectToPix(Pix* src_pix,
+                                              Pix** out_pix) const {
+  // OtsuThreshold allocates these with new[]; freed below with delete[].
+  int* thresholds;
+  int* hi_values;
+
+  int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_,
+                                   rect_height_, &thresholds, &hi_values);
+  // only use opencl if compiled w/ OpenCL and selected device is opencl
+#ifdef USE_OPENCL
+  // The OpenCL path only supports 4-channel full-left/top rectangles.
+  OpenclDevice od;
+  if (num_channels == 4 &&
+      od.selectedDeviceIsOpenCL() && rect_top_ == 0 && rect_left_ == 0) {
+    od.ThresholdRectToPixOCL((unsigned char*)pixGetData(src_pix), num_channels,
+                             pixGetWpl(src_pix) * 4, thresholds, hi_values,
+                             out_pix /*pix_OCL*/, rect_height_, rect_width_,
+                             rect_top_, rect_left_);
+  } else {
+#endif
+    ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix);
+#ifdef USE_OPENCL
+  }
+#endif
+  delete [] thresholds;
+  delete [] hi_values;
+}
+
+/// Threshold the rectangle, taking everything except the src_pix
+/// from the class, using thresholds/hi_values to the output pix.
+/// NOTE that num_channels is the size of the thresholds and hi_values
+// arrays and also the bytes per pixel in src_pix.
+/// A pixel is white only if every enabled channel agrees it is white;
+/// hi_values[ch] < 0 disables channel ch, hi_values[ch] == 0 inverts it.
+void ImageThresholder::ThresholdRectToPix(Pix* src_pix,
+                                          int num_channels,
+                                          const int* thresholds,
+                                          const int* hi_values,
+                                          Pix** pix) const {
+  *pix = pixCreate(rect_width_, rect_height_, 1);
+  uint32_t* pixdata = pixGetData(*pix);
+  int wpl = pixGetWpl(*pix);
+  int src_wpl = pixGetWpl(src_pix);
+  uint32_t* srcdata = pixGetData(src_pix);
+  // Propagate resolution so downstream consumers see the source dpi.
+  pixSetXRes(*pix, pixGetXRes(src_pix));
+  pixSetYRes(*pix, pixGetYRes(src_pix));
+  for (int y = 0; y < rect_height_; ++y) {
+    const uint32_t* linedata = srcdata + (y + rect_top_) * src_wpl;
+    uint32_t* pixline = pixdata + y * wpl;
+    for (int x = 0; x < rect_width_; ++x) {
+      bool white_result = true;
+      for (int ch = 0; ch < num_channels; ++ch) {
+        // Channel bytes are packed num_channels per pixel within the line.
+        int pixel =
+            GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch);
+        if (hi_values[ch] >= 0 &&
+            (pixel > thresholds[ch]) == (hi_values[ch] == 0)) {
+          white_result = false;
+          break;
+        }
+      }
+      // Pix convention: cleared bit = white, set bit = black.
+      if (white_result)
+        CLEAR_DATA_BIT(pixline, x);
+      else
+        SET_DATA_BIT(pixline, x);
+    }
+  }
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/werdit.cpp b/tesseract/src/ccmain/werdit.cpp
new file mode 100644
index 00000000..17834023
--- /dev/null
+++ b/tesseract/src/ccmain/werdit.cpp
@@ -0,0 +1,68 @@
+/**********************************************************************
+ * File: werdit.cpp (Formerly wordit.c)
+ * Description: An iterator for passing over all the words in a document.
+ * Author: Ray Smith
+ * Created: Mon Apr 27 08:51:22 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "werdit.h"
+
+#include "errcode.h" // for ASSERT_HOST
+#include "pageres.h" // for PAGE_RES_IT, PAGE_RES (ptr only), WERD_RES
+#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
+#include "werd.h" // for WERD
+
+namespace tesseract {
+
+/**********************************************************************
+ * make_pseudo_word
+ *
+ * Make all the blobs inside a selection into a single word.
+ * The returned PAGE_RES_IT* it points to the new word. After use, call
+ * it->DeleteCurrentWord() to delete the fake word, and then
+ * delete it to get rid of the iterator itself.
+ **********************************************************************/
+
+PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
+  PAGE_RES_IT pr_it(page_res);
+  C_BLOB_LIST new_blobs;               // list of gathered blobs
+  C_BLOB_IT new_blob_it = &new_blobs;  // iterator
+
+  // Walk every word on the page, collecting deep copies of all blobs that
+  // overlap the selection box.
+  for (WERD_RES* word_res = pr_it.word(); word_res != nullptr;
+       word_res = pr_it.forward()) {
+    WERD* word = word_res->word;
+    if (word->bounding_box().overlap(selection_box)) {
+      C_BLOB_IT blob_it(word->cblob_list());
+      for (blob_it.mark_cycle_pt();
+           !blob_it.cycled_list(); blob_it.forward()) {
+        C_BLOB* blob = blob_it.data();
+        if (blob->bounding_box().overlap(selection_box)) {
+          new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
+        }
+      }
+      // As soon as any blobs have been gathered, insert the fake word next
+      // to the current word and return an iterator positioned at it.
+      // NOTE(review): blobs from words later on the page are thus never
+      // collected once the first overlapping word yields blobs.
+      if (!new_blobs.empty()) {
+        WERD* pseudo_word = new WERD(&new_blobs, 1, nullptr);
+        word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
+        // Build a fresh iterator and advance it to the inserted word so the
+        // caller receives an iterator it fully owns.
+        auto* it = new PAGE_RES_IT(page_res);
+        while (it->word() != word_res && it->word() != nullptr) it->forward();
+        ASSERT_HOST(it->word() == word_res);
+        return it;
+      }
+    }
+  }
+  // Nothing overlapped the selection.
+  return nullptr;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/werdit.h b/tesseract/src/ccmain/werdit.h
new file mode 100644
index 00000000..b49bda29
--- /dev/null
+++ b/tesseract/src/ccmain/werdit.h
@@ -0,0 +1,34 @@
+/**********************************************************************
+ * File: wordit.h
+ * Description: An iterator for passing over all the words in a document.
+ * Author: Ray Smith
+ * Created: Mon Apr 27 08:51:22 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef WERDIT_H
+#define WERDIT_H
+
+#include "rect.h" // for TBOX
+
+namespace tesseract {
+
+class PAGE_RES;
+class PAGE_RES_IT;
+
+// Gathers all blobs overlapping selection_box into a single fake word and
+// returns a heap-allocated iterator positioned at it, or nullptr if nothing
+// overlaps. The caller must DeleteCurrentWord() on the iterator to remove
+// the fake word, then delete the iterator itself.
+PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box);
+
+} // namespace tesseract
+
+#endif