summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Deutschmann <whissi@gentoo.org>2021-03-30 10:59:39 +0200
committerThomas Deutschmann <whissi@gentoo.org>2021-04-01 00:04:14 +0200
commit5ff1d6955496b3cf9a35042c9ac35db43bc336b1 (patch)
tree6d470f7eb448f59f53e8df1010aec9dad8ce1f72 /tesseract/src/dict
parentImport Ghostscript 9.53.1 (diff)
downloadghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.tar.gz
ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.tar.bz2
ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.zip
Import Ghostscript 9.54ghostscript-9.54
Signed-off-by: Thomas Deutschmann <whissi@gentoo.org>
Diffstat (limited to 'tesseract/src/dict')
-rw-r--r--tesseract/src/dict/context.cpp76
-rw-r--r--tesseract/src/dict/dawg.cpp417
-rw-r--r--tesseract/src/dict/dawg.h554
-rw-r--r--tesseract/src/dict/dawg_cache.cpp96
-rw-r--r--tesseract/src/dict/dawg_cache.h53
-rw-r--r--tesseract/src/dict/dict.cpp888
-rw-r--r--tesseract/src/dict/dict.h651
-rw-r--r--tesseract/src/dict/hyphen.cpp61
-rw-r--r--tesseract/src/dict/matchdefs.h50
-rw-r--r--tesseract/src/dict/permdawg.cpp390
-rw-r--r--tesseract/src/dict/stopper.cpp515
-rw-r--r--tesseract/src/dict/stopper.h50
-rw-r--r--tesseract/src/dict/trie.cpp716
-rw-r--r--tesseract/src/dict/trie.h432
14 files changed, 4949 insertions, 0 deletions
diff --git a/tesseract/src/dict/context.cpp b/tesseract/src/dict/context.cpp
new file mode 100644
index 00000000..93cff5ff
--- /dev/null
+++ b/tesseract/src/dict/context.cpp
@@ -0,0 +1,76 @@
+/******************************************************************************
+ *
+ * File: context.cpp (Formerly context.c)
+ * Description: Context checking functions
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1990, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+#include "dict.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+// Thresholds used by Dict::absolute_garbage() below: minimum word length
+// before the garbage test applies, and the alphanumeric fraction under
+// which the word is deemed garbage.
+static const int kMinAbsoluteGarbageWordLength = 10;
+static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
+
+// State machine for validating the capitalization pattern of a word,
+// used by Dict::case_ok(). Rows are the current state (0..5); columns
+// are the class of the next character: P (other), U (upper-case),
+// L (lower-case), D (digit). An entry of -1 marks an invalid pattern.
+const int case_state_table[6][4] = {
+  {/* 0. Beginning of word */
+  /* P U L D */
+  /* -1. Error on case */
+  0, 1, 5, 4},
+  {/* 1. After initial capital */
+  0, 3, 2, 4},
+  {/* 2. After lower case */
+  0, -1, 2, -1},
+  {/* 3. After upper case */
+  0, 3, -1, 4},
+  {/* 4. After a digit */
+  0, -1, -1, 4},
+  {/* 5. After initial lower case */
+  5, -1, 2, -1},
+};
+
+// Checks that the capitalization pattern of the word is plausible by
+// feeding each character's case class through case_state_table.
+// Returns a boolean value (despite the int return type): false if the
+// state machine reaches -1 or ends in state 5 (a single initial
+// lower-case letter), true otherwise.
+int Dict::case_ok(const WERD_CHOICE &word) const {
+  int state = 0;
+  int x;
+  const UNICHARSET* unicharset = word.unicharset();
+  for (x = 0; x < word.length(); ++x) {
+    UNICHAR_ID ch_id = word.unichar_id(x);
+    // Advance the state machine on the character class: U, L, D or other.
+    if (unicharset->get_isupper(ch_id))
+      state = case_state_table[state][1];
+    else if (unicharset->get_islower(ch_id))
+      state = case_state_table[state][2];
+    else if (unicharset->get_isdigit(ch_id))
+      state = case_state_table[state][3];
+    else
+      state = case_state_table[state][0];
+    if (state == -1) return false;
+  }
+  return state != 5; // single lower is bad
+}
+
+// Returns true if the word looks like absolute garbage: it is at least
+// kMinAbsoluteGarbageWordLength unichars long and fewer than
+// kMinAbsoluteGarbageAlphanumFrac of its characters are alphanumeric.
+bool Dict::absolute_garbage(const WERD_CHOICE &word,
+                            const UNICHARSET &unicharset) {
+  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
+  int num_alphanum = 0;
+  // Count characters that are either alphabetic or digits.
+  for (int x = 0; x < word.length(); ++x) {
+    num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
+                     unicharset.get_isdigit(word.unichar_id(x)));
+  }
+  return (static_cast<float>(num_alphanum) /
+          static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/dawg.cpp b/tesseract/src/dict/dawg.cpp
new file mode 100644
index 00000000..aa339727
--- /dev/null
+++ b/tesseract/src/dict/dawg.cpp
@@ -0,0 +1,417 @@
+/********************************************************************************
+ *
+ * File: dawg.cpp (Formerly dawg.c)
+ * Description: Use a Directed Acyclic Word Graph
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *********************************************************************************/
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include "dawg.h"
+
+#include "dict.h"
+#include "helpers.h"
+#include "strngs.h"
+#include "tprintf.h"
+
+#include <memory>
+
+/*----------------------------------------------------------------------
+ F u n c t i o n s f o r D a w g
+----------------------------------------------------------------------*/
+namespace tesseract {
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+// (Dawg is an abstract interface; see the pure virtuals in dawg.h.)
+Dawg::~Dawg() = default;
+
+// Returns true if the given word prefix is not contraindicated by this
+// dawg: each unichar in turn must have an outgoing edge from the current
+// node. If requires_complete is true the last edge must additionally be
+// marked as a word end (i.e. the exact word must be present).
+bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
+                          bool requires_complete) const {
+  if (word.length() == 0) return !requires_complete;
+  NODE_REF node = 0;
+  int end_index = word.length() - 1;
+  // Follow an edge for every unichar except the last one.
+  for (int i = 0; i < end_index; i++) {
+    EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
+    if (edge == NO_EDGE) {
+      return false;
+    }
+    if ((node = next_node(edge)) == 0) {
+      // This only happens if all words following this edge terminate --
+      // there are no larger words. See Trie::add_word_to_dawg()
+      return false;
+    }
+  }
+  // Now check the last character.
+  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
+      NO_EDGE;
+}
+
+// Returns true if the exact complete word is present in the Dawg.
+bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
+  return prefix_in_dawg(word, true);
+}
+
+// Checks the Dawg for the words listed (one per line) in the given file.
+// Returns the number of words from the file that are missing from the
+// Dawg. If enable_wildcard is true, occurrences of kWildcard in a word
+// are matched against every outgoing edge (see match_words()).
+int Dawg::check_for_words(const char *filename,
+                          const UNICHARSET &unicharset,
+                          bool enable_wildcard) const {
+  if (filename == nullptr) return 0;
+
+  FILE *word_file;
+  char string [CHARS_PER_LINE];
+  int misses = 0;
+  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
+
+  word_file = fopen(filename, "r");
+  if (word_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", filename);
+    // word_file is null here, so this assertion is guaranteed to trip;
+    // the tprintf above makes sure the filename is logged first.
+    ASSERT_HOST(word_file);
+  }
+
+  while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
+    chomp_string(string); // remove newline
+    WERD_CHOICE word(string, unicharset);
+    if (word.length() > 0 &&
+        !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
+      if (!match_words(&word, 0, 0,
+                       enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
+        tprintf("Missing word: %s\n", string);
+        ++misses;
+      }
+    } else {
+      tprintf("Failed to create a valid word from %s\n", string);
+    }
+  }
+  fclose (word_file);
+  // Summarize the number of missing words when debug output is enabled.
+  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
+  return misses;
+}
+
+// Applies the given callback to every word in the Dawg, passed as a
+// WERD_CHOICE. Words are enumerated depth-first from the root node
+// (see iterate_words_rec()).
+void Dawg::iterate_words(const UNICHARSET& unicharset,
+                         std::function<void(const WERD_CHOICE*)> cb) const {
+  WERD_CHOICE word(&unicharset);
+  iterate_words_rec(word, 0, cb);
+}
+
+// Adapter shim: converts the WERD_CHOICE to its UTF-8 text form and
+// passes the resulting C string to the callback.
+static void CallWithUTF8(std::function<void(const char*)> cb,
+                         const WERD_CHOICE* wc) {
+  STRING s;
+  wc->string_and_lengths(&s, nullptr);
+  cb(s.c_str());
+}
+
+// Overload of iterate_words() that hands each word to the callback as a
+// UTF-8 C string, by routing the WERD_CHOICE through CallWithUTF8().
+void Dawg::iterate_words(const UNICHARSET& unicharset,
+                         std::function<void(const char*)> cb) const {
+  using namespace std::placeholders; // for _1
+  std::function<void(const WERD_CHOICE*)> shim(
+      std::bind(CallWithUTF8, cb, _1));
+  WERD_CHOICE word(&unicharset);
+  iterate_words_rec(word, 0, shim);
+}
+
+// Recursive word enumeration: for each child edge of to_explore, appends
+// the child's unichar to a copy of word_so_far, invokes cb if the edge
+// ends a word, then recurses into the child's target node (if non-zero).
+void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
+                             NODE_REF to_explore,
+                             std::function<void(const WERD_CHOICE*)> cb) const {
+  NodeChildVector children;
+  this->unichar_ids_of(to_explore, &children, false);
+  for (int i = 0; i < children.size(); i++) {
+    WERD_CHOICE next_word(word_so_far);
+    next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
+    if (this->end_of_word(children[i].edge_ref)) {
+      cb(&next_word);
+    }
+    NODE_REF next = next_node(children[i].edge_ref);
+    if (next != 0) {
+      iterate_words_rec(next_word, next, cb);
+    }
+  }
+}
+
+// Matches the word against the Dawg starting from the given node/index.
+// If wildcard is a valid unichar id, each occurrence of it in the word
+// is expanded to every unichar leaving the current node; the wildcard is
+// written back into the word afterwards so the caller's word is left
+// unchanged. Returns true if any expansion of the word matches.
+bool Dawg::match_words(WERD_CHOICE *word, int32_t index,
+                       NODE_REF node, UNICHAR_ID wildcard) const {
+  EDGE_REF edge;
+  int32_t word_end;
+
+  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
+    bool any_matched = false;
+    NodeChildVector vec;
+    this->unichar_ids_of(node, &vec, false);
+    // Try every unichar leaving this node in place of the wildcard.
+    for (int i = 0; i < vec.size(); ++i) {
+      word->set_unichar_id(vec[i].unichar_id, index);
+      if (match_words(word, index, node, wildcard))
+        any_matched = true;
+    }
+    // Restore the wildcard so the caller's word is unmodified.
+    word->set_unichar_id(wildcard, index);
+    return any_matched;
+  } else {
+    word_end = index == word->length() - 1;
+    edge = edge_char_of(node, word->unichar_id(index), word_end);
+    if (edge != NO_EDGE) { // normal edge in DAWG
+      node = next_node(edge);
+      if (word_end) {
+        if (debug_level_ > 1) word->print("match_words() found: ");
+        return true;
+      } else if (node != 0) {
+        return match_words(word, index+1, node, wildcard);
+      }
+    }
+  }
+  return false;
+}
+
+// Records the unicharset size and derives the EDGE_RECORD bit layout:
+// the low flag_start_bit_ bits hold the letter, the next NUM_FLAG_BITS
+// hold the flags, and the remaining high bits hold the next-node ref.
+void Dawg::init(int unicharset_size) {
+  ASSERT_HOST(unicharset_size > 0);
+  unicharset_size_ = unicharset_size;
+  // Set bit masks. We will use the value unicharset_size_ as a null char, so
+  // the actual number of unichars is unicharset_size_ + 1.
+  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
+  next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;
+  letter_mask_ = ~(~0ull << flag_start_bit_);
+  next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);
+  flags_mask_ = ~(letter_mask_ | next_node_mask_);
+}
+
+
+/*----------------------------------------------------------------------
+ F u n c t i o n s f o r S q u i s h e d D a w g
+----------------------------------------------------------------------*/
+
+// Releases the edge array owned by this SquishedDawg.
+SquishedDawg::~SquishedDawg() { delete[] edges_; }
+
+// Returns the edge out of the given node labeled with unichar_id (and,
+// if word_end is true, marked as a word end), or NO_EDGE if none exists.
+// Node 0's forward edges are kept sorted, so they are binary-searched;
+// any other node is scanned linearly up to its last edge.
+EDGE_REF SquishedDawg::edge_char_of(NODE_REF node,
+                                    UNICHAR_ID unichar_id,
+                                    bool word_end) const {
+  EDGE_REF edge = node;
+  if (node == 0) { // binary search
+    EDGE_REF start = 0;
+    EDGE_REF end = num_forward_edges_in_node0 - 1;
+    int compare;
+    while (start <= end) {
+      edge = (start + end) >> 1; // (start + end) / 2
+      compare = given_greater_than_edge_rec(NO_EDGE, word_end,
+                                            unichar_id, edges_[edge]);
+      if (compare == 0) { // given == vec[k]
+        return edge;
+      } else if (compare == 1) { // given > vec[k]
+        start = edge + 1;
+      } else { // given < vec[k]
+        end = edge - 1;
+      }
+    }
+  } else { // linear search
+    if (edge != NO_EDGE && edge_occupied(edge)) {
+      do {
+        if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
+            (!word_end || end_of_word_from_edge_rec(edges_[edge])))
+          return (edge);
+      } while (!last_edge(edge++));
+    }
+  }
+  return (NO_EDGE); // not found
+}
+
+// Counts the consecutive forward edges starting at the given node's
+// first edge slot, up to and including the node's last edge.
+int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
+  EDGE_REF edge = node;
+  int32_t num = 0;
+
+  if (forward_edge (edge)) {
+    do {
+      num++;
+    } while (!last_edge(edge++));
+  }
+
+  return (num);
+}
+
+// Prints (via tprintf) the forward edges of the given node, then its
+// backward edges if any follow, stopping after max_num_edges forward
+// edges (MAX_NODE_EDGES_DISPLAY for the backward ones).
+void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
+  if (node == NO_EDGE) return; // nothing to print
+
+  EDGE_REF edge = node;
+  const char *forward_string = "FORWARD";
+  const char *backward_string = " ";
+
+  const char *last_string = "LAST";
+  const char *not_last_string = " ";
+
+  const char *eow_string = "EOW";
+  const char *not_eow_string = " ";
+
+  const char *direction;
+  const char *is_last;
+  const char *eow;
+
+  UNICHAR_ID unichar_id;
+
+  if (edge_occupied(edge)) {
+    do {
+      direction =
+        forward_edge(edge) ? forward_string : backward_string;
+      is_last = last_edge(edge) ? last_string : not_last_string;
+      eow = end_of_word(edge) ? eow_string : not_eow_string;
+
+      unichar_id = edge_letter(edge);
+      tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
+              edge, next_node(edge), unichar_id,
+              direction, is_last, eow);
+
+      if (edge - node > max_num_edges) return;
+    } while (!last_edge(edge++));
+
+    // After the node's last forward edge, any backward edges follow.
+    if (edge < num_edges_ &&
+        edge_occupied(edge) && backward_edge(edge)) {
+      do {
+        direction =
+          forward_edge(edge) ? forward_string : backward_string;
+        is_last = last_edge(edge) ? last_string : not_last_string;
+        eow = end_of_word(edge) ? eow_string : not_eow_string;
+
+        unichar_id = edge_letter(edge);
+        tprintf(REFFORMAT " : next = " REFFORMAT
+                ", unichar_id = %d, %s %s %s\n",
+                edge, next_node(edge), unichar_id,
+                direction, is_last, eow);
+
+        if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
+      } while (!last_edge(edge++));
+    }
+  }
+  else {
+    tprintf(REFFORMAT " : no edges in this node\n", node);
+  }
+  tprintf("\n");
+}
+
+// Prints a one-line description of the given edge, or "NO_EDGE".
+void SquishedDawg::print_edge(EDGE_REF edge) const {
+  if (edge == NO_EDGE) {
+    tprintf("NO_EDGE\n");
+  } else {
+    tprintf(REFFORMAT " : next = " REFFORMAT
+            ", unichar_id = '%d', %s %s %s\n", edge,
+            next_node(edge), edge_letter(edge),
+            (forward_edge(edge) ? "FORWARD" : " "),
+            (last_edge(edge) ? "LAST" : " "),
+            (end_of_word(edge) ? "EOW" : ""));
+  }
+}
+
+// Reads a squished dawg from the given TFile: a 16-bit magic number,
+// the unicharset size, the edge count, then the edge array itself.
+// Returns false on any read failure or magic-number mismatch.
+bool SquishedDawg::read_squished_dawg(TFile *file) {
+  if (debug_level_) tprintf("Reading squished dawg\n");
+
+  // Read the magic number and check that it matches kDawgMagicNumber, as
+  // auto-endian fixing should make sure it is always correct.
+  int16_t magic;
+  if (!file->DeSerialize(&magic)) return false;
+  if (magic != kDawgMagicNumber) {
+    tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
+    return false;
+  }
+
+  int32_t unicharset_size;
+  if (!file->DeSerialize(&unicharset_size)) return false;
+  if (!file->DeSerialize(&num_edges_)) return false;
+  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
+  Dawg::init(unicharset_size);
+
+  // edges_ is owned by this object and released in ~SquishedDawg().
+  edges_ = new EDGE_RECORD[num_edges_];
+  if (!file->DeSerialize(&edges_[0], num_edges_)) return false;
+  if (debug_level_ > 2) {
+    tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
+            type_, lang_.c_str(), perm_, unicharset_size_, num_edges_);
+    for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge);
+  }
+  return true;
+}
+
+// Builds a map from each node's starting edge index in edges_ to the
+// sequential node number used when writing the dawg out; node 0 keeps
+// index 0 and subsequent nodes are numbered after node 0's forward
+// edges. Sets *num_nodes to the total node count and returns the map.
+std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
+    int32_t *num_nodes) const {
+  EDGE_REF edge;
+  std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
+  int32_t node_counter;
+  int32_t num_edges;
+
+  for (edge = 0; edge < num_edges_; edge++) // init all slots
+    node_map[edge] = -1;
+
+  node_counter = num_forward_edges(0);
+
+  *num_nodes = 0;
+  for (edge = 0; edge < num_edges_; edge++) { // search all slots
+
+    if (forward_edge(edge)) {
+      (*num_nodes)++; // count nodes links
+      node_map[edge] = (edge ? node_counter : 0);
+      num_edges = num_forward_edges(edge);
+      if (edge != 0) node_counter += num_edges;
+      edge += num_edges;
+      if (edge >= num_edges_) break;
+      // Skip this node's backward edges up to and including its last edge.
+      if (backward_edge(edge)) while (!last_edge(edge++));
+      edge--;
+    }
+  }
+  return node_map;
+}
+
+// Writes the dawg to the given TFile in the format read back by
+// read_squished_dawg(): magic, unicharset size, forward-edge count, then
+// each forward edge with its next-node link remapped through node_map.
+// The in-memory link is restored after each record is serialized.
+bool SquishedDawg::write_squished_dawg(TFile *file) {
+  EDGE_REF edge;
+  int32_t num_edges;
+  int32_t node_count = 0;
+  EDGE_REF old_index;
+  EDGE_RECORD temp_record;
+
+  if (debug_level_) tprintf("write_squished_dawg\n");
+
+  std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
+
+  // Write the magic number to help detecting a change in endianness.
+  int16_t magic = kDawgMagicNumber;
+  if (!file->Serialize(&magic)) return false;
+  if (!file->Serialize(&unicharset_size_)) return false;
+
+  // Count the number of edges in this Dawg.
+  num_edges = 0;
+  for (edge=0; edge < num_edges_; edge++)
+    if (forward_edge(edge))
+      num_edges++;
+
+  // Write edge count to file.
+  if (!file->Serialize(&num_edges)) return false;
+
+  if (debug_level_) {
+    tprintf("%d nodes in DAWG\n", node_count);
+    tprintf("%d edges in DAWG\n", num_edges);
+  }
+
+  for (edge = 0; edge < num_edges_; edge++) {
+    if (forward_edge(edge)) { // write forward edges
+      do {
+        old_index = next_node_from_edge_rec(edges_[edge]);
+        set_next_node(edge, node_map[old_index]);
+        temp_record = edges_[edge];
+        if (!file->Serialize(&temp_record)) return false;
+        // Restore the original in-memory next-node link.
+        set_next_node(edge, old_index);
+      } while (!last_edge(edge++));
+
+      if (edge >= num_edges_) break;
+      if (backward_edge(edge)) // skip back links
+        while (!last_edge(edge++));
+
+      edge--;
+    }
+  }
+  return true;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/dawg.h b/tesseract/src/dict/dawg.h
new file mode 100644
index 00000000..26d76d1e
--- /dev/null
+++ b/tesseract/src/dict/dawg.h
@@ -0,0 +1,554 @@
+/******************************************************************************
+ *
+ * File: dawg.h
+ * Description: Definition of a class that represents Directed Acyclic Word
+ * Graph (DAWG), functions to build and manipulate the DAWG.
+ * Author: Mark Seaman, SW Productivity
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+#ifndef DICT_DAWG_H_
+#define DICT_DAWG_H_
+
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include <cinttypes> // for PRId64
+#include <functional> // for std::function
+#include <memory>
+#include "elst.h"
+#include "params.h"
+#include "ratngs.h"
+
+#ifndef __GNUC__
+#ifdef _WIN32
+#define NO_EDGE (int64_t) 0xffffffffffffffffi64
+#endif /*_WIN32*/
+#else
+#define NO_EDGE (int64_t) 0xffffffffffffffffll
+#endif /*__GNUC__*/
+
+namespace tesseract {
+
+class UNICHARSET;
+
+using EDGE_RECORD = uint64_t;
+using EDGE_ARRAY = EDGE_RECORD *;
+using EDGE_REF = int64_t;
+using NODE_REF = int64_t;
+using NODE_MAP = EDGE_REF *;
+
+/// A (unichar id, edge reference) pair, as filled into a NodeChildVector
+/// by Dawg::unichar_ids_of(). Default-constructed instances are invalid
+/// (INVALID_UNICHAR_ID / NO_EDGE).
+struct NodeChild {
+  UNICHAR_ID unichar_id;
+  EDGE_REF edge_ref;
+  NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
+  NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
+};
+
+using NodeChildVector = GenericVector<NodeChild>;
+using SuccessorList = GenericVector<int>;
+using SuccessorListsVector = GenericVector<SuccessorList *>;
+
+/// The kind of content a Dawg holds; the type determines which dawg
+/// types may follow it (see kDawgSuccessors below).
+enum DawgType {
+  DAWG_TYPE_PUNCTUATION,
+  DAWG_TYPE_WORD,
+  DAWG_TYPE_NUMBER,
+  DAWG_TYPE_PATTERN,
+
+  DAWG_TYPE_COUNT // number of enum entries
+};
+
+/*----------------------------------------------------------------------
+ C o n s t a n t s
+----------------------------------------------------------------------*/
+
+#define FORWARD_EDGE (int32_t) 0
+#define BACKWARD_EDGE (int32_t) 1
+#define MAX_NODE_EDGES_DISPLAY (int64_t) 100
+#define MARKER_FLAG (int64_t) 1
+#define DIRECTION_FLAG (int64_t) 2
+#define WERD_END_FLAG (int64_t) 4
+#define LETTER_START_BIT 0
+#define NUM_FLAG_BITS 3
+#define REFFORMAT "%" PRId64
+
+// Successor matrix: row t lists, per dawg type, whether a dawg of that
+// type may follow a dawg of type t (row annotations below) -- presumably
+// consumed by the dict search code; confirm against dict.cpp usage.
+static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
+  { false, true, true, false }, // for DAWG_TYPE_PUNCTUATION
+  { true, false, false, false }, // for DAWG_TYPE_WORD
+  { true, false, false, false }, // for DAWG_TYPE_NUMBER
+  { false, false, false, false }, // for DAWG_TYPE_PATTERN
+};
+
+// The wildcard string recognized by check_for_words()/match_words().
+static const char kWildcard[] = "*";
+
+
+/*----------------------------------------------------------------------
+ C l a s s e s a n d S t r u c t s
+----------------------------------------------------------------------*/
+//
+/// Abstract class (an interface) that declares methods needed by the
+/// various tesseract classes to operate on SquishedDawg and Trie objects.
+///
+/// This class initializes all the edge masks (since their usage by
+/// SquishedDawg and Trie is identical) and implements simple accessors
+/// for each of the fields encoded in an EDGE_RECORD.
+/// This class also implements word_in_dawg() and check_for_words()
+/// (since they use only the public methods of SquishedDawg and Trie
+/// classes that are inherited from the Dawg base class).
+//
+class TESS_API Dawg {
+ public:
+  /// Magic number to determine endianness when reading the Dawg from file.
+  static const int16_t kDawgMagicNumber = 42;
+  /// A special unichar id that indicates that any appropriate pattern
+  /// (e.g. dictionary word, 0-9 digit, etc) can be inserted instead
+  /// Used for expressing patterns in punctuation and number Dawgs.
+  static const UNICHAR_ID kPatternUnicharID = 0;
+
+  inline DawgType type() const { return type_; }
+  inline const STRING &lang() const { return lang_; }
+  inline PermuterType permuter() const { return perm_; }
+
+  virtual ~Dawg();
+
+  /// Returns true if the given word is in the Dawg.
+  bool word_in_dawg(const WERD_CHOICE &word) const;
+
+  // Returns true if the given word prefix is not contraindicated by the dawg.
+  // If requires_complete is true, then the exact complete word must be present.
+  bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
+
+  /// Checks the Dawg for the words that are listed in the requested file.
+  /// Returns the number of words in the given file missing from the Dawg.
+  int check_for_words(const char *filename,
+                      const UNICHARSET &unicharset,
+                      bool enable_wildcard) const;
+
+  // For each word in the Dawg, call the given (permanent) callback with the
+  // text (UTF-8) version of the word.
+  void iterate_words(const UNICHARSET& unicharset,
+                     std::function<void(const WERD_CHOICE*)> cb) const;
+
+  // For each word in the Dawg, call the given (permanent) callback with the
+  // text (UTF-8) version of the word.
+  void iterate_words(const UNICHARSET& unicharset,
+                     std::function<void(const char*)> cb) const;
+
+  // Pure virtual function that should be implemented by the derived classes.
+
+  /// Returns the edge that corresponds to the letter out of this node.
+  virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
+                                bool word_end) const = 0;
+
+  /// Fills the given NodeChildVector with all the unichar ids (and the
+  /// corresponding EDGE_REFs) for which there is an edge out of this node.
+  virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
+                              bool word_end) const = 0;
+
+  /// Returns the next node visited by following the edge
+  /// indicated by the given EDGE_REF.
+  virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
+
+  /// Returns true if the edge indicated by the given EDGE_REF
+  /// marks the end of a word.
+  virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
+
+  /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
+  virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
+
+  /// Prints the contents of the node indicated by the given NODE_REF.
+  /// At most max_num_edges will be printed.
+  virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
+
+  /// Fills vec with unichar ids that represent the character classes
+  /// of the given unichar_id.
+  virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
+                                      const UNICHARSET &unicharset,
+                                      GenericVector<UNICHAR_ID> *vec) const {
+    (void)unichar_id;
+    (void)unicharset;
+    (void)vec;
+  }
+
+  /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
+  /// a self loop and the given unichar_id matches the unichar_id stored in the
+  /// EDGE_RECORD, returns NO_EDGE otherwise.
+  virtual EDGE_REF pattern_loop_edge(
+      EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
+    (void)edge_ref;
+    (void)unichar_id;
+    (void)word_end;
+    // NOTE(review): this returns false (0), not NO_EDGE (-1) as the
+    // comment above promises -- confirm callers compare against NO_EDGE.
+    return false;
+  }
+
+ protected:
+  Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
+      : lang_(lang),
+        type_(type),
+        perm_(perm),
+        unicharset_size_(0),
+        debug_level_(debug_level) {}
+
+  /// Returns the next node visited by following this edge.
+  inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
+    return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
+  }
+  /// Returns the marker flag of this edge.
+  inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
+    return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
+  }
+  /// Returns the direction flag of this edge.
+  inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
+    return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
+      BACKWARD_EDGE : FORWARD_EDGE;
+  }
+  /// Returns true if this edge marks the end of a word.
+  inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
+    return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
+  }
+  /// Returns UNICHAR_ID recorded in this edge.
+  inline UNICHAR_ID unichar_id_from_edge_rec(
+      const EDGE_RECORD &edge_rec) const {
+    return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
+  }
+  /// Sets the next node link for this edge in the Dawg.
+  inline void set_next_node_in_edge_rec(
+      EDGE_RECORD *edge_rec, EDGE_REF value) {
+    *edge_rec &= (~next_node_mask_);
+    *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
+  }
+  /// Sets this edge record to be the last one in a sequence of edges.
+  inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
+    *edge_rec |= (MARKER_FLAG << flag_start_bit_);
+  }
+  /// Sequentially compares the given values of unichar ID, next node
+  /// and word end marker with the values in the given EDGE_RECORD.
+  /// Returns: 1 if at any step the given input value exceeds
+  ///            that of edge_rec (and all the values already
+  ///            checked are the same)
+  ///          0 if edge_rec_match() returns true
+  ///         -1 otherwise
+  inline int given_greater_than_edge_rec(NODE_REF next_node,
+                                         bool word_end,
+                                         UNICHAR_ID unichar_id,
+                                         const EDGE_RECORD &edge_rec) const {
+    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
+    NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
+    bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
+    if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
+                       curr_word_end, curr_unichar_id)) return 0;
+    if (unichar_id > curr_unichar_id) return 1;
+    if (unichar_id == curr_unichar_id) {
+      if (next_node > curr_next_node) return 1;
+      if (next_node == curr_next_node) {
+        if (word_end > curr_word_end) return 1;
+      }
+    }
+    return -1;
+  }
+  /// Returns true if all the values are equal (any value matches
+  /// next_node if next_node == NO_EDGE, any value matches word_end
+  /// if word_end is false).
+  inline bool edge_rec_match(NODE_REF next_node,
+                             bool word_end,
+                             UNICHAR_ID unichar_id,
+                             NODE_REF other_next_node,
+                             bool other_word_end,
+                             UNICHAR_ID other_unichar_id) const {
+    return ((unichar_id == other_unichar_id) &&
+            (next_node == NO_EDGE || next_node == other_next_node) &&
+            (!word_end || (word_end == other_word_end)));
+  }
+
+  /// Sets unicharset_size_.
+  /// Initializes the values of various masks from unicharset_size_.
+  void init(int unicharset_size);
+
+  /// Matches all of the words that are represented by this string.
+  /// If wildcard is set to something other than INVALID_UNICHAR_ID,
+  /// the *'s in this string are interpreted as wildcards.
+  /// WERD_CHOICE param is not passed by const so that wildcard searches
+  /// can modify it and work without having to copy WERD_CHOICEs.
+  bool match_words(WERD_CHOICE *word, int32_t index,
+                   NODE_REF node, UNICHAR_ID wildcard) const;
+
+  // Recursively iterate over all words in a dawg (see public iterate_words).
+  void iterate_words_rec(const WERD_CHOICE& word_so_far,
+                         NODE_REF to_explore,
+                         std::function<void(const WERD_CHOICE*)> cb) const;
+
+  // Member Variables.
+  STRING lang_;
+  DawgType type_;
+  /// Permuter code that should be used if the word is found in this Dawg.
+  PermuterType perm_;
+  // Variables to construct various edge masks. Formerly:
+  // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64
+  // #define FLAGS_MASK (int64_t) 0x0000000700000000i64
+  // #define LETTER_MASK (int64_t) 0x00000000ffffffffi64
+  uint64_t next_node_mask_ = 0;
+  uint64_t flags_mask_ = 0;
+  uint64_t letter_mask_ = 0;
+  // Set by init(); not default-initialized here but assigned in the ctor.
+  int unicharset_size_;
+  int flag_start_bit_ = 0;
+  int next_node_start_bit_ = 0;
+  // Level of debug statements to print to stdout.
+  int debug_level_;
+};
+
+//
+// DawgPosition keeps track of where we are in the primary dawg we're searching
+// as well as where we may be in the "punctuation dawg" which may provide
+// surrounding context.
+//
+// Example:
+// punctuation dawg -- space is the "pattern character"
+// " " // no punctuation
+// "' '" // leading and trailing apostrophes
+// " '" // trailing apostrophe
+// word dawg:
+// "cat"
+// "cab"
+// "cat's"
+//
+// DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
+//
+// DawgPosition(-1, NO_EDGE, p, pe, false)
+// We're in the punctuation dawg, no other dawg has been started.
+// (1) If there's a pattern edge as a punc dawg child of us,
+// for each punc-following dawg starting with ch, produce:
+// Result: DawgPosition(k, w, p', false)
+// (2) If there's a valid continuation in the punc dawg, produce:
+// Result: DawgPosition(-k, NO_EDGE, p', false)
+//
+// DawgPosition(k, w, -1, NO_EDGE, false)
+// We're in dawg k. Going back to punctuation dawg is not an option.
+// Follow ch in dawg k.
+//
+// DawgPosition(k, w, p, pe, false)
+// We're in dawg k. Continue in dawg k and/or go back to the punc dawg.
+// If ending, check that the punctuation dawg is also ok to end here.
+//
+// DawgPosition(k, w, p, pe true)
+// We're back in the punctuation dawg. Continuing there is the only option.
+struct DawgPosition {
+  DawgPosition() = default;
+  DawgPosition(int dawg_idx, EDGE_REF dawgref,
+               int punc_idx, EDGE_REF puncref,
+               bool backtopunc)
+      : dawg_ref(dawgref), punc_ref(puncref),
+        dawg_index(dawg_idx), punc_index(punc_idx),
+        back_to_punc(backtopunc) {
+  }
+  // Memberwise equality over all five fields.
+  // NOTE(review): not declared const, so it cannot be invoked through a
+  // const reference -- confirm whether that is intentional.
+  bool operator==(const DawgPosition &other) {
+    return dawg_index == other.dawg_index &&
+        dawg_ref == other.dawg_ref &&
+        punc_index == other.punc_index &&
+        punc_ref == other.punc_ref &&
+        back_to_punc == other.back_to_punc;
+  }
+
+  // Current edge in the primary dawg and in the punctuation dawg.
+  EDGE_REF dawg_ref = NO_EDGE;
+  EDGE_REF punc_ref = NO_EDGE;
+  // Indices identifying which dawgs the refs above belong to; -1 = none.
+  int8_t dawg_index = -1;
+  int8_t punc_index = -1;
+  // Have we returned to the punc dawg at the end of the word?
+  bool back_to_punc = false;
+};
+
+/// Vector of DawgPositions offering duplicate-free insertion via
+/// add_unique() (linear scan over the existing entries).
+class DawgPositionVector : public GenericVector<DawgPosition> {
+ public:
+  /// Adds an entry for the given dawg_index with the given node to the vec.
+  /// Returns false if the same entry already exists in the vector,
+  /// true otherwise.
+  inline bool add_unique(const DawgPosition &new_pos,
+                         bool debug,
+                         const char *debug_msg) {
+    for (int i = 0; i < size(); ++i) {
+      if (data_[i] == new_pos) return false;
+    }
+    push_back(new_pos);
+    if (debug) {
+      tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n",
+              debug_msg, new_pos.dawg_index, new_pos.dawg_ref,
+              new_pos.punc_ref, new_pos.back_to_punc ? " returned" : "");
+    }
+    return true;
+  }
+};
+
+//
+/// Concrete class that can operate on a compacted (squished) Dawg (read,
+/// search and write to file). This class is read-only in the sense that
+/// new words can not be added to an instance of SquishedDawg.
+/// The underlying representation of the nodes and edges in SquishedDawg
+/// is stored as a contiguous EDGE_ARRAY (read from file or given as an
+/// argument to the constructor).
+//
+class TESS_API SquishedDawg : public Dawg {
+ public:
+ SquishedDawg(DawgType type, const STRING &lang, PermuterType perm,
+ int debug_level)
+ : Dawg(type, lang, perm, debug_level) {}
+ SquishedDawg(const char *filename, DawgType type, const STRING &lang,
+ PermuterType perm, int debug_level)
+ : Dawg(type, lang, perm, debug_level) {
+ TFile file;
+ ASSERT_HOST(file.Open(filename, nullptr));
+ ASSERT_HOST(read_squished_dawg(&file));
+ num_forward_edges_in_node0 = num_forward_edges(0);
+ }
+ SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
+ const STRING &lang, PermuterType perm, int unicharset_size,
+ int debug_level)
+ : Dawg(type, lang, perm, debug_level),
+ edges_(edges),
+ num_edges_(num_edges) {
+ init(unicharset_size);
+ num_forward_edges_in_node0 = num_forward_edges(0);
+ if (debug_level > 3) print_all("SquishedDawg:");
+ }
+ ~SquishedDawg() override;
+
+ // Loads using the given TFile. Returns false on failure.
+ bool Load(TFile *fp) {
+ if (!read_squished_dawg(fp)) return false;
+ num_forward_edges_in_node0 = num_forward_edges(0);
+ return true;
+ }
+
+ int NumEdges() { return num_edges_; }
+
+ /// Returns the edge that corresponds to the letter out of this node.
+ EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
+ bool word_end) const override;
+
+ /// Fills the given NodeChildVector with all the unichar ids (and the
+ /// corresponding EDGE_REFs) for which there is an edge out of this node.
+ void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
+ bool word_end) const override {
+ EDGE_REF edge = node;
+ if (!edge_occupied(edge) || edge == NO_EDGE) return;
+ assert(forward_edge(edge)); // we don't expect any backward edges to
+ do { // be present when this function is called
+ if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
+ vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
+ }
+ } while (!last_edge(edge++));
+ }
+
+ /// Returns the next node visited by following the edge
+ /// indicated by the given EDGE_REF.
+ NODE_REF next_node(EDGE_REF edge) const override {
+ return next_node_from_edge_rec((edges_[edge]));
+ }
+
+ /// Returns true if the edge indicated by the given EDGE_REF
+ /// marks the end of a word.
+ bool end_of_word(EDGE_REF edge_ref) const override {
+ return end_of_word_from_edge_rec((edges_[edge_ref]));
+ }
+
+ /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
+ UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
+ return unichar_id_from_edge_rec((edges_[edge_ref]));
+ }
+
+ /// Prints the contents of the node indicated by the given NODE_REF.
+ /// At most max_num_edges will be printed.
+ void print_node(NODE_REF node, int max_num_edges) const override;
+
+ /// Writes the squished/reduced Dawg to a file.
+ bool write_squished_dawg(TFile *file);
+
+ /// Opens the file with the given filename and writes the
+ /// squished/reduced Dawg to the file.
+ bool write_squished_dawg(const char *filename) {
+ TFile file;
+ file.OpenWrite(nullptr);
+ if (!this->write_squished_dawg(&file)) {
+ tprintf("Error serializing %s\n", filename);
+ return false;
+ }
+ if (!file.CloseWrite(filename, nullptr)) {
+ tprintf("Error writing file %s\n", filename);
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ /// Sets the next node link for this edge.
+ inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
+ set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
+ }
+ /// Sets the edge to be empty.
+ inline void set_empty_edge(EDGE_REF edge_ref) {
+ (edges_[edge_ref] = next_node_mask_);
+ }
+ /// Goes through all the edges and clears each one out.
+ inline void clear_all_edges() {
+ for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
+ }
+ /// Clears the last flag of this edge.
+ inline void clear_marker_flag(EDGE_REF edge_ref) {
+ (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
+ }
+ /// Returns true if this edge is in the forward direction.
+ inline bool forward_edge(EDGE_REF edge_ref) const {
+ return (edge_occupied(edge_ref) &&
+ (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
+ }
+ /// Returns true if this edge is in the backward direction.
+ inline bool backward_edge(EDGE_REF edge_ref) const {
+ return (edge_occupied(edge_ref) &&
+ (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
+ }
+ /// Returns true if the edge spot in this location is occupied.
+ inline bool edge_occupied(EDGE_REF edge_ref) const {
+ return (edges_[edge_ref] != next_node_mask_);
+ }
+ /// Returns true if this edge is the last edge in a sequence.
+ inline bool last_edge(EDGE_REF edge_ref) const {
+ return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
+ }
+
+ /// Counts and returns the number of forward edges in this node.
+ int32_t num_forward_edges(NODE_REF node) const;
+
+ /// Reads SquishedDawg from a file.
+ bool read_squished_dawg(TFile *file);
+
+ /// Prints the contents of an edge indicated by the given EDGE_REF.
+ void print_edge(EDGE_REF edge) const;
+
+ /// Prints the contents of the SquishedDawg.
+ void print_all(const char* msg) {
+ tprintf("\n__________________________\n%s\n", msg);
+ for (int i = 0; i < num_edges_; ++i) print_edge(i);
+ tprintf("__________________________\n");
+ }
+ /// Constructs a mapping from the memory node indices to disk node indices.
+ std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes) const;
+
+ // Member variables.
+ EDGE_ARRAY edges_ = nullptr;
+ int32_t num_edges_ = 0;
+ int num_forward_edges_in_node0 = 0;
+};
+
+} // namespace tesseract
+
+#endif // DICT_DAWG_H_
diff --git a/tesseract/src/dict/dawg_cache.cpp b/tesseract/src/dict/dawg_cache.cpp
new file mode 100644
index 00000000..0ade27e6
--- /dev/null
+++ b/tesseract/src/dict/dawg_cache.cpp
@@ -0,0 +1,96 @@
+///////////////////////////////////////////////////////////////////////
+// File: dawg_cache.cpp
+// Description: A class that knows about loading and caching dawgs.
+// Author: David Eger
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "dawg_cache.h"
+
+#include "dawg.h"
+#include "object_cache.h"
+#include "strngs.h"
+#include "tessdatamanager.h"
+
+namespace tesseract {
+
+struct DawgLoader {
+ DawgLoader(const STRING &lang, TessdataType tessdata_dawg_type,
+ int dawg_debug_level, TessdataManager *data_file)
+ : lang_(lang),
+ data_file_(data_file),
+ tessdata_dawg_type_(tessdata_dawg_type),
+ dawg_debug_level_(dawg_debug_level) {}
+
+ Dawg *Load();
+
+ STRING lang_;
+ TessdataManager *data_file_;
+ TessdataType tessdata_dawg_type_;
+ int dawg_debug_level_;
+};
+
+Dawg *DawgCache::GetSquishedDawg(const STRING &lang,
+ TessdataType tessdata_dawg_type,
+ int debug_level, TessdataManager *data_file) {
+ std::string data_id = data_file->GetDataFileName();
+ data_id += kTessdataFileSuffixes[tessdata_dawg_type];
+ DawgLoader loader(lang, tessdata_dawg_type, debug_level, data_file);
+ return dawgs_.Get(data_id, std::bind(&DawgLoader::Load, &loader));
+}
+
+Dawg *DawgLoader::Load() {
+ TFile fp;
+ if (!data_file_->GetComponent(tessdata_dawg_type_, &fp)) return nullptr;
+ DawgType dawg_type;
+ PermuterType perm_type;
+ switch (tessdata_dawg_type_) {
+ case TESSDATA_PUNC_DAWG:
+ case TESSDATA_LSTM_PUNC_DAWG:
+ dawg_type = DAWG_TYPE_PUNCTUATION;
+ perm_type = PUNC_PERM;
+ break;
+ case TESSDATA_SYSTEM_DAWG:
+ case TESSDATA_LSTM_SYSTEM_DAWG:
+ dawg_type = DAWG_TYPE_WORD;
+ perm_type = SYSTEM_DAWG_PERM;
+ break;
+ case TESSDATA_NUMBER_DAWG:
+ case TESSDATA_LSTM_NUMBER_DAWG:
+ dawg_type = DAWG_TYPE_NUMBER;
+ perm_type = NUMBER_PERM;
+ break;
+ case TESSDATA_BIGRAM_DAWG:
+ dawg_type = DAWG_TYPE_WORD; // doesn't actually matter
+ perm_type = COMPOUND_PERM; // doesn't actually matter
+ break;
+ case TESSDATA_UNAMBIG_DAWG:
+ dawg_type = DAWG_TYPE_WORD;
+ perm_type = SYSTEM_DAWG_PERM;
+ break;
+ case TESSDATA_FREQ_DAWG:
+ dawg_type = DAWG_TYPE_WORD;
+ perm_type = FREQ_DAWG_PERM;
+ break;
+ default:
+ return nullptr;
+ }
+ auto *retval =
+ new SquishedDawg(dawg_type, lang_, perm_type, dawg_debug_level_);
+ if (retval->Load(&fp)) return retval;
+ delete retval;
+ return nullptr;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/dawg_cache.h b/tesseract/src/dict/dawg_cache.h
new file mode 100644
index 00000000..83233802
--- /dev/null
+++ b/tesseract/src/dict/dawg_cache.h
@@ -0,0 +1,53 @@
+///////////////////////////////////////////////////////////////////////
+// File: dawg_cache.h
+// Description: A class that knows about loading and caching dawgs.
+// Author: David Eger
+// Created: Fri Jan 27 12:08:00 PST 2012
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_DICT_DAWG_CACHE_H_
+#define TESSERACT_DICT_DAWG_CACHE_H_
+
+#include "dawg.h"
+#include "object_cache.h"
+#include "strngs.h"
+#include "tessdatamanager.h"
+
+namespace tesseract {
+
+class DawgCache {
+ public:
+ Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type,
+ int debug_level, TessdataManager *data_file);
+
+ // If we manage the given dawg, decrement its count,
+ // and possibly delete it if the count reaches zero.
+ // If dawg is unknown to us, return false.
+ bool FreeDawg(Dawg *dawg) {
+ return dawgs_.Free(dawg);
+ }
+
+ // Free up any currently unused dawgs.
+ void DeleteUnusedDawgs() {
+ dawgs_.DeleteUnusedObjects();
+ }
+
+ private:
+ ObjectCache<Dawg> dawgs_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_DICT_DAWG_CACHE_H_
diff --git a/tesseract/src/dict/dict.cpp b/tesseract/src/dict/dict.cpp
new file mode 100644
index 00000000..eb69b569
--- /dev/null
+++ b/tesseract/src/dict/dict.cpp
@@ -0,0 +1,888 @@
+///////////////////////////////////////////////////////////////////////
+// File: dict.cpp
+// Description: dict class.
+// Author: Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "dict.h"
+
+#include "tprintf.h"
+
+#include <cstdio>
+
+namespace tesseract {
+
+class Image;
+
+Dict::Dict(CCUtil* ccutil)
+ : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
+ probability_in_context_(&tesseract::Dict::def_probability_in_context),
+ ccutil_(ccutil),
+ wildcard_unichar_id_(INVALID_UNICHAR_ID),
+ apostrophe_unichar_id_(INVALID_UNICHAR_ID),
+ question_unichar_id_(INVALID_UNICHAR_ID),
+ slash_unichar_id_(INVALID_UNICHAR_ID),
+ hyphen_unichar_id_(INVALID_UNICHAR_ID),
+ STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
+ getCCUtil()->params()),
+ STRING_INIT_MEMBER(user_words_suffix, "",
+ "A suffix of user-provided words located in tessdata.",
+ getCCUtil()->params()),
+ STRING_MEMBER(user_patterns_file, "",
+ "A filename of user-provided patterns.",
+ getCCUtil()->params()),
+ STRING_INIT_MEMBER(user_patterns_suffix, "",
+ "A suffix of user-provided patterns located in "
+ "tessdata.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_punc_dawg, true,
+ "Load dawg with punctuation"
+ " patterns.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_number_dawg, true,
+ "Load dawg with number"
+ " patterns.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_bigram_dawg, true,
+ "Load dawg with special word "
+ "bigrams.",
+ getCCUtil()->params()),
+ double_MEMBER(xheight_penalty_subscripts, 0.125,
+ "Score penalty (0.1 = 10%) added if there are subscripts "
+ "or superscripts in a word, but it is otherwise OK.",
+ getCCUtil()->params()),
+ double_MEMBER(xheight_penalty_inconsistent, 0.25,
+ "Score penalty (0.1 = 10%) added if an xheight is "
+ "inconsistent.",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
+ "Score multiplier for word matches which have good case and"
+ " are frequent in the given language (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_case_ok, 1.1,
+ "Score multiplier for word matches that have good case "
+ "(lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
+ "Default score multiplier for word matches, which may have "
+ "case issues (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_nonword, 1.25,
+ "Score multiplier for glyph fragment segmentations which "
+ "do not match a dictionary word (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_garbage, 1.50,
+ "Score multiplier for poorly cased strings that are not in"
+ " the dictionary and generally look like garbage (lower is"
+ " better).",
+ getCCUtil()->params()),
+ STRING_MEMBER(output_ambig_words_file, "",
+ "Output file for ambiguities found in the dictionary",
+ getCCUtil()->params()),
+ INT_MEMBER(dawg_debug_level, 0,
+ "Set to 1 for general debug info"
+ ", to 2 for more details, to 3 to see all the debug messages",
+ getCCUtil()->params()),
+ INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
+ getCCUtil()->params()),
+ BOOL_MEMBER(use_only_first_uft8_step, false,
+ "Use only the first UTF8 step of the given string"
+ " when computing log probabilities.",
+ getCCUtil()->params()),
+ double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_nondict_certainty_base, -2.50,
+ "Certainty threshold for non-dict words",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
+ "Reject certainty offset", getCCUtil()->params()),
+ INT_MEMBER(stopper_smallword_size, 2,
+ "Size of dict word to be treated as non-dict word",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_certainty_per_char, -0.50,
+ "Certainty to add"
+ " for each dict char above small word size.",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_allowable_character_badness, 3.0,
+ "Max certaintly variation allowed in a word (in sigma)",
+ getCCUtil()->params()),
+ INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
+ getCCUtil()->params()),
+ BOOL_MEMBER(stopper_no_acceptable_choices, false,
+ "Make AcceptableChoice() always return false. Useful"
+ " when there is a need to explore all segmentations",
+ getCCUtil()->params()),
+ INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
+ "Max words to keep in list", getCCUtil()->params()),
+ STRING_MEMBER(word_to_debug, "",
+ "Word for which stopper debug"
+ " information should be printed to stdout",
+ getCCUtil()->params()),
+ BOOL_MEMBER(segment_nonalphabetic_script, false,
+ "Don't use any alphabetic-specific tricks."
+ " Set to true in the traineddata config file for"
+ " scripts that are cursive or inherently fixed-pitch",
+ getCCUtil()->params()),
+ BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
+ getCCUtil()->params()),
+ double_MEMBER(doc_dict_pending_threshold, 0.0,
+ "Worst certainty for using pending dictionary",
+ getCCUtil()->params()),
+ double_MEMBER(doc_dict_certainty_threshold, -2.25,
+ "Worst certainty for words that can be inserted into the"
+ " document dictionary",
+ getCCUtil()->params()),
+ INT_MEMBER(max_permuter_attempts, 10000,
+ "Maximum number of different"
+ " character choices to consider during permutation."
+ " This limit is especially useful when user patterns"
+ " are specified, since overly generic patterns can result in"
+ " dawg search exploring an overly large number of options.",
+ getCCUtil()->params()) {
+ reject_offset_ = 0.0;
+ go_deeper_fxn_ = nullptr;
+ hyphen_word_ = nullptr;
+ last_word_on_line_ = false;
+ document_words_ = nullptr;
+ dawg_cache_ = nullptr;
+ dawg_cache_is_ours_ = false;
+ pending_words_ = nullptr;
+ bigram_dawg_ = nullptr;
+ freq_dawg_ = nullptr;
+ punc_dawg_ = nullptr;
+ unambig_dawg_ = nullptr;
+ wordseg_rating_adjust_factor_ = -1.0f;
+ output_ambig_words_file_ = nullptr;
+}
+
+Dict::~Dict() {
+ End();
+ delete hyphen_word_;
+ if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
+}
+
+DawgCache* Dict::GlobalDawgCache() {
+ // This global cache (a singleton) will outlive every Tesseract instance
+ // (even those that someone else might declare as global statics).
+ static DawgCache cache;
+ return &cache;
+}
+
+// Sets up ready for a Load or LoadLSTM.
+void Dict::SetupForLoad(DawgCache* dawg_cache) {
+ if (dawgs_.size() != 0) this->End();
+
+ apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
+ question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
+ slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
+ hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
+
+ if (dawg_cache != nullptr) {
+ dawg_cache_ = dawg_cache;
+ dawg_cache_is_ours_ = false;
+ } else {
+ dawg_cache_ = new DawgCache();
+ dawg_cache_is_ours_ = true;
+ }
+}
+
+// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
+void Dict::Load(const STRING& lang, TessdataManager* data_file) {
+ // Load dawgs_.
+ if (load_punc_dawg) {
+ punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
+ dawg_debug_level, data_file);
+ if (punc_dawg_) dawgs_ += punc_dawg_;
+ }
+ if (load_system_dawg) {
+ Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
+ lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
+ if (system_dawg) dawgs_ += system_dawg;
+ }
+ if (load_number_dawg) {
+ Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
+ lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
+ if (number_dawg) dawgs_ += number_dawg;
+ }
+ if (load_bigram_dawg) {
+ bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
+ dawg_debug_level, data_file);
+ // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
+ // dawgs_!!
+ }
+ if (load_freq_dawg) {
+ freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
+ dawg_debug_level, data_file);
+ if (freq_dawg_) dawgs_ += freq_dawg_;
+ }
+ if (load_unambig_dawg) {
+ unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
+ dawg_debug_level, data_file);
+ if (unambig_dawg_) dawgs_ += unambig_dawg_;
+ }
+
+ STRING name;
+ if (!user_words_suffix.empty() || !user_words_file.empty()) {
+ Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ if (!user_words_file.empty()) {
+ name = user_words_file;
+ } else {
+ name = getCCUtil()->language_data_path_prefix;
+ name += user_words_suffix;
+ }
+ if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
+ Trie::RRP_REVERSE_IF_HAS_RTL)) {
+ tprintf("Error: failed to load %s\n", name.c_str());
+ delete trie_ptr;
+ } else {
+ dawgs_ += trie_ptr;
+ }
+ }
+
+ if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
+ Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ trie_ptr->initialize_patterns(&(getUnicharset()));
+ if (!user_patterns_file.empty()) {
+ name = user_patterns_file;
+ } else {
+ name = getCCUtil()->language_data_path_prefix;
+ name += user_patterns_suffix;
+ }
+ if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
+ tprintf("Error: failed to load %s\n", name.c_str());
+ delete trie_ptr;
+ } else {
+ dawgs_ += trie_ptr;
+ }
+ }
+
+ document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ dawgs_ += document_words_;
+
+ // This dawg is temporary and should not be searched by letter_is_ok.
+ pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
+ getUnicharset().size(), dawg_debug_level);
+}
+
+// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
+void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) {
+ // Load dawgs_.
+ if (load_punc_dawg) {
+ punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
+ dawg_debug_level, data_file);
+ if (punc_dawg_) dawgs_ += punc_dawg_;
+ }
+ if (load_system_dawg) {
+ Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
+ lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
+ if (system_dawg) dawgs_ += system_dawg;
+ }
+ if (load_number_dawg) {
+ Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
+ lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
+ if (number_dawg) dawgs_ += number_dawg;
+ }
+
+ // stolen from Dict::Load (but needs params_ from Tesseract
+ // langdata/config/api):
+ STRING name;
+ if (!user_words_suffix.empty() || !user_words_file.empty()) {
+ Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ if (!user_words_file.empty()) {
+ name = user_words_file;
+ } else {
+ name = getCCUtil()->language_data_path_prefix;
+ name += user_words_suffix;
+ }
+ if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
+ Trie::RRP_REVERSE_IF_HAS_RTL)) {
+ tprintf("Error: failed to load %s\n", name.c_str());
+ delete trie_ptr;
+ } else {
+ dawgs_ += trie_ptr;
+ }
+ }
+
+ if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
+ Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ trie_ptr->initialize_patterns(&(getUnicharset()));
+ if (!user_patterns_file.empty()) {
+ name = user_patterns_file;
+ } else {
+ name = getCCUtil()->language_data_path_prefix;
+ name += user_patterns_suffix;
+ }
+ if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
+ tprintf("Error: failed to load %s\n", name.c_str());
+ delete trie_ptr;
+ } else {
+ dawgs_ += trie_ptr;
+ }
+ }
+}
+
+// Completes the loading process after Load() and/or LoadLSTM().
+// Returns false if no dictionaries were loaded.
+bool Dict::FinishLoad() {
+ if (dawgs_.empty()) return false;
+ // Construct a list of corresponding successors for each dawg. Each entry, i,
+ // in the successors_ vector is a vector of integers that represent the
+ // indices into the dawgs_ vector of the successors for dawg i.
+ successors_.reserve(dawgs_.size());
+ for (int i = 0; i < dawgs_.size(); ++i) {
+ const Dawg* dawg = dawgs_[i];
+ auto* lst = new SuccessorList();
+ for (int j = 0; j < dawgs_.size(); ++j) {
+ const Dawg* other = dawgs_[j];
+ if (dawg != nullptr && other != nullptr &&
+ (dawg->lang() == other->lang()) &&
+ kDawgSuccessors[dawg->type()][other->type()])
+ *lst += j;
+ }
+ successors_ += lst;
+ }
+ return true;
+}
+
+void Dict::End() {
+ if (dawgs_.size() == 0) return; // Not safe to call twice.
+ for (int i = 0; i < dawgs_.size(); i++) {
+ if (!dawg_cache_->FreeDawg(dawgs_[i])) {
+ delete dawgs_[i];
+ }
+ }
+ dawg_cache_->FreeDawg(bigram_dawg_);
+ if (dawg_cache_is_ours_) {
+ delete dawg_cache_;
+ dawg_cache_ = nullptr;
+ }
+ successors_.delete_data_pointers();
+ dawgs_.clear();
+ successors_.clear();
+ document_words_ = nullptr;
+ delete pending_words_;
+ pending_words_ = nullptr;
+}
+
+// Returns true if in light of the current state unichar_id is allowed
+// according to at least one of the dawgs in the dawgs_ vector.
+// See more extensive comments in dict.h where this function is declared.
+int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const {
+ auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
+
+ ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+
+ if (dawg_debug_level >= 3) {
+ tprintf(
+ "def_letter_is_okay: current unichar=%s word_end=%d"
+ " num active dawgs=%d\n",
+ getUnicharset().debug_str(unichar_id).c_str(), word_end,
+ dawg_args->active_dawgs->size());
+ }
+
+ // Do not accept words that contain kPatternUnicharID.
+ // (otherwise pattern dawgs would not function correctly).
+ // Do not accept words containing INVALID_UNICHAR_IDs.
+ if (unichar_id == Dawg::kPatternUnicharID ||
+ unichar_id == INVALID_UNICHAR_ID) {
+ dawg_args->permuter = NO_PERM;
+ return NO_PERM;
+ }
+
+ // Initialization.
+ PermuterType curr_perm = NO_PERM;
+ dawg_args->updated_dawgs->clear();
+ dawg_args->valid_end = false;
+
+ // Go over the active_dawgs vector and insert DawgPosition records
+ // with the updated ref (an edge with the corresponding unichar id) into
+ // dawg_args->updated_pos.
+ for (int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
+ const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
+ const Dawg* punc_dawg =
+ pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
+ const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
+
+ if (!dawg && !punc_dawg) {
+ // shouldn't happen.
+ tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
+ continue;
+ }
+ if (!dawg) {
+ // We're in the punctuation dawg. A core dawg has not been chosen.
+ NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
+ EDGE_REF punc_transition_edge =
+ punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
+ if (punc_transition_edge != NO_EDGE) {
+ // Find all successors, and see which can transition.
+ const SuccessorList& slist = *(successors_[pos.punc_index]);
+ for (int s = 0; s < slist.size(); ++s) {
+ int sdawg_index = slist[s];
+ const Dawg* sdawg = dawgs_[sdawg_index];
+ UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
+ EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
+ if (dawg_edge != NO_EDGE) {
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in dawg %d\n", sdawg_index);
+ }
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
+ punc_transition_edge, false),
+ dawg_debug_level > 0,
+ "Append transition from punc dawg to current dawgs: ");
+ if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
+ if (sdawg->end_of_word(dawg_edge) &&
+ punc_dawg->end_of_word(punc_transition_edge))
+ dawg_args->valid_end = true;
+ }
+ }
+ }
+ EDGE_REF punc_edge =
+ punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
+ if (punc_edge != NO_EDGE) {
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in punctuation dawg\n");
+ }
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
+ dawg_debug_level > 0, "Extend punctuation dawg: ");
+ if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
+ if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
+ }
+ continue;
+ }
+
+ if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
+ // We can end the main word here.
+ // If we can continue on the punc ref, add that possibility.
+ NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
+ EDGE_REF punc_edge =
+ punc_node == NO_EDGE
+ ? NO_EDGE
+ : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
+ if (punc_edge != NO_EDGE) {
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
+ punc_edge, true),
+ dawg_debug_level > 0, "Return to punctuation dawg: ");
+ if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
+ if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
+ }
+ }
+
+ if (pos.back_to_punc) continue;
+
+ // If we are dealing with the pattern dawg, look up all the
+ // possible edges, not only for the exact unichar_id, but also
+ // for all its character classes (alpha, digit, etc).
+ if (dawg->type() == DAWG_TYPE_PATTERN) {
+ ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
+ &curr_perm);
+ // There can't be any successors to dawg that is of type
+ // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
+ continue;
+ }
+
+ // Find the edge out of the node for the unichar_id.
+ NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
+ EDGE_REF edge =
+ (node == NO_EDGE)
+ ? NO_EDGE
+ : dawg->edge_char_of(
+ node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
+
+ if (dawg_debug_level >= 3) {
+ tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
+ pos.dawg_index, node, edge);
+ }
+
+ if (edge != NO_EDGE) { // the unichar was found in the current dawg
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in dawg %d\n", pos.dawg_index);
+ }
+ if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
+ if (dawg_debug_level >= 3) {
+ tprintf("Punctuation constraint not satisfied at end of word.\n");
+ }
+ continue;
+ }
+ if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
+ if (dawg->end_of_word(edge) &&
+ (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
+ dawg_args->valid_end = true;
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
+ false),
+ dawg_debug_level > 0,
+ "Append current dawg to updated active dawgs: ");
+ }
+ } // end for
+ // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
+ // or if we found the current letter in a non-punctuation dawg. This
+ // allows preserving information on which dawg the "core" word came from.
+ // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
+ if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
+ (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
+ dawg_args->permuter = curr_perm;
+ }
+ if (dawg_debug_level >= 2) {
+ tprintf("Returning %d for permuter code for this character.\n",
+ dawg_args->permuter);
+ }
+ return dawg_args->permuter;
+}
+
+// For unichar_id, and every character-class pattern it maps to, looks up the
+// matching outgoing edge (k == 0) and self-loop edge (k == 1) in the given
+// pattern dawg. Each valid edge is recorded in dawg_args->updated_dawgs;
+// *curr_perm is raised to the dawg's permuter code on any match, and
+// dawg_args->valid_end is set when a matched edge ends a word.
+void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos,
+                               UNICHAR_ID unichar_id, bool word_end,
+                               DawgArgs* dawg_args,
+                               PermuterType* curr_perm) const {
+  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
+  // Try to find the edge corresponding to the exact unichar_id and to all the
+  // edges corresponding to the character class of unichar_id.
+  GenericVector<UNICHAR_ID> unichar_id_patterns;
+  unichar_id_patterns.push_back(unichar_id);
+  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
+                               &unichar_id_patterns);
+  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
+    // On the first iteration check all the outgoing edges.
+    // On the second iteration check all self-loops.
+    for (int k = 0; k < 2; ++k) {
+      EDGE_REF edge =
+          (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
+                   : dawg->pattern_loop_edge(pos.dawg_ref,
+                                             unichar_id_patterns[i], word_end);
+      if (edge == NO_EDGE) continue;  // no match for this pattern / edge kind
+      if (dawg_debug_level >= 3) {
+        tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
+                pos.dawg_index, node, edge);
+        tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
+      }
+      if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
+      if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
+      dawg_args->updated_dawgs->add_unique(
+          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
+                       pos.back_to_punc),
+          dawg_debug_level > 0,
+          "Append current dawg to updated active dawgs: ");
+    }
+  }
+}
+
+// Fill the given active_dawgs vector with dawgs that could contain the
+// beginning of the word. If hyphenated() returns true, copy the entries
+// from hyphen_active_dawgs_ instead.
+// ambigs_mode is forwarded as the suppress_patterns flag of default_dawgs(),
+// i.e. pattern dawgs are excluded when it is true.
+void Dict::init_active_dawgs(DawgPositionVector* active_dawgs,
+                             bool ambigs_mode) const {
+  int i;
+  if (hyphenated()) {
+    // Resume from the dawg positions saved for the first half of the
+    // hyphenated word.
+    *active_dawgs = hyphen_active_dawgs_;
+    if (dawg_debug_level >= 3) {
+      for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
+        tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
+                hyphen_active_dawgs_[i].dawg_index,
+                hyphen_active_dawgs_[i].dawg_ref);
+      }
+    }
+  } else {
+    default_dawgs(active_dawgs, ambigs_mode);
+  }
+}
+
+// Fills dawg_pos_vec with the default beginning-of-word positions: one entry
+// per punctuation dawg, plus one entry for every other dawg that is NOT
+// reachable from the punctuation dawg (or for every dawg when no usable
+// punctuation dawg is loaded). Pattern dawgs are skipped when
+// suppress_patterns is true.
+void Dict::default_dawgs(DawgPositionVector* dawg_pos_vec,
+                         bool suppress_patterns) const {
+  // A punc dawg is "available" only if it can start with the empty pattern.
+  bool punc_dawg_available =
+      (punc_dawg_ != nullptr) &&
+      punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
+
+  for (int i = 0; i < dawgs_.size(); i++) {
+    if (dawgs_[i] != nullptr &&
+        !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
+      int dawg_ty = dawgs_[i]->type();
+      // Dawg types listed as successors of the punc dawg are reached through
+      // it, so they need no standalone entry when the punc dawg is usable.
+      bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
+      if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
+        *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
+        if (dawg_debug_level >= 3) {
+          tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
+                  NO_EDGE);
+        }
+      } else if (!punc_dawg_available || !subsumed_by_punc) {
+        *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
+        if (dawg_debug_level >= 3) {
+          tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
+        }
+      }
+    }
+  }
+}
+
+// Adds a word found on this document to the document-specific dictionary
+// (document_words_). Words that are already valid dictionary words, shorter
+// than 2 unichars, or dominated by a single repeated unichar are discarded.
+// Marginal words (low certainty, or exactly two letters) are staged in
+// pending_words_ on first sighting and only promoted to document_words_
+// when seen again.
+void Dict::add_document_word(const WERD_CHOICE& best_choice) {
+  // Do not add hyphenated word parts to the document dawg.
+  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
+  // called when the first part of the hyphenated word is
+  // discovered and while the second part of the word is recognized.
+  // hyphen_word_ is cleared in cc_recg() before the next word on
+  // the line is recognized.
+  if (hyphen_word_) return;
+
+  int stringlen = best_choice.length();
+
+  if (valid_word(best_choice) || stringlen < 2) return;
+
+  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
+  if (best_choice.length() >= kDocDictMaxRepChars) {
+    int num_rep_chars = 1;
+    UNICHAR_ID uch_id = best_choice.unichar_id(0);
+    for (int i = 1; i < best_choice.length(); ++i) {
+      if (best_choice.unichar_id(i) != uch_id) {
+        num_rep_chars = 1;
+        uch_id = best_choice.unichar_id(i);
+      } else {
+        ++num_rep_chars;
+        if (num_rep_chars == kDocDictMaxRepChars) return;
+      }
+    }
+  }
+
+  // Marginal words: on the first sighting, stage in pending_words_ (two-letter
+  // words only when both letters are uppercase) and return. A word that is
+  // already pending falls through and gets added to document_words_ below.
+  if (best_choice.certainty() < doc_dict_certainty_threshold ||
+      stringlen == 2) {
+    if (best_choice.certainty() < doc_dict_pending_threshold) return;
+
+    if (!pending_words_->word_in_dawg(best_choice)) {
+      if (stringlen > 2 ||
+          (stringlen == 2 &&
+           getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
+           getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
+        pending_words_->add_word_to_dawg(best_choice);
+      }
+      return;
+    }
+  }
+
+  // Optionally append each accepted document word to <imagefile>.doc.
+  if (save_doc_words) {
+    STRING filename(getCCUtil()->imagefile);
+    filename += ".doc";
+    FILE* doc_word_file = fopen(filename.c_str(), "a");
+    if (doc_word_file == nullptr) {
+      tprintf("Error: Could not open file %s\n", filename.c_str());
+      ASSERT_HOST(doc_word_file);
+    }
+    fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
+    fclose(doc_word_file);
+  }
+  document_words_->add_word_to_dawg(best_choice);
+}
+
+// Adjusts the rating of the given word. Penalty factors for x-height
+// inconsistency, bad case, bad punctuation, and dictionary/non-dictionary
+// status are accumulated into adjust_factor (starting from
+// additional_adjust); the rating, padded by kRatingPad before and unpadded
+// after, is multiplied by the factor. The new rating is written back only
+// when modify_rating is true; adjust_factor is always recorded on the word.
+void Dict::adjust_word(WERD_CHOICE* word, bool nonword,
+                       XHeightConsistencyEnum xheight_consistency,
+                       float additional_adjust, bool modify_rating,
+                       bool debug) {
+  // Words whose top script is Han are exempt from case/punctuation checks.
+  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
+                 word->GetTopScriptID() == getUnicharset().han_sid());
+  bool case_is_ok = (is_han || case_ok(*word));
+  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
+
+  float adjust_factor = additional_adjust;
+  float new_rating = word->rating();
+  new_rating += kRatingPad;
+  const char* xheight_triggered = "";
+  if (word->length() > 1) {
+    // Calculate x-height and y-offset consistency penalties.
+    switch (xheight_consistency) {
+      case XH_INCONSISTENT:
+        adjust_factor += xheight_penalty_inconsistent;
+        xheight_triggered = ", xhtBAD";
+        break;
+      case XH_SUBNORMAL:
+        adjust_factor += xheight_penalty_subscripts;
+        xheight_triggered = ", xhtSUB";
+        break;
+      case XH_GOOD:
+        // leave the factor alone - all good!
+        break;
+    }
+    // TODO(eger): if nonword is true, but there is a "core" that is a dict
+    // word, negate nonword status.
+  } else {
+    if (debug) {
+      tprintf("Consistency could not be calculated.\n");
+    }
+  }
+  if (debug) {
+    tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
+            word->unichar_string().c_str(), word->rating(), xheight_triggered);
+  }
+
+  if (nonword) { // non-dictionary word
+    if (case_is_ok && punc_is_ok) {
+      adjust_factor += segment_penalty_dict_nonword;
+      new_rating *= adjust_factor;
+      if (debug) tprintf(", W");
+    } else {
+      // Bad case or punctuation: treat as garbage (heavier penalty).
+      adjust_factor += segment_penalty_garbage;
+      new_rating *= adjust_factor;
+      if (debug) {
+        if (!case_is_ok) tprintf(", C");
+        if (!punc_is_ok) tprintf(", P");
+      }
+    }
+  } else { // dictionary word
+    if (case_is_ok) {
+      if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
+        // Frequent-word dawg hit: upgrade permuter and apply lightest penalty.
+        word->set_permuter(FREQ_DAWG_PERM);
+        adjust_factor += segment_penalty_dict_frequent_word;
+        new_rating *= adjust_factor;
+        if (debug) tprintf(", F");
+      } else {
+        adjust_factor += segment_penalty_dict_case_ok;
+        new_rating *= adjust_factor;
+        if (debug) tprintf(", ");
+      }
+    } else {
+      adjust_factor += segment_penalty_dict_case_bad;
+      new_rating *= adjust_factor;
+      if (debug) tprintf(", C");
+    }
+  }
+  new_rating -= kRatingPad;
+  if (modify_rating) word->set_rating(new_rating);
+  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
+  word->set_adjust_factor(adjust_factor);
+}
+
+// Checks the word (prefixed with any saved hyphen-word state when the
+// unicharsets match) against all loaded dawgs by feeding each letter to
+// letter_is_okay_. Returns the resulting permuter code, or NO_PERM when the
+// word is empty or not accepted; NUMBER_PERM results are accepted only when
+// numbers_ok is true (see valid_word_permuter).
+int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const {
+  const WERD_CHOICE* word_ptr = &word;
+  WERD_CHOICE temp_word(word.unicharset());
+  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
+    copy_hyphen_info(&temp_word);
+    temp_word += word;
+    word_ptr = &temp_word;
+  }
+  if (word_ptr->length() == 0) return NO_PERM;
+  // Allocate vectors for holding current and updated
+  // active_dawgs and initialize them.
+  DawgPositionVector active_dawgs[2];
+  init_active_dawgs(&(active_dawgs[0]), false);
+  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
+  int last_index = word_ptr->length() - 1;
+  // Call letter_is_okay for each letter in the word.
+  // (Starts at hyphen_base_size() to skip the already-processed base word.)
+  for (int i = hyphen_base_size(); i <= last_index; ++i) {
+    if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
+                                   word_ptr->unichar_id(i), i == last_index)))
+      break;
+    // Swap active_dawgs, constraints with the corresponding updated vector.
+    // Double-buffering: the vector just filled becomes the active one.
+    if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
+      dawg_args.updated_dawgs = &(active_dawgs[0]);
+      ++(dawg_args.active_dawgs);
+    } else {
+      ++(dawg_args.updated_dawgs);
+      dawg_args.active_dawgs = &(active_dawgs[0]);
+    }
+  }
+  return valid_word_permuter(dawg_args.permuter, numbers_ok)
+             ? dawg_args.permuter
+             : NO_PERM;
+}
+
+// Do the two WERD_CHOICEs form a meaningful bigram? Strips surrounding
+// punctuation from each word, replaces digit unichars with the cached '?'
+// unichar, joins the two cores with a space, and looks the normalized pair
+// up in bigram_dawg_. Returns false if no bigram dawg is loaded.
+bool Dict::valid_bigram(const WERD_CHOICE& word1,
+                        const WERD_CHOICE& word2) const {
+  if (bigram_dawg_ == nullptr) return false;
+
+  // Extract the core word from the middle of each word with any digits
+  // replaced with question marks.
+  int w1start, w1end, w2start, w2end;
+  word1.punct_stripped(&w1start, &w1end);
+  word2.punct_stripped(&w2start, &w2end);
+
+  // We don't want to penalize a single guillemet, hyphen, etc.
+  // But our bigram list doesn't have any information about punctuation.
+  if (w1start >= w1end) return word1.length() < 3;
+  if (w2start >= w2end) return word2.length() < 3;
+
+  const UNICHARSET& uchset = getUnicharset();
+  std::vector<UNICHAR_ID> bigram_string;
+  bigram_string.reserve(w1end + w2end + 1);
+  for (int i = w1start; i < w1end; i++) {
+    const auto &normed_ids =
+        getUnicharset().normed_ids(word1.unichar_id(i));
+    if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
+      bigram_string.push_back(question_unichar_id_);
+    else
+      bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
+  }
+  bigram_string.push_back(UNICHAR_SPACE);
+  for (int i = w2start; i < w2end; i++) {
+    const auto &normed_ids =
+        getUnicharset().normed_ids(word2.unichar_id(i));
+    if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
+      bigram_string.push_back(question_unichar_id_);
+    else
+      bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
+  }
+  // Build a WERD_CHOICE from the normalized unichar sequence for the lookup.
+  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
+  for (int i = 0; i < bigram_string.size(); ++i) {
+    normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f,
+                                                      0.0f);
+  }
+  return bigram_dawg_->word_in_dawg(normalized_word);
+}
+
+// Returns true if the word, with each run of alphanumeric unichars collapsed
+// to a single Dawg::kPatternUnicharID, is found in one of the loaded
+// punctuation dawgs. Returns false for an empty word, or if any unichar is
+// neither punctuation, alpha, nor digit.
+bool Dict::valid_punctuation(const WERD_CHOICE& word) {
+  // Fix: this function returns bool; the original returned the PermuterType
+  // constant NO_PERM here, relying on its implicit conversion to false.
+  if (word.length() == 0) return false;
+  int i;
+  WERD_CHOICE new_word(word.unicharset());
+  int last_index = word.length() - 1;
+  int new_len = 0;
+  for (i = 0; i <= last_index; ++i) {
+    UNICHAR_ID unichar_id = (word.unichar_id(i));
+    if (getUnicharset().get_ispunctuation(unichar_id)) {
+      new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
+    } else if (!getUnicharset().get_isalpha(unichar_id) &&
+               !getUnicharset().get_isdigit(unichar_id)) {
+      return false; // neither punc, nor alpha, nor digit
+    } else if ((new_len = new_word.length()) == 0 ||
+               new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
+      // Collapse a run of alphanumerics into a single pattern unichar.
+      new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
+    }
+  }
+  // Accept if any punctuation dawg contains the collapsed pattern word.
+  for (i = 0; i < dawgs_.size(); ++i) {
+    if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
+        dawgs_[i]->word_in_dawg(new_word))
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if the language is space-delimited, i.e. its unicharset
+/// contains no Han, Katakana, or Thai script (non-positive script ids).
+bool Dict::IsSpaceDelimitedLang() const {
+  const UNICHARSET& charset = getUnicharset();
+  return charset.han_sid() <= 0 && charset.katakana_sid() <= 0 &&
+         charset.thai_sid() <= 0;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/dict.h b/tesseract/src/dict/dict.h
new file mode 100644
index 00000000..e8ec2e37
--- /dev/null
+++ b/tesseract/src/dict/dict.h
@@ -0,0 +1,651 @@
+///////////////////////////////////////////////////////////////////////
+// File: dict.h
+// Description: dict class.
+// Author: Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_DICT_DICT_H_
+#define TESSERACT_DICT_DICT_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+
+#ifndef DISABLED_LEGACY_ENGINE
+#include "ambigs.h"
+#endif
+#include "dawg.h"
+#include "dawg_cache.h"
+#include "ratngs.h"
+#include "stopper.h"
+#include "trie.h"
+#include "unicharset.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "params_training_featdef.h"
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+namespace tesseract {
+
+// Forward declarations (defined elsewhere in ccstruct) to avoid includes.
+class MATRIX;
+class WERD_RES;
+
+#define CHARS_PER_LINE 500              // line buffer size -- TODO confirm use
+#define MAX_WERD_LENGTH (int64_t) 128   // max unichars per word -- TODO confirm
+#define NO_RATING -1                    // sentinel for an absent rating
+
+/** Struct used to hold temporary information about fragments. */
+struct CHAR_FRAGMENT_INFO {
+ UNICHAR_ID unichar_id;
+ const CHAR_FRAGMENT *fragment;
+ int num_fragments;
+ float rating;
+ float certainty;
+};
+
+using DawgVector = GenericVector<Dawg *>;
+
+//
+// Constants
+//
+static const int kRatingPad = 4;  // pad around ratings in Dict::adjust_word
+static const int kDictMaxWildcards = 2; // max wildcards for a word
+// TODO(daria): If hyphens are different in different languages and can be
+// inferred from training data we should load their values dynamically.
+static const char kHyphenSymbol[] = "-";
+static const char kSlashSymbol[] = "/";
+static const char kQuestionSymbol[] = "?";
+static const char kApostropheSymbol[] = "'";
+static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
+static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
+static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
+// Repetition limit used by Dict::add_document_word to reject words made of
+// kDocDictMaxRepChars (or more) identical consecutive unichars.
+static const int kDocDictMaxRepChars = 4;
+
+// Enum for describing whether the x-height for the word is consistent:
+// 0 - everything is good.
+// 1 - there are one or two secondary (but consistent) baselines
+//     [think subscript and superscript], or there is an oversized
+//     first character.
+// 2 - the word is inconsistent.
+enum XHeightConsistencyEnum {XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT};
+
+// State threaded through dawg traversal (see def_letter_is_okay): the dawg
+// positions active before the current letter, the positions still active
+// after it, and the best permuter code found so far.
+struct DawgArgs {
+  DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
+      : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
+
+  DawgPositionVector *active_dawgs;   // positions valid before current letter
+  DawgPositionVector *updated_dawgs;  // filled with positions after the letter
+  PermuterType permuter;              // best permuter code seen so far
+  // True if the current position is a valid word end.
+  bool valid_end;
+};
+
+class TESS_API Dict {
+ public:
+ Dict(CCUtil* image_ptr);
+ ~Dict();
+ const CCUtil* getCCUtil() const {
+ return ccutil_;
+ }
+ CCUtil* getCCUtil() {
+ return ccutil_;
+ }
+ const UNICHARSET& getUnicharset() const {
+ return getCCUtil()->unicharset;
+ }
+ UNICHARSET& getUnicharset() {
+ return getCCUtil()->unicharset;
+ }
+#ifndef DISABLED_LEGACY_ENGINE
+ const UnicharAmbigs &getUnicharAmbigs() const {
+ return getCCUtil()->unichar_ambigs;
+ }
+#endif
+ // Returns true if unichar_id is a word compounding character like - or /.
+ inline bool compound_marker(UNICHAR_ID unichar_id) {
+ const UNICHARSET& unicharset = getUnicharset();
+ ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+ const auto &normed_ids =
+ unicharset.normed_ids(unichar_id);
+ return normed_ids.size() == 1 &&
+ (normed_ids[0] == hyphen_unichar_id_ ||
+ normed_ids[0] == slash_unichar_id_);
+ }
+ // Returns true if unichar_id is an apostrophe-like character that may
+ // separate prefix/suffix words from a main body word.
+ inline bool is_apostrophe(UNICHAR_ID unichar_id) {
+ const UNICHARSET& unicharset = getUnicharset();
+ ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+ const auto &normed_ids =
+ unicharset.normed_ids(unichar_id);
+ return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
+ }
+
+ /* hyphen.cpp ************************************************************/
+
+ /// Returns true if we've recorded the beginning of a hyphenated word.
+ inline bool hyphenated() const { return
+ !last_word_on_line_ && hyphen_word_;
+ }
+ /// Size of the base word (the part on the line before) of a hyphenated word.
+ inline int hyphen_base_size() const {
+ return this->hyphenated() ? hyphen_word_->length() : 0;
+ }
+ /// If this word is hyphenated copy the base word (the part on
+ /// the line before) of a hyphenated word into the given word.
+ /// This function assumes that word is not nullptr.
+ inline void copy_hyphen_info(WERD_CHOICE *word) const {
+ if (this->hyphenated()) {
+ *word = *hyphen_word_;
+ if (hyphen_debug_level) word->print("copy_hyphen_info: ");
+ }
+ }
+ /// Check whether the word has a hyphen at the end.
+ inline bool has_hyphen_end(const UNICHARSET* unicharset,
+ UNICHAR_ID unichar_id, bool first_pos) const {
+ if (!last_word_on_line_ || first_pos)
+ return false;
+ ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
+ const auto &normed_ids =
+ unicharset->normed_ids(unichar_id);
+ return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
+ }
+ /// Same as above, but check the unichar at the end of the word.
+ inline bool has_hyphen_end(const WERD_CHOICE &word) const {
+ int word_index = word.length() - 1;
+ return has_hyphen_end(word.unicharset(), word.unichar_id(word_index),
+ word_index == 0);
+ }
+ /// Unless the previous word was the last one on the line, and the current
+ /// one is not (thus it is the first one on the line), erase hyphen_word_,
+ /// clear hyphen_active_dawgs_, update last_word_on_line_.
+ void reset_hyphen_vars(bool last_word_on_line);
+ /// Update hyphen_word_, and copy the given DawgPositionVectors into
+ /// hyphen_active_dawgs_ .
+ void set_hyphen_word(const WERD_CHOICE &word,
+ const DawgPositionVector &active_dawgs);
+
+ /* permdawg.cpp ************************************************************/
+ // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
+ // When this function is refactored, permdawg.cpp can be removed.
+
+ /// Copies word into best_choice if its rating is smaller
+ /// than that of best_choice.
+ inline void update_best_choice(const WERD_CHOICE &word,
+ WERD_CHOICE *best_choice) {
+ if (word.rating() < best_choice->rating()) {
+ *best_choice = word;
+ }
+ }
+ /// Fill the given active_dawgs vector with dawgs that could contain the
+ /// beginning of the word. If hyphenated() returns true, copy the entries
+ /// from hyphen_active_dawgs_ instead.
+ void init_active_dawgs(DawgPositionVector *active_dawgs,
+ bool ambigs_mode) const;
+ // Fill the given vector with the default collection of any-length dawgs
+ void default_dawgs(DawgPositionVector *anylength_dawgs,
+ bool suppress_patterns) const;
+
+
+ /// Recursively explore all the possible character combinations in
+ /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
+ /// dawgs in the dawgs_ vector in parallel and discard invalid words.
+ ///
+ /// Allocate and return a WERD_CHOICE with the best valid word found.
+ WERD_CHOICE *dawg_permute_and_select(
+ const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
+ /// If the choice being composed so far could be a dictionary word
+ /// and we have not reached the end of the word keep exploring the
+ /// char_choices further.
+ void go_deeper_dawg_fxn(
+ const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ bool word_ending, WERD_CHOICE *word, float certainties[],
+ float *limit, WERD_CHOICE *best_choice, int *attempts_left,
+ void *void_more_args);
+
+ /// Pointer to go_deeper function.
+ void (Dict::*go_deeper_fxn_)(const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ bool word_ending, WERD_CHOICE *word,
+ float certainties[], float *limit,
+ WERD_CHOICE *best_choice, int *attempts_left,
+ void *void_more_args);
+ //
+ // Helper functions for dawg_permute_and_select().
+ //
+ void permute_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args);
+
+ void append_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ const BLOB_CHOICE &blob_choice,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args);
+
+ bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
+ float curr_rating, float curr_certainty,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ const char *debug, int word_ending,
+ CHAR_FRAGMENT_INFO *char_frag_info);
+
+ /* stopper.cpp *************************************************************/
+#if !defined(DISABLED_LEGACY_ENGINE)
+ bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
+ DANGERR *fixpt,
+ bool fix_replaceable,
+ MATRIX* ratings);
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+ // Replaces the corresponding wrong ngram in werd_choice with the correct
+ // one. The whole correct n-gram is inserted into the ratings matrix and
+ // the werd_choice: no more fragments!. Rating and certainty of new entries
+ // in matrix and werd_choice are the sum and mean of the wrong ngram
+ // respectively.
+ // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
+ // mystring", with a new entry in the ratings matrix for ".
+ void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
+ UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
+ MATRIX *ratings);
+
+ /// Returns the length of the shortest alpha run in WordChoice.
+ int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
+ /// Returns true if the certainty of the BestChoice word is within a
+ /// reasonable range of the average certainties for the best choices for
+ /// each character in the segmentation. This test is used to catch words
+ /// in which one character is much worse than the other characters in the
+ /// word (i.e. false will be returned in that case). The algorithm computes
+ /// the mean and std deviation of the certainties in the word with the worst
+ /// certainty thrown out.
+ int UniformCertainties(const WERD_CHOICE& word);
+ /// Returns true if the given best_choice is good enough to stop.
+ bool AcceptableChoice(const WERD_CHOICE& best_choice,
+ XHeightConsistencyEnum xheight_consistency);
+ /// Returns false if the best choice for the current word is questionable
+ /// and should be tried again on the second pass or should be flagged to
+ /// the user.
+ bool AcceptableResult(WERD_RES *word) const;
+#if !defined(DISABLED_LEGACY_ENGINE)
+ void EndDangerousAmbigs();
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+ /// Prints the current choices for this word to stdout.
+ void DebugWordChoices();
+ /// Sets up stopper variables in preparation for the first pass.
+ void SettupStopperPass1();
+ /// Sets up stopper variables in preparation for the second pass.
+ void SettupStopperPass2();
+ /* context.cpp *************************************************************/
+ /// Check a string to see if it matches a set of lexical rules.
+ int case_ok(const WERD_CHOICE& word) const;
+ /// Returns true if the word looks like an absolute garbage
+ /// (e.g. image mistakenly recognized as text).
+ bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
+
+ /* dict.cpp ****************************************************************/
+
+ /// Initialize Dict class - load dawgs from [lang].traineddata and
+  /// user-specified wordlist and pattern list.
+ static DawgCache *GlobalDawgCache();
+ // Sets up ready for a Load or LoadLSTM.
+ void SetupForLoad(DawgCache *dawg_cache);
+ // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
+ void Load(const STRING &lang, TessdataManager *data_file);
+ // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
+ void LoadLSTM(const STRING &lang, TessdataManager *data_file);
+ // Completes the loading process after Load() and/or LoadLSTM().
+ // Returns false if no dictionaries were loaded.
+ bool FinishLoad();
+ void End();
+
+ // Resets the document dictionary analogous to ResetAdaptiveClassifier.
+ void ResetDocumentDictionary() {
+ if (pending_words_ != nullptr)
+ pending_words_->clear();
+ if (document_words_ != nullptr)
+ document_words_->clear();
+ }
+
+ /**
+ * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
+ * of the current state the letter at word_index in the given word
+ * is allowed according to at least one of the dawgs in dawgs_,
+ * otherwise returns NO_PERM.
+ *
+ * The state is described by void_dawg_args, which are interpreted as
+ * DawgArgs and contain relevant active dawg positions.
+ * Each entry in the active_dawgs vector contains an index
+ * into the dawgs_ vector and an EDGE_REF that indicates the last edge
+ * followed in the dawg. It also may contain a position in the punctuation
+ * dawg which describes surrounding punctuation (see struct DawgPosition).
+ *
+ * Input:
+ * At word_index 0 dawg_args->active_dawgs should contain an entry for each
+ * dawg that may start at the beginning of a word, with punc_ref and edge_ref
+ * initialized to NO_EDGE. Since the punctuation dawg includes the empty
+ * pattern " " (meaning anything without surrounding punctuation), having a
+ * single entry for the punctuation dawg will cover all dawgs reachable
+ * therefrom -- that includes all number and word dawgs. The only dawg
+ * non-reachable from the punctuation_dawg is the pattern dawg.
+ * If hyphen state needs to be applied, initial dawg_args->active_dawgs can
+ * be copied from the saved hyphen state (maintained by Dict).
+ * For word_index > 0 the corresponding state (active_dawgs and punc position)
+ * can be obtained from dawg_args->updated_dawgs passed to
+ * def_letter_is_okay for word_index-1.
+ * Note: the function assumes that active_dawgs, and updated_dawgs
+ * member variables of dawg_args are not nullptr.
+ *
+ * Output:
+ * The function fills in dawg_args->updated_dawgs vector with the
+ * entries for dawgs that contain the word up to the letter at word_index.
+ *
+ */
+
+ //
+ int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const;
+
+ int (Dict::*letter_is_okay_)(void* void_dawg_args,
+ const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const;
+ /// Calls letter_is_okay_ member function.
+ int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const {
+ return (this->*letter_is_okay_)(void_dawg_args,
+ unicharset, unichar_id, word_end);
+ }
+
+
+ /// Probability in context function used by the ngram permuter.
+ double (Dict::*probability_in_context_)(const char* lang,
+ const char* context,
+ int context_bytes,
+ const char* character,
+ int character_bytes);
+ /// Calls probability_in_context_ member function.
+ double ProbabilityInContext(const char* context,
+ int context_bytes,
+ const char* character,
+ int character_bytes) {
+ return (this->*probability_in_context_)(
+ getCCUtil()->lang.c_str(),
+ context, context_bytes,
+ character, character_bytes);
+ }
+
+ /// Default (no-op) implementation of probability in context function.
+ double def_probability_in_context(
+ const char* lang, const char* context, int context_bytes,
+ const char* character, int character_bytes) {
+ (void)lang;
+ (void)context;
+ (void)context_bytes;
+ (void)character;
+ (void)character_bytes;
+ return 0.0;
+ }
+
+ inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
+ inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; }
+ /// Return the number of dawgs in the dawgs_ vector.
+ inline int NumDawgs() const { return dawgs_.size(); }
+ /// Return i-th dawg pointer recorded in the dawgs_ vector.
+ inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
+ /// Return the points to the punctuation dawg.
+ inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
+ /// Return the points to the unambiguous words dawg.
+ inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
+ /// Returns the appropriate next node given the EDGE_REF.
+ static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
+ if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
+ NODE_REF node = dawg->next_node(edge_ref);
+ if (node == 0) node = NO_EDGE; // end of word
+ return node;
+ }
+
+ // Given a unichar from a string and a given dawg, return the unichar
+ // we should use to match in that dawg type. (for example, in the number
+ // dawg, all numbers are transformed to kPatternUnicharId).
+ UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch,
+ const Dawg *dawg) const {
+ if (!dawg) return ch;
+ switch (dawg->type()) {
+ case DAWG_TYPE_NUMBER:
+ return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
+ default:
+ return ch;
+ }
+ }
+
+ /// For each of the character classes of the given unichar_id (and the
+ /// unichar_id itself) finds the corresponding outgoing node or self-loop
+ /// in the given dawg and (after checking that it is valid) records it in
+  /// dawg_args->updated_dawgs. Updates current_permuter if any valid
+ /// edges were found.
+ void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
+ UNICHAR_ID unichar_id, bool word_end,
+ DawgArgs *dawg_args,
+ PermuterType *current_permuter) const;
+
+ /// Read/Write/Access special purpose dawgs which contain words
+ /// only of a certain length (used for phrase search for
+ /// non-space-delimited languages).
+
+ /// Check all the DAWGs to see if this word is in any of them.
+ inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
+ return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
+ perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
+ perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
+ (numbers_ok && perm == NUMBER_PERM));
+ }
+ int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
+ int valid_word(const WERD_CHOICE &word) const {
+ return valid_word(word, false); // return NO_PERM for words with digits
+ }
+ int valid_word_or_number(const WERD_CHOICE &word) const {
+ return valid_word(word, true); // return NUMBER_PERM for valid numbers
+ }
+ /// This function is used by api/tesseract_cube_combiner.cpp
+ int valid_word(const char *string) const {
+ WERD_CHOICE word(string, getUnicharset());
+ return valid_word(word);
+ }
+ // Do the two WERD_CHOICEs form a meaningful bigram?
+ bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
+ /// Returns true if the word contains a valid punctuation pattern.
+  /// Note: Since the domains of punctuation symbols and symbols
+ /// used in numbers are not disjoint, a valid number might contain
+ /// an invalid punctuation pattern (e.g. .99).
+ bool valid_punctuation(const WERD_CHOICE &word);
+ /// Returns true if a good answer is found for the unknown blob rating.
+ int good_choice(const WERD_CHOICE &choice);
+ /// Adds a word found on this document to the document specific dictionary.
+ void add_document_word(const WERD_CHOICE &best_choice);
+ /// Adjusts the rating of the given word.
+ void adjust_word(WERD_CHOICE *word,
+ bool nonword, XHeightConsistencyEnum xheight_consistency,
+ float additional_adjust,
+ bool modify_rating,
+ bool debug);
+ /// Set wordseg_rating_adjust_factor_ to the given value.
+ inline void SetWordsegRatingAdjustFactor(float f) {
+ wordseg_rating_adjust_factor_ = f;
+ }
+  /// Returns true if the language is space-delimited
+  /// (not Chinese, Japanese, or Thai).
+ bool IsSpaceDelimitedLang() const;
+
+ private:
+ /** Private member variables. */
+ CCUtil* ccutil_;
+ /**
+ * Table that stores ambiguities computed during training
+ * (loaded when NoDangerousAmbigs() is called for the first time).
+   * Each entry i in the table stores a set of ambiguities whose
+ * wrong ngram starts with unichar id i.
+ */
+#ifndef DISABLED_LEGACY_ENGINE
+ UnicharAmbigs* dang_ambigs_table_ = nullptr;
+ /** Same as above, but for ambiguities with replace flag set. */
+ UnicharAmbigs* replace_ambigs_table_ = nullptr;
+#endif
+ /** Additional certainty padding allowed before a word is rejected. */
+ float reject_offset_;
+ // Cached UNICHAR_IDs:
+ UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
+ UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
+ UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
+ UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
+ UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
+ // Hyphen-related variables.
+ WERD_CHOICE *hyphen_word_;
+ DawgPositionVector hyphen_active_dawgs_;
+ bool last_word_on_line_;
+ // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
+ // matching. The first member of each list is taken as canonical. For
+ // example, the first list contains hyphens and dashes with the first symbol
+ // being the ASCII hyphen minus.
+ std::vector<GenericVector<UNICHAR_ID> > equivalent_symbols_;
+ // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
+ DawgCache *dawg_cache_;
+ bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
+ // Dawgs.
+ DawgVector dawgs_;
+ SuccessorListsVector successors_;
+ Trie *pending_words_;
+ /// The following pointers are only cached for convenience.
+ /// The dawgs will be deleted when dawgs_ vector is destroyed.
+ // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
+ // any of them are present on the best choices list for a word pair.
+ // the bigrams are stored as space-separated words where:
+ // (1) leading and trailing punctuation has been removed from each word and
+ // (2) any digits have been replaced with '?' marks.
+ Dawg *bigram_dawg_;
+ // TODO(daria): need to support multiple languages in the future,
+ // so maybe will need to maintain a list of dawgs of each kind.
+ Dawg *freq_dawg_;
+ Dawg *unambig_dawg_;
+ Dawg *punc_dawg_;
+ Trie *document_words_;
+ /// Current segmentation cost adjust factor for word rating.
+ /// See comments in incorporate_segcost.
+ float wordseg_rating_adjust_factor_;
+ // File for recording ambiguities discovered during dictionary search.
+ FILE *output_ambig_words_file_;
+
+ public:
+ /// Variable members.
+ /// These have to be declared and initialized after image_ptr_, which contains
+ /// the pointer to the params vector - the member of its base CCUtil class.
+ STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
+ STRING_VAR_H(user_words_suffix, "",
+ "A suffix of user-provided words located in tessdata.");
+ STRING_VAR_H(user_patterns_file, "",
+ "A filename of user-provided patterns.");
+ STRING_VAR_H(user_patterns_suffix, "",
+ "A suffix of user-provided patterns located in tessdata.");
+ BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
+ BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
+ BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
+ BOOL_VAR_H(load_punc_dawg, true,
+ "Load dawg with punctuation patterns.");
+ BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
+ BOOL_VAR_H(load_bigram_dawg, true,
+ "Load dawg with special word bigrams.");
+ double_VAR_H(xheight_penalty_subscripts, 0.125,
+ "Score penalty (0.1 = 10%) added if there are subscripts "
+ "or superscripts in a word, but it is otherwise OK.");
+ double_VAR_H(xheight_penalty_inconsistent, 0.25,
+ "Score penalty (0.1 = 10%) added if an xheight is "
+ "inconsistent.");
+ double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
+ "Score multiplier for word matches which have good case and"
+ "are frequent in the given language (lower is better).");
+
+ double_VAR_H(segment_penalty_dict_case_ok, 1.1,
+ "Score multiplier for word matches that have good case "
+ "(lower is better).");
+
+ double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
+ "Default score multiplier for word matches, which may have "
+ "case issues (lower is better).");
+
+ double_VAR_H(segment_penalty_dict_nonword, 1.25,
+ "Score multiplier for glyph fragment segmentations which "
+ "do not match a dictionary word (lower is better).");
+
+ double_VAR_H(segment_penalty_garbage, 1.50,
+ "Score multiplier for poorly cased strings that are not in"
+ " the dictionary and generally look like garbage (lower is"
+ " better).");
+ STRING_VAR_H(output_ambig_words_file, "",
+ "Output file for ambiguities found in the dictionary");
+ INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
+ ", to 2 for more details, to 3 to see all the debug messages");
+ INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
+ BOOL_VAR_H(use_only_first_uft8_step, false,
+ "Use only the first UTF8 step of the given string"
+ " when computing log probabilities.");
+ double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
+ double_VAR_H(stopper_nondict_certainty_base, -2.50,
+ "Certainty threshold for non-dict words");
+ double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
+ "Reject certainty offset");
+ INT_VAR_H(stopper_smallword_size, 2,
+ "Size of dict word to be treated as non-dict word");
+ double_VAR_H(stopper_certainty_per_char, -0.50,
+ "Certainty to add for each dict char above small word size.");
+ double_VAR_H(stopper_allowable_character_badness, 3.0,
+ "Max certaintly variation allowed in a word (in sigma)");
+ INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
+ BOOL_VAR_H(stopper_no_acceptable_choices, false,
+ "Make AcceptableChoice() always return false. Useful"
+ " when there is a need to explore all segmentations");
+ INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
+ STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
+ " should be printed to stdout");
+ BOOL_VAR_H(segment_nonalphabetic_script, false,
+ "Don't use any alphabetic-specific tricks."
+ "Set to true in the traineddata config file for"
+ " scripts that are cursive or inherently fixed-pitch");
+ BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
+ double_VAR_H(doc_dict_pending_threshold, 0.0,
+ "Worst certainty for using pending dictionary");
+ double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
+ " for words that can be inserted into the document dictionary");
+ INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
+ " character choices to consider during permutation."
+ " This limit is especially useful when user patterns"
+ " are specified, since overly generic patterns can result in"
+ " dawg search exploring an overly large number of options.");
+};
+
+} // namespace tesseract
+
+#endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_
diff --git a/tesseract/src/dict/hyphen.cpp b/tesseract/src/dict/hyphen.cpp
new file mode 100644
index 00000000..7a0a622c
--- /dev/null
+++ b/tesseract/src/dict/hyphen.cpp
@@ -0,0 +1,61 @@
+/******************************************************************************
+ * File: hyphen.cpp (Formerly hyphen.c)
+ * Description: Functions for maintaining information about hyphenated words.
+ * Author: Mark Seaman, OCR Technology
+ * Status: Reusable Software Component
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+#include "dict.h"
+
+namespace tesseract {
+
+// Unless the previous word was the last one on the line, and the current
+// one is not (thus it is the first one on the line), erase hyphen_word_,
+// clear hyphen_active_dawgs_ and hyphen_constraints_, and update last_word_on_line_.
+void Dict::reset_hyphen_vars(bool last_word_on_line) {
+ if (!(last_word_on_line_ == true && last_word_on_line == false)) {
+ if (hyphen_word_ != nullptr) {
+ delete hyphen_word_;
+ hyphen_word_ = nullptr;
+ hyphen_active_dawgs_.clear();
+ }
+ }
+ if (hyphen_debug_level) {
+ tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n",
+ last_word_on_line_, last_word_on_line);
+ }
+ last_word_on_line_ = last_word_on_line;
+}
+
+// Update hyphen_word_, and copy the given DawgPositionVectors into
+// hyphen_active_dawgs_.
+void Dict::set_hyphen_word(const WERD_CHOICE &word,
+ const DawgPositionVector &active_dawgs) {
+ if (hyphen_word_ == nullptr) {
+ hyphen_word_ = new WERD_CHOICE(word.unicharset());
+ hyphen_word_->make_bad();
+ }
+ if (hyphen_word_->rating() > word.rating()) {
+ *hyphen_word_ = word;
+ // Remove the last unichar id as it is a hyphen, and remove
+ // any unichar_string/lengths that are present.
+ hyphen_word_->remove_last_unichar_id();
+ hyphen_active_dawgs_ = active_dawgs;
+ }
+ if (hyphen_debug_level) {
+ hyphen_word_->print("set_hyphen_word: ");
+ }
+}
+} // namespace tesseract
diff --git a/tesseract/src/dict/matchdefs.h b/tesseract/src/dict/matchdefs.h
new file mode 100644
index 00000000..b95dbc2b
--- /dev/null
+++ b/tesseract/src/dict/matchdefs.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ ** Filename: matchdefs.h
+ ** Purpose: Generic interface definitions for feature matchers.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef MATCHDEFS_H
+#define MATCHDEFS_H
+
+#include <tesseract/unichar.h>
+
+#include <climits> // INT16_MAX
+#include <cstdint> // int16_t
+
+namespace tesseract {
+
+/* define the maximum number of classes defined for any matcher
+ and the maximum class id for any matcher. This must be changed
+ if more different classes need to be classified */
+#define MAX_NUM_CLASSES INT16_MAX
+
+/** a CLASS_ID is the ascii character to be associated with a class */
+using CLASS_ID = UNICHAR_ID;
+#define NO_CLASS (0)
+
+/** a PROTO_ID is the index of a prototype within its class. Valid proto
+ id's are 0 to N-1 where N is the number of prototypes that make up the
+ class. */
+using PROTO_ID = int16_t;
+#define NO_PROTO (-1)
+
+/** FEATURE_ID is the index of a feature within a character description
+ The feature id ranges from 0 to N-1 where N is the number
+ of features in a character description. */
+using FEATURE_ID = uint8_t;
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/dict/permdawg.cpp b/tesseract/src/dict/permdawg.cpp
new file mode 100644
index 00000000..33f6b697
--- /dev/null
+++ b/tesseract/src/dict/permdawg.cpp
@@ -0,0 +1,390 @@
+/******************************************************************************
+ *
+ * File: permdawg.cpp (Formerly permdawg.c)
+ * Description: Scale word choices by a dictionary
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include "dawg.h"
+#include "stopper.h"
+#include "tprintf.h"
+#include "params.h"
+
+#include <algorithm>
+#include <cctype>
+#include "dict.h"
+
+/*----------------------------------------------------------------------
+ F u n c t i o n s
+----------------------------------------------------------------------*/
+namespace tesseract {
+
+/**
+ * @name go_deeper_dawg_fxn
+ *
+ * If the choice being composed so far could be a dictionary word
+ * keep exploring choices.
+ */
+void Dict::go_deeper_dawg_fxn(
+ const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
+ WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
+ auto *more_args = static_cast<DawgArgs *>(void_more_args);
+ word_ending = (char_choice_index == char_choices.size()-1);
+ int word_index = word->length() - 1;
+ if (best_choice->rating() < *limit) return;
+ // Look up char in DAWG
+
+ // If the current unichar is an ngram first try calling
+ // letter_is_okay() for each unigram it contains separately.
+ UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
+ bool checked_unigrams = false;
+ if (getUnicharset().get_isngram(orig_uch_id)) {
+ if (dawg_debug_level) {
+ tprintf("checking unigrams in an ngram %s\n",
+ getUnicharset().debug_str(orig_uch_id).c_str());
+ }
+ int num_unigrams = 0;
+ word->remove_last_unichar_id();
+ std::vector<UNICHAR_ID> encoding;
+ const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
+ // Since the string came out of the unicharset, failure is impossible.
+ ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
+ nullptr));
+ bool unigrams_ok = true;
+ // Construct DawgArgs that reflect the current state.
+ DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
+ DawgPositionVector unigram_updated_dawgs;
+ DawgArgs unigram_dawg_args(&unigram_active_dawgs,
+ &unigram_updated_dawgs,
+ more_args->permuter);
+ // Check unigrams in the ngram with letter_is_okay().
+ for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
+ UNICHAR_ID uch_id = encoding[i];
+ ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
+ ++num_unigrams;
+ word->append_unichar_id(uch_id, 1, 0.0, 0.0);
+ unigrams_ok = (this->*letter_is_okay_)(
+ &unigram_dawg_args, *word->unicharset(),
+ word->unichar_id(word_index+num_unigrams-1),
+ word_ending && i == encoding.size() - 1);
+ (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
+ if (dawg_debug_level) {
+ tprintf("unigram %s is %s\n",
+ getUnicharset().debug_str(uch_id).c_str(),
+ unigrams_ok ? "OK" : "not OK");
+ }
+ }
+ // Restore the word and copy the updated dawg state if needed.
+ while (num_unigrams-- > 0) word->remove_last_unichar_id();
+ word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
+ if (unigrams_ok) {
+ checked_unigrams = true;
+ more_args->permuter = unigram_dawg_args.permuter;
+ *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
+ }
+ }
+
+ // Check which dawgs from the dawgs_ vector contain the word
+ // up to and including the current unichar.
+ if (checked_unigrams || (this->*letter_is_okay_)(
+ more_args, *word->unicharset(), word->unichar_id(word_index),
+ word_ending)) {
+ // Add a new word choice
+ if (word_ending) {
+ if (dawg_debug_level) {
+ tprintf("found word = %s\n", word->debug_string().c_str());
+ }
+ if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
+ if (output_ambig_words_file_ == nullptr) {
+ output_ambig_words_file_ =
+ fopen(output_ambig_words_file.c_str(), "wb+");
+ if (output_ambig_words_file_ == nullptr) {
+ tprintf("Failed to open output_ambig_words_file %s\n",
+ output_ambig_words_file.c_str());
+ exit(1);
+ }
+ STRING word_str;
+ word->string_and_lengths(&word_str, nullptr);
+ word_str += " ";
+ fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+ }
+ STRING word_str;
+ word->string_and_lengths(&word_str, nullptr);
+ word_str += " ";
+ fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+ }
+ WERD_CHOICE *adjusted_word = word;
+ adjusted_word->set_permuter(more_args->permuter);
+ update_best_choice(*adjusted_word, best_choice);
+ } else { // search the next letter
+ // Make updated_* point to the next entries in the DawgPositionVector
+ // arrays (that were originally created in dawg_permute_and_select)
+ ++(more_args->updated_dawgs);
+ // Make active_dawgs and constraints point to the updated ones.
+ ++(more_args->active_dawgs);
+ permute_choices(debug, char_choices, char_choice_index + 1,
+ prev_char_frag_info, word, certainties, limit,
+ best_choice, attempts_left, more_args);
+ // Restore previous state to explore another letter in this position.
+ --(more_args->updated_dawgs);
+ --(more_args->active_dawgs);
+ }
+ } else {
+ if (dawg_debug_level) {
+ tprintf("last unichar not OK at index %d in %s\n",
+ word_index, word->debug_string().c_str());
+ }
+ }
+}
+
+
+/**
+ * dawg_permute_and_select
+ *
+ * Recursively explore all the possible character combinations in
+ * the given char_choices. Use go_deeper_dawg_fxn() to search all the
+ * dawgs in the dawgs_ vector in parallel and discard invalid words.
+ *
+ * Allocate and return a WERD_CHOICE with the best valid word found.
+ */
+WERD_CHOICE *Dict::dawg_permute_and_select(
+ const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
+ auto *best_choice = new WERD_CHOICE(&getUnicharset());
+ best_choice->make_bad();
+ best_choice->set_rating(rating_limit);
+ if (char_choices.size() == 0 || char_choices.size() > MAX_WERD_LENGTH)
+ return best_choice;
+ auto *active_dawgs =
+ new DawgPositionVector[char_choices.size() + 1];
+ init_active_dawgs(&(active_dawgs[0]), true);
+ DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
+ WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
+
+ float certainties[MAX_WERD_LENGTH];
+ this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
+ int attempts_left = max_permuter_attempts;
+ permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
+ char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
+ &attempts_left, &dawg_args);
+ delete[] active_dawgs;
+ return best_choice;
+}
+
+/**
+ * permute_choices
+ *
+ * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
+ * with the given char_choice_index in char_choices.
+ */
+void Dict::permute_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args) {
+ if (debug) {
+ tprintf("%s permute_choices: char_choice_index=%d"
+ " limit=%g rating=%g, certainty=%g word=%s\n",
+ debug, char_choice_index, *limit, word->rating(),
+ word->certainty(), word->debug_string().c_str());
+ }
+ if (char_choice_index < char_choices.size()) {
+ BLOB_CHOICE_IT blob_choice_it;
+ blob_choice_it.set_to_list(char_choices.get(char_choice_index));
+ for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
+ blob_choice_it.forward()) {
+ (*attempts_left)--;
+ append_choices(debug, char_choices, *(blob_choice_it.data()),
+ char_choice_index, prev_char_frag_info, word,
+ certainties, limit, best_choice, attempts_left, more_args);
+ if (*attempts_left <= 0) {
+ if (debug) tprintf("permute_choices(): attempts_left is 0\n");
+ break;
+ }
+ }
+ }
+}
+
+/**
+ * append_choices
+ *
+ * Checks to see whether or not the next choice is worth appending to
+ * the word being generated. If so then keeps going deeper into the word.
+ *
+ * This function assumes that Dict::go_deeper_fxn_ is set.
+ */
+void Dict::append_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ const BLOB_CHOICE &blob_choice,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args) {
+ int word_ending = (char_choice_index == char_choices.size() - 1);
+
+ // Deal with fragments.
+ CHAR_FRAGMENT_INFO char_frag_info;
+ if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
+ blob_choice.certainty(), prev_char_frag_info, debug,
+ word_ending, &char_frag_info)) {
+ return; // blob_choice must be an invalid fragment
+ }
+ // Search the next letter if this character is a fragment.
+ if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
+ permute_choices(debug, char_choices, char_choice_index + 1,
+ &char_frag_info, word, certainties, limit,
+ best_choice, attempts_left, more_args);
+ return;
+ }
+
+ // Add the next unichar.
+ float old_rating = word->rating();
+ float old_certainty = word->certainty();
+ uint8_t old_permuter = word->permuter();
+ certainties[word->length()] = char_frag_info.certainty;
+ word->append_unichar_id_space_allocated(
+ char_frag_info.unichar_id, char_frag_info.num_fragments,
+ char_frag_info.rating, char_frag_info.certainty);
+
+ // Explore the next unichar.
+ (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
+ &char_frag_info, word_ending, word, certainties,
+ limit, best_choice, attempts_left, more_args);
+
+ // Remove the unichar we added to explore other choices in its place.
+ word->remove_last_unichar_id();
+ word->set_rating(old_rating);
+ word->set_certainty(old_certainty);
+ word->set_permuter(old_permuter);
+}
+
+/**
+ * @name fragment_state_okay
+ *
+ * Given the current char choice and information about previously seen
+ * fragments, determines whether adjacent character fragments are
+ * present and whether they can be concatenated.
+ *
+ * The given prev_char_frag_info contains:
+ * - fragment: if not nullptr contains information about immediately
+ * preceding fragmented character choice
+ * - num_fragments: number of fragments that have been used so far
+ * to construct a character
+ * - certainty: certainty of the current choice or minimum
+ * certainty of all fragments concatenated so far
+ * - rating: rating of the current choice or sum of fragment
+ * ratings concatenated so far
+ *
+ * The output char_frag_info is filled in as follows:
+ * - character: is set to be nullptr if the choice is a non-matching
+ * or non-ending fragment piece; is set to unichar of the given choice
+ * if it represents a regular character or a matching ending fragment
+ * - fragment,num_fragments,certainty,rating are set as described above
+ *
+ * @returns false if a non-matching fragment is discovered, true otherwise.
+ */
+bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
+ float curr_rating, float curr_certainty,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ const char *debug, int word_ending,
+ CHAR_FRAGMENT_INFO *char_frag_info) {
+ const CHAR_FRAGMENT *this_fragment =
+ getUnicharset().get_fragment(curr_unichar_id);
+ const CHAR_FRAGMENT *prev_fragment =
+ prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
+
+ // Print debug info for fragments.
+ if (debug && (prev_fragment || this_fragment)) {
+ tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
+ getUnicharset().debug_str(curr_unichar_id).c_str(),
+ word_ending);
+ if (prev_fragment) {
+ tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
+ }
+ if (this_fragment) {
+ tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
+ }
+ }
+
+ char_frag_info->unichar_id = curr_unichar_id;
+ char_frag_info->fragment = this_fragment;
+ char_frag_info->rating = curr_rating;
+ char_frag_info->certainty = curr_certainty;
+ char_frag_info->num_fragments = 1;
+ if (prev_fragment && !this_fragment) {
+ if (debug) tprintf("Skip choice with incomplete fragment\n");
+ return false;
+ }
+ if (this_fragment) {
+ // We are dealing with a fragment.
+ char_frag_info->unichar_id = INVALID_UNICHAR_ID;
+ if (prev_fragment) {
+ if (!this_fragment->is_continuation_of(prev_fragment)) {
+ if (debug) tprintf("Non-matching fragment piece\n");
+ return false;
+ }
+ if (this_fragment->is_ending()) {
+ char_frag_info->unichar_id =
+ getUnicharset().unichar_to_id(this_fragment->get_unichar());
+ char_frag_info->fragment = nullptr;
+ if (debug) {
+ tprintf("Built character %s from fragments\n",
+ getUnicharset().debug_str(
+ char_frag_info->unichar_id).c_str());
+ }
+ } else {
+ if (debug) tprintf("Record fragment continuation\n");
+ char_frag_info->fragment = this_fragment;
+ }
+ // Update certainty and rating.
+ char_frag_info->rating =
+ prev_char_frag_info->rating + curr_rating;
+ char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
+ char_frag_info->certainty =
+ std::min(curr_certainty, prev_char_frag_info->certainty);
+ } else {
+ if (this_fragment->is_beginning()) {
+ if (debug) tprintf("Record fragment beginning\n");
+ } else {
+ if (debug) {
+ tprintf("Non-starting fragment piece with no prev_fragment\n");
+ }
+ return false;
+ }
+ }
+ }
+ if (word_ending && char_frag_info->fragment) {
+ if (debug) tprintf("Word can not end with a fragment\n");
+ return false;
+ }
+ return true;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/stopper.cpp b/tesseract/src/dict/stopper.cpp
new file mode 100644
index 00000000..36207d00
--- /dev/null
+++ b/tesseract/src/dict/stopper.cpp
@@ -0,0 +1,515 @@
+/******************************************************************************
+ ** Filename: stopper.c
+ ** Purpose: Stopping criteria for word classifier.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include <cstdio>
+#include <cstring>
+#include <cctype>
+#include <cmath>
+
+#include "stopper.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "ambigs.h"
+#endif
+#include "ccutil.h"
+#include "dict.h"
+#include "helpers.h"
+#include "matchdefs.h"
+#include "pageres.h"
+#include "params.h"
+#include "ratngs.h"
+#include <tesseract/unichar.h>
+
+/*----------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
+
+namespace tesseract {
+
+bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
+ XHeightConsistencyEnum xheight_consistency) {
+ float CertaintyThreshold = stopper_nondict_certainty_base;
+ int WordSize;
+
+ if (stopper_no_acceptable_choices) return false;
+
+ if (best_choice.length() == 0) return false;
+
+ bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
+ bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
+ bool is_case_ok = case_ok(best_choice);
+
+ if (stopper_debug_level >= 1) {
+ const char *xht = "UNKNOWN";
+ switch (xheight_consistency) {
+ case XH_GOOD: xht = "NORMAL"; break;
+ case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
+ case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
+ default: xht = "UNKNOWN";
+ }
+ tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
+ best_choice.unichar_string().c_str(),
+ (is_valid_word ? 'y' : 'n'),
+ (is_case_ok ? 'y' : 'n'),
+ xht,
+ best_choice.min_x_height(),
+ best_choice.max_x_height());
+ }
+ // Do not accept invalid words in PASS1.
+ if (reject_offset_ <= 0.0f && !is_valid_word) return false;
+ if (is_valid_word && is_case_ok) {
+ WordSize = LengthOfShortestAlphaRun(best_choice);
+ WordSize -= stopper_smallword_size;
+ if (WordSize < 0)
+ WordSize = 0;
+ CertaintyThreshold += WordSize * stopper_certainty_per_char;
+ }
+
+ if (stopper_debug_level >= 1)
+ tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
+ best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
+
+ if (no_dang_ambigs &&
+ best_choice.certainty() > CertaintyThreshold &&
+ xheight_consistency < XH_INCONSISTENT &&
+ UniformCertainties(best_choice)) {
+ return true;
+ } else {
+ if (stopper_debug_level >= 1) {
+ tprintf("AcceptableChoice() returned false"
+ " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
+ no_dang_ambigs, best_choice.certainty(),
+ CertaintyThreshold,
+ UniformCertainties(best_choice));
+ }
+ return false;
+ }
+}
+
+bool Dict::AcceptableResult(WERD_RES *word) const {
+ if (word->best_choice == nullptr) return false;
+ float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
+ int WordSize;
+
+ if (stopper_debug_level >= 1) {
+ tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
+ word->best_choice->debug_string().c_str(),
+ (valid_word(*word->best_choice) ? 'y' : 'n'),
+ (case_ok(*word->best_choice) ? 'y' : 'n'),
+ word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
+ word->best_choices.singleton() ? 'n' : 'y');
+ }
+
+ if (word->best_choice->length() == 0 || !word->best_choices.singleton())
+ return false;
+ if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) {
+ WordSize = LengthOfShortestAlphaRun(*word->best_choice);
+ WordSize -= stopper_smallword_size;
+ if (WordSize < 0)
+ WordSize = 0;
+ CertaintyThreshold += WordSize * stopper_certainty_per_char;
+ }
+
+ if (stopper_debug_level >= 1)
+ tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
+ word->best_choice->certainty(), CertaintyThreshold);
+
+ if (word->best_choice->certainty() > CertaintyThreshold &&
+ !stopper_no_acceptable_choices) {
+ if (stopper_debug_level >= 1)
+ tprintf("ACCEPTED\n");
+ return true;
+ } else {
+ if (stopper_debug_level >= 1)
+ tprintf("REJECTED\n");
+ return false;
+ }
+}
+
+#if !defined(DISABLED_LEGACY_ENGINE)
+
+// Returns true if best_choice is free of dangerous ambiguities, i.e. if no
+// alternative dictionary word can be assembled from the dangerous-ambiguity
+// table entries that match pieces of best_choice.
+// When fix_replaceable is true, a first pass applies the "replace" ambiguity
+// table, rewriting best_choice (and the ratings matrix) in place.
+// Each ambiguity found is appended to fixpt when fixpt is non-null.
+bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
+                            DANGERR *fixpt,
+                            bool fix_replaceable,
+                            MATRIX *ratings) {
+  if (stopper_debug_level > 2) {
+    tprintf("\nRunning NoDangerousAmbig() for %s\n",
+            best_choice->debug_string().c_str());
+  }
+
+  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
+  // for each unichar id in BestChoice.
+  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
+  int i;
+  bool ambigs_found = false;
+  // For each position in best_choice:
+  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
+  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
+  // -- look for ambiguities corresponding to wrong_ngram in the list while
+  //    adding the following unichar_ids from best_choice to wrong_ngram
+  //
+  // Repeat the above procedure twice: first time look through
+  // ambigs to be replaced and replace all the ambiguities found;
+  // second time look through dangerous ambiguities and construct
+  // ambig_blob_choices with a fake blob choice for each ambiguity
+  // and pass them to dawg_permute_and_select() to search for
+  // ambiguous words in the dictionaries.
+  //
+  // Note that during the execution of the for loop (on the first pass)
+  // if replacements are made the length of best_choice might change.
+  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
+    bool replace = (fix_replaceable && pass == 0);
+    const UnicharAmbigsVector &table = replace ?
+      getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
+    if (!replace) {
+      // Initialize ambig_blob_choices with lists containing a single
+      // unichar id for the corresponding position in best_choice.
+      // best_choice consisting from only the original letters will
+      // have a rating of 0.0.
+      for (i = 0; i < best_choice->length(); ++i) {
+        auto *lst = new BLOB_CHOICE_LIST();
+        BLOB_CHOICE_IT lst_it(lst);
+        // TODO(rays/antonova) Put real xheights and y shifts here.
+        lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
+                                          0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
+        ambig_blob_choices.push_back(lst);
+      }
+    }
+    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
+    int wrong_ngram_index;
+    int next_index;
+    int blob_index = 0;
+    for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
+         ++i) {
+      UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
+      if (stopper_debug_level > 2) {
+        tprintf("Looking for %s ngrams starting with %s:\n",
+                replace ? "replaceable" : "ambiguous",
+                getUnicharset().debug_str(curr_unichar_id).c_str());
+      }
+      int num_wrong_blobs = best_choice->state(i);
+      wrong_ngram_index = 0;
+      wrong_ngram[wrong_ngram_index] = curr_unichar_id;
+      if (curr_unichar_id == INVALID_UNICHAR_ID ||
+          curr_unichar_id >= table.size() ||
+          table[curr_unichar_id] == nullptr) {
+        continue; // there is no ambig spec for this unichar id
+      }
+      AmbigSpec_IT spec_it(table[curr_unichar_id]);
+      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
+        const AmbigSpec *ambig_spec = spec_it.data();
+        wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
+        int compare = UnicharIdArrayUtils::compare(wrong_ngram,
+                                                   ambig_spec->wrong_ngram);
+        if (stopper_debug_level > 2) {
+          tprintf("candidate ngram: ");
+          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
+          tprintf("current ngram from spec: ");
+          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
+          tprintf("comparison result: %d\n", compare);
+        }
+        if (compare == 0) {
+          // Record the place where we found an ambiguity.
+          if (fixpt != nullptr) {
+            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
+            fixpt->push_back(DANGERR_INFO(
+                blob_index, blob_index + num_wrong_blobs, replace,
+                getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
+                leftmost_id));
+            // NOTE(review): the debug output below prints a hardcoded 0
+            // ('false') for the 'replace' field actually stored in the
+            // DANGERR_INFO above.
+            if (stopper_debug_level > 1) {
+              tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
+                      blob_index + num_wrong_blobs, false,
+                      getUnicharset().get_isngram(
+                          ambig_spec->correct_ngram_id),
+                      getUnicharset().id_to_unichar(leftmost_id));
+            }
+          }
+
+          if (replace) {
+            if (stopper_debug_level > 2) {
+              tprintf("replace ambiguity with %s : ",
+                      getUnicharset().id_to_unichar(
+                          ambig_spec->correct_ngram_id));
+              UnicharIdArrayUtils::print(
+                  ambig_spec->correct_fragments, getUnicharset());
+            }
+            ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
+                         ambig_spec->correct_ngram_id,
+                         best_choice, ratings);
+          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
+            // We found dang ambig - update ambig_blob_choices.
+            if (stopper_debug_level > 2) {
+              tprintf("found ambiguity: ");
+              UnicharIdArrayUtils::print(
+                  ambig_spec->correct_fragments, getUnicharset());
+            }
+            ambigs_found = true;
+            for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
+                 ++tmp_index) {
+              // Add a blob choice for the corresponding fragment of the
+              // ambiguity. These fake blob choices are initialized with
+              // negative ratings (which are not possible for real blob
+              // choices), so that dawg_permute_and_select() considers any
+              // word not consisting of only the original letters a better
+              // choice and stops searching for alternatives once such a
+              // choice is found.
+              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
+              bc_it.add_to_end(new BLOB_CHOICE(
+                  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
+                  -1, 0, 1, 0, BCC_AMBIG));
+            }
+          }
+          spec_it.forward();
+        } else if (compare == -1) {
+          if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
+              ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
+            // Add the next unichar id to wrong_ngram and keep looking for
+            // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
+            wrong_ngram[++wrong_ngram_index] =
+              best_choice->unichar_id(next_index);
+            num_wrong_blobs += best_choice->state(next_index);
+          } else {
+            break; // no more matching ambigs in this AMBIG_SPEC_LIST
+          }
+        } else {
+          spec_it.forward();
+        }
+      }  // end searching AmbigSpec_LIST
+    }  // end searching best_choice
+  }  // end searching replace and dangerous ambigs
+
+  // If any ambiguities were found permute the constructed ambig_blob_choices
+  // to see if an alternative dictionary word can be found.
+  if (ambigs_found) {
+    if (stopper_debug_level > 2) {
+      tprintf("\nResulting ambig_blob_choices:\n");
+      for (i = 0; i < ambig_blob_choices.size(); ++i) {
+        print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
+        tprintf("\n");
+      }
+    }
+    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
+    // A negative rating is only reachable through the fake (negative-rated)
+    // blob choices above, so it means the permuter used at least one of them.
+    ambigs_found = (alt_word->rating() < 0.0);
+    if (ambigs_found) {
+      if (stopper_debug_level >= 1) {
+        tprintf ("Stopper: Possible ambiguous word = %s\n",
+                 alt_word->debug_string().c_str());
+      }
+      if (fixpt != nullptr) {
+        // Note: Currently character choices combined from fragments can only
+        // be generated by NoDangrousAmbigs(). This code should be updated if
+        // the capability to produce classifications combined from character
+        // fragments is added to other functions.
+        int orig_i = 0;
+        for (i = 0; i < alt_word->length(); ++i) {
+          const UNICHARSET &uchset = getUnicharset();
+          bool replacement_is_ngram =
+            uchset.get_isngram(alt_word->unichar_id(i));
+          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
+          if (replacement_is_ngram) {
+            // we have to extract the leftmost unichar from the ngram.
+            const char *str = uchset.id_to_unichar(leftmost_id);
+            int step = uchset.step(str);
+            if (step) leftmost_id = uchset.unichar_to_id(str, step);
+          }
+          int end_i = orig_i + alt_word->state(i);
+          if (alt_word->state(i) > 1 ||
+              (orig_i + 1 == end_i && replacement_is_ngram)) {
+            // Compute proper blob indices.
+            int blob_start = 0;
+            for (int j = 0; j < orig_i; ++j)
+              blob_start += best_choice->state(j);
+            int blob_end = blob_start;
+            for (int j = orig_i; j < end_i; ++j)
+              blob_end += best_choice->state(j);
+            fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
+                                          replacement_is_ngram, leftmost_id));
+            if (stopper_debug_level > 1) {
+              tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
+                      true, replacement_is_ngram,
+                      uchset.id_to_unichar(leftmost_id));
+            }
+          }
+          orig_i += alt_word->state(i);
+        }
+      }
+    }
+    delete alt_word;
+  }
+  if (output_ambig_words_file_ != nullptr) {
+    fprintf(output_ambig_words_file_, "\n");
+  }
+
+  ambig_blob_choices.delete_data_pointers();
+  return !ambigs_found;
+}
+
+void Dict::EndDangerousAmbigs() {}
+
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+
+// Sets up stopping criteria for recognition pass 1: no certainty
+// rejection offset is applied.
+void Dict::SettupStopperPass1() {
+  reject_offset_ = 0.0;
+}
+
+// Sets up stopping criteria for recognition pass 2: certainties are
+// shifted by the configured phase-2 rejection offset.
+void Dict::SettupStopperPass2() {
+  reject_offset_ = stopper_phase2_certainty_rejection_offset;
+}
+
+// Replaces the wrong_ngram_size unichars of werd_choice starting at
+// wrong_ngram_begin_index with the single unichar correct_ngram_id.
+// The merged BLOB_CHOICE gets the sum of the replaced choices' ratings and
+// the average of their certainties, and is inserted into (or upgraded in)
+// the ratings MATRIX cell spanning the merged blobs.
+void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
+                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
+                        MATRIX *ratings) {
+  int num_blobs_to_replace = 0;
+  int begin_blob_index = 0;
+  int i;
+  // Rating and certainty for the new BLOB_CHOICE are derived from the
+  // replaced choices.
+  float new_rating = 0.0f;
+  float new_certainty = 0.0f;
+  BLOB_CHOICE* old_choice = nullptr;
+  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
+    if (i >= wrong_ngram_begin_index) {
+      int num_blobs = werd_choice->state(i);
+      int col = begin_blob_index + num_blobs_to_replace;
+      int row = col + num_blobs - 1;
+      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
+      ASSERT_HOST(choices != nullptr);
+      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
+      ASSERT_HOST(old_choice != nullptr);
+      new_rating += old_choice->rating();
+      new_certainty += old_choice->certainty();
+      num_blobs_to_replace += num_blobs;
+    } else {
+      begin_blob_index += werd_choice->state(i);
+    }
+  }
+  // Ratings are summed, certainty is averaged over the replaced unichars.
+  new_certainty /= wrong_ngram_size;
+  // If there is no entry in the ratings matrix, add it.
+  MATRIX_COORD coord(begin_blob_index,
+                     begin_blob_index + num_blobs_to_replace - 1);
+  if (!coord.Valid(*ratings)) {
+    ratings->IncreaseBandSize(coord.row - coord.col + 1);
+  }
+  if (ratings->get(coord.col, coord.row) == nullptr)
+    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
+  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
+  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
+  if (choice != nullptr) {
+    // Already there. Upgrade if new rating better.
+    if (new_rating < choice->rating())
+      choice->set_rating(new_rating);
+    if (new_certainty < choice->certainty())
+      choice->set_certainty(new_certainty);
+    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
+  } else {
+    // Need a new choice with the correct_ngram_id.
+    choice = new BLOB_CHOICE(*old_choice);
+    choice->set_unichar_id(correct_ngram_id);
+    choice->set_rating(new_rating);
+    choice->set_certainty(new_certainty);
+    choice->set_classifier(BCC_AMBIG);
+    choice->set_matrix_cell(coord.col, coord.row);
+    BLOB_CHOICE_IT it (new_choices);
+    it.add_to_end(choice);
+  }
+  // Remove current unichar from werd_choice. On the last iteration
+  // set the correct replacement unichar instead of removing a unichar.
+  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
+       ++replaced_count) {
+    if (replaced_count + 1 == wrong_ngram_size) {
+      werd_choice->set_blob_choice(wrong_ngram_begin_index,
+                                   num_blobs_to_replace, choice);
+    } else {
+      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
+    }
+  }
+  if (stopper_debug_level >= 1) {
+    werd_choice->print("ReplaceAmbig() ");
+    tprintf("Modified blob_choices: ");
+    print_ratings_list("\n", new_choices, getUnicharset());
+  }
+}
+
+// Returns the length of the shortest run of consecutive alphabetic unichars
+// in WordChoice, or 0 if the word contains no alphabetic unichar at all.
+int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
+  int min_run = INT32_MAX;
+  int run_len = 0;
+  const int num_unichars = WordChoice.length();
+  for (int pos = 0; pos < num_unichars; ++pos) {
+    if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(pos))) {
+      ++run_len;
+    } else if (run_len > 0) {
+      // A non-alpha unichar closes the current run.
+      if (run_len < min_run) min_run = run_len;
+      run_len = 0;
+    }
+  }
+  if (run_len > 0 && run_len < min_run) {
+    min_run = run_len;  // the word ends inside an alpha run
+  } else if (min_run == INT32_MAX) {
+    min_run = 0;  // no alpha run was ever seen
+  }
+  return min_run;
+}
+
+// Returns true (as an int) if the per-unichar certainties of word are
+// sufficiently uniform. Mean and standard deviation are computed with the
+// single worst certainty excluded; the word fails if its overall certainty
+// is more than stopper_allowable_character_badness standard deviations
+// below the mean (threshold additionally capped at
+// stopper_nondict_certainty_base). Words shorter than 3 unichars pass.
+int Dict::UniformCertainties(const WERD_CHOICE& word) {
+  float Certainty;
+  float WorstCertainty = FLT_MAX;
+  float CertaintyThreshold;
+  double TotalCertainty;
+  double TotalCertaintySquared;
+  double Variance;
+  float Mean, StdDev;
+  int word_length = word.length();
+
+  if (word_length < 3)
+    return true;
+
+  TotalCertainty = TotalCertaintySquared = 0.0;
+  for (int i = 0; i < word_length; ++i) {
+    Certainty = word.certainty(i);
+    TotalCertainty += Certainty;
+    TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
+    if (Certainty < WorstCertainty)
+      WorstCertainty = Certainty;
+  }
+
+  // Subtract off worst certainty from statistics.
+  word_length--;
+  TotalCertainty -= WorstCertainty;
+  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
+
+  Mean = TotalCertainty / word_length;
+  Variance = ((word_length * TotalCertaintySquared -
+               TotalCertainty * TotalCertainty) /
+              (word_length * (word_length - 1)));
+  // Guard against tiny negative variance from floating-point rounding.
+  if (Variance < 0.0)
+    Variance = 0.0;
+  StdDev = sqrt(Variance);
+
+  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
+  if (CertaintyThreshold > stopper_nondict_certainty_base)
+    CertaintyThreshold = stopper_nondict_certainty_base;
+
+  if (word.certainty() < CertaintyThreshold) {
+    if (stopper_debug_level >= 1)
+      tprintf("Stopper: Non-uniform certainty = %4.1f"
+              " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
+              word.certainty(), Mean, StdDev, CertaintyThreshold);
+    return false;
+  } else {
+    return true;
+  }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/stopper.h b/tesseract/src/dict/stopper.h
new file mode 100644
index 00000000..d7df3988
--- /dev/null
+++ b/tesseract/src/dict/stopper.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ ** Filename: stopper.h
+ ** Purpose: Stopping criteria for word classifier.
+ ** Author: Dan Johnson
+ ** History: Wed May 1 09:42:57 1991, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+#ifndef STOPPER_H
+#define STOPPER_H
+
+#include "params.h"
+#include "ratngs.h"
+
+#include "genericvector.h"
+#include <tesseract/unichar.h>
+
+namespace tesseract {
+
+class WERD_CHOICE;
+
+using BLOB_WIDTH = uint8_t;
+
+// Description of one ambiguity match recorded by Dict::NoDangerousAmbig().
+struct DANGERR_INFO {
+  DANGERR_INFO() :
+    begin(-1), end(-1), dangerous(false), correct_is_ngram(false),
+    leftmost(INVALID_UNICHAR_ID) {}
+  DANGERR_INFO(int b, int e, bool d, bool n, UNICHAR_ID l) :
+    begin(b), end(e), dangerous(d), correct_is_ngram(n), leftmost(l) {}
+  int begin;  // first blob index covered by the ambiguity
+  int end;    // one past the last blob index covered
+  bool dangerous;  // NOTE(review): call sites pass both 'replace' and 'true'
+                   // here -- confirm the intended polarity before relying on it
+  bool correct_is_ngram;  // true if the replacement is a single ngram unichar
+  UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?
+};
+
+using DANGERR = GenericVector<DANGERR_INFO>;
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/dict/trie.cpp b/tesseract/src/dict/trie.cpp
new file mode 100644
index 00000000..e9edbbaf
--- /dev/null
+++ b/tesseract/src/dict/trie.cpp
@@ -0,0 +1,716 @@
+/******************************************************************************
+ *
+ * File: trie.cpp (Formerly trie.c)
+ * Description: Functions to build a trie data structure.
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include "trie.h"
+
+#include "dawg.h"
+#include "dict.h"
+#include "genericvector.h"
+#include "helpers.h"
+#include "kdpair.h"
+
+namespace tesseract {
+
+// Printable names for the RTLReversePolicy enum, indexed by enum value.
+const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
+const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
+const char kForceReverse[] = "RRP_FORCE_REVERSE";
+
+const char * const RTLReversePolicyNames[] = {
+  kDoNotReverse,
+  kReverseIfHasRTL,
+  kForceReverse
+};
+
+// Reserved Unicode code points (U+2000..U+2005) used as stand-in unichars
+// for user-pattern character classes; registered via initialize_patterns().
+const char Trie::kAlphaPatternUnicode[] = "\u2000";
+const char Trie::kDigitPatternUnicode[] = "\u2001";
+const char Trie::kAlphanumPatternUnicode[] = "\u2002";
+const char Trie::kPuncPatternUnicode[] = "\u2003";
+const char Trie::kLowerPatternUnicode[] = "\u2004";
+const char Trie::kUpperPatternUnicode[] = "\u2005";
+
+// Returns the string name of the given RTL reverse policy.
+// Note: reverse_policy is used directly as an array index; it must be a
+// valid RTLReversePolicy value.
+const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
+  return RTLReversePolicyNames[reverse_policy];
+}
+
+// Reset the Trie to empty: free all node storage, drop the root backward
+// freelist and edge count, then re-create node 0 so the trie is usable.
+void Trie::clear() {
+  nodes_.delete_data_pointers();
+  nodes_.clear();
+  root_back_freelist_.clear();
+  num_edges_ = 0;
+  new_dawg_node(); // Need to allocate node 0.
+}
+
+// Searches the edges of node_ref (forward_edges or backward_edges according
+// to direction) for an edge whose next-node / word-end / unichar-id fields
+// match the given values. On success fills *edge_ptr and *edge_index and
+// returns true. The root's forward edges are kept sorted, so they are
+// binary-searched; every other edge vector is scanned linearly.
+// Returns false for NO_EDGE or when no matching edge exists.
+bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
+                        int direction, bool word_end, UNICHAR_ID unichar_id,
+                        EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const {
+  if (debug_level_ == 3) {
+    tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
+            " direction %d word_end %d unichar_id %d, exploring node:\n",
+            node_ref, next_node, direction, word_end, unichar_id);
+    if (node_ref != NO_EDGE) {
+      print_node(node_ref, nodes_[node_ref]->forward_edges.size());
+    }
+  }
+  if (node_ref == NO_EDGE) return false;
+  assert(node_ref < nodes_.size());
+  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
+    nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
+  int vec_size = vec.size();
+  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
+    EDGE_INDEX start = 0;
+    EDGE_INDEX end = vec_size - 1;
+    EDGE_INDEX k;
+    int compare;
+    while (start <= end) {
+      k = (start + end) >> 1; // (start + end) / 2
+      compare = given_greater_than_edge_rec(next_node, word_end,
+                                            unichar_id, vec[k]);
+      if (compare == 0) { // given == vec[k]
+        *edge_ptr = &(vec[k]);
+        *edge_index = k;
+        return true;
+      } else if (compare == 1) { // given > vec[k]
+        start = k + 1;
+      } else { // given < vec[k]
+        end = k - 1;
+      }
+    }
+  } else { // linear search
+    for (int i = 0; i < vec_size; ++i) {
+      EDGE_RECORD &edge_rec = vec[i];
+      if (edge_rec_match(next_node, word_end, unichar_id,
+                         next_node_from_edge_rec(edge_rec),
+                         end_of_word_from_edge_rec(edge_rec),
+                         unichar_id_from_edge_rec(edge_rec))) {
+        *edge_ptr = &(edge_rec);
+        *edge_index = i;
+        return true;
+      }
+    }
+  }
+  return false; // not found
+}
+
+// Creates an edge record (node1 -> node2) with the given flags and adds it
+// to node1's edge vector for the given direction. The root's forward edges
+// stay sorted via insertion at the computed position; the root's backward
+// edges re-use slots from root_back_freelist_ when available; all other
+// edges are appended unsorted. Always returns true.
+bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
+                            int direction, bool word_end,
+                            UNICHAR_ID unichar_id) {
+  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
+    &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
+  int search_index;
+  if (node1 == 0 && direction == FORWARD_EDGE) {
+    search_index = 0; // find the index to make the add sorted
+    while (search_index < vec->size() &&
+           given_greater_than_edge_rec(node2, word_end, unichar_id,
+                                       (*vec)[search_index]) == 1) {
+      search_index++;
+    }
+  } else {
+    search_index = vec->size(); // add is unsorted, so index does not matter
+  }
+  EDGE_RECORD edge_rec;
+  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
+  if (node1 == 0 && direction == BACKWARD_EDGE &&
+      !root_back_freelist_.empty()) {
+    EDGE_INDEX edge_index = root_back_freelist_.pop_back();
+    (*vec)[edge_index] = edge_rec;
+  } else if (search_index < vec->size()) {
+    vec->insert(edge_rec, search_index);
+  } else {
+    vec->push_back(edge_rec);
+  }
+  if (debug_level_ > 1) {
+    tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
+    print_edge_rec(edge_rec);
+    tprintf("\n");
+  }
+  num_edges_++;
+  return true;
+}
+
+// Marks the given forward edge and the matching backward edge leaving
+// the_next_node as end-of-word, propagating marker_flag to both records.
+void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
+                           NODE_REF the_next_node,
+                           bool marker_flag,
+                           UNICHAR_ID unichar_id) {
+  EDGE_RECORD *back_edge_ptr;
+  EDGE_INDEX back_edge_index;
+  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
+                           unichar_id, &back_edge_ptr, &back_edge_index));
+  if (marker_flag) {
+    *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
+    *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
+  }
+  // Mark both directions as end of word.
+  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
+  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
+}
+
+// Adds a word (with optional parallel per-unichar repetition flags) to the
+// trie. Existing edges are followed while the prefix is already present;
+// new nodes/edges are created for the remainder; the final unichar is
+// marked as end-of-word (or linked back to the root for a brand-new leaf).
+// Returns false without modifying the trie if the word is empty or contains
+// out-of-range unichar ids; if an internal add fails, the whole trie is
+// cleared and false is returned.
+bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
+                            const GenericVector<bool> *repetitions) {
+  if (word.length() <= 0) return false; // can't add empty words
+  if (repetitions != nullptr) ASSERT_HOST(repetitions->size() == word.length());
+  // Make sure the word does not contain invalid unichar ids.
+  for (int i = 0; i < word.length(); ++i) {
+    if (word.unichar_id(i) < 0 ||
+        word.unichar_id(i) >= unicharset_size_) return false;
+  }
+
+  EDGE_RECORD *edge_ptr;
+  NODE_REF last_node = 0;
+  NODE_REF the_next_node;
+  bool marker_flag = false;
+  EDGE_INDEX edge_index;
+  int i;
+  int32_t still_finding_chars = true;
+  int32_t word_end = false;
+  bool add_failed = false;
+  bool found;
+
+  if (debug_level_ > 1) word.print("\nAdding word: ");
+
+  UNICHAR_ID unichar_id;
+  for (i = 0; i < word.length() - 1; ++i) {
+    unichar_id = word.unichar_id(i);
+    marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
+    if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
+    if (still_finding_chars) {
+      found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
+                           unichar_id, &edge_ptr, &edge_index);
+      if (found && debug_level_ > 1) {
+        tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
+                edge_index, last_node);
+      }
+      if (!found) {
+        still_finding_chars = false;
+      } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
+        // We hit the end of an existing word, but the new word is longer.
+        // In this case we have to disconnect the existing word from the
+        // backwards root node, mark the current position as end-of-word
+        // and add new nodes for the increased length. Disconnecting the
+        // existing word from the backwards root node requires a linear
+        // search, so it is much faster to add the longest words first,
+        // to avoid having to come here.
+        word_end = true;
+        still_finding_chars = false;
+        remove_edge(last_node, 0, word_end, unichar_id);
+      } else {
+        // We have to add a new branch here for the new word.
+        if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
+        last_node = next_node_from_edge_rec(*edge_ptr);
+      }
+    }
+    if (!still_finding_chars) {
+      the_next_node = new_dawg_node();
+      if (debug_level_ > 1)
+        tprintf("adding node " REFFORMAT "\n", the_next_node);
+      if (the_next_node == 0) {
+        add_failed = true;
+        break;
+      }
+      if (!add_new_edge(last_node, the_next_node,
+                        marker_flag, word_end, unichar_id)) {
+        add_failed = true;
+        break;
+      }
+      word_end = false;
+      last_node = the_next_node;
+    }
+  }
+  // Handle the last unichar of the word separately.
+  the_next_node = 0;
+  unichar_id = word.unichar_id(i);
+  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
+  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
+  if (still_finding_chars &&
+      edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
+                   unichar_id, &edge_ptr, &edge_index)) {
+    // An extension of this word already exists in the trie, so we
+    // only have to add the ending flags in both directions.
+    add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
+                    marker_flag, unichar_id);
+  } else {
+    // Add a link to node 0. All leaves connect to node 0 so the back links can
+    // be used in reduction to a dawg. This root backward node has one edge
+    // entry for every word, (except prefixes of longer words) so it is huge.
+    if (!add_failed &&
+        !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
+      add_failed = true;
+  }
+  if (add_failed) {
+    tprintf("Re-initializing document dictionary...\n");
+    clear();
+    return false;
+  } else {
+    return true;
+  }
+}
+
+// Allocates a fresh empty node, appends it to nodes_ and returns its index,
+// which serves as the NODE_REF of the new node.
+NODE_REF Trie::new_dawg_node() {
+  auto *node = new TRIE_NODE_RECORD();
+  nodes_.push_back(node);
+  return nodes_.size() - 1;
+}
+
+// Reads the word list from filename, sorts it by decreasing byte length
+// (adding longer words before their prefixes avoids the expensive root-node
+// surgery described in add_word_to_dawg) and adds all words.
+bool Trie::read_and_add_word_list(const char *filename,
+                                  const UNICHARSET &unicharset,
+                                  Trie::RTLReversePolicy reverse_policy) {
+  std::vector<STRING> word_list;
+  if (!read_word_list(filename, &word_list)) return false;
+  std::sort(word_list.begin(), word_list.end(), [](auto &s1, auto &s2) {
+    return s1.size() > s2.size();
+  });
+  return add_word_list(word_list, unicharset, reverse_policy);
+}
+
+// Reads one word per line from filename into words. Returns false only when
+// the file cannot be opened; otherwise reads to EOF and returns true.
+// Progress is logged every 10000 words when debug_level_ is non-zero.
+bool Trie::read_word_list(const char *filename,
+                          std::vector<STRING>* words) {
+  FILE *word_file = fopen(filename, "rb");
+  if (word_file == nullptr) return false;
+
+  char line_str[CHARS_PER_LINE];
+  int word_count = 0;
+  while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
+    chomp_string(line_str);  // strip the trailing newline
+    ++word_count;
+    if (debug_level_ && word_count % 10000 == 0)
+      tprintf("Read %d words so far\n", word_count);
+    words->push_back(STRING(line_str));
+  }
+  if (debug_level_)
+    tprintf("Read %d words total.\n", word_count);
+  fclose(word_file);
+  return true;
+}
+
+// Adds every parseable word from words to the dawg, reversing unichar ids
+// according to reverse_policy. Words that are empty or contain unichars
+// missing from unicharset are skipped. Returns false (stopping early) if a
+// word is still absent from the dawg right after being added.
+bool Trie::add_word_list(const std::vector<STRING> &words,
+                         const UNICHARSET &unicharset,
+                         Trie::RTLReversePolicy reverse_policy) {
+  for (const STRING &entry : words) {
+    WERD_CHOICE word(entry.c_str(), unicharset);
+    if (word.length() == 0 || word.contains_unichar_id(INVALID_UNICHAR_ID))
+      continue;
+    const bool reverse =
+        reverse_policy == RRP_FORCE_REVERSE ||
+        (reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
+         word.has_rtl_unichar_id());
+    if (reverse) word.reverse_and_mirror_unichar_ids();
+    if (word_in_dawg(word)) continue;  // already present; nothing to do
+    add_word_to_dawg(word);
+    if (!word_in_dawg(word)) {
+      tprintf("Error: word '%s' not in DAWG after adding it\n",
+              entry.c_str());
+      return false;
+    }
+  }
+  return true;
+}
+
+// Registers the six reserved pattern code points in the unicharset and
+// caches the unichar id assigned to each in the matching member field.
+// Must be called before read_pattern_list().
+void Trie::initialize_patterns(UNICHARSET *unicharset) {
+  struct PatternEntry {
+    const char *unicode;  // reserved code point to insert
+    UNICHAR_ID *id;       // member that receives the assigned id
+  };
+  // Same registration order as the original one-by-one inserts.
+  const PatternEntry entries[] = {
+      {kAlphaPatternUnicode, &alpha_pattern_},
+      {kDigitPatternUnicode, &digit_pattern_},
+      {kAlphanumPatternUnicode, &alphanum_pattern_},
+      {kPuncPatternUnicode, &punc_pattern_},
+      {kLowerPatternUnicode, &lower_pattern_},
+      {kUpperPatternUnicode, &upper_pattern_},
+  };
+  for (const PatternEntry &entry : entries) {
+    unicharset->unichar_insert(entry.unicode);
+    *entry.id = unicharset->unichar_to_id(entry.unicode);
+  }
+  initialized_patterns_ = true;
+  unicharset_size_ = unicharset->size();
+}
+
+// Appends to vec every pattern unichar id matching unichar_id's character
+// classes in unicharset: alpha adds alpha + alphanum (+ lower or upper);
+// digit adds digit (+ alphanum unless already added as alpha);
+// punctuation adds punc.
+void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
+                                  const UNICHARSET &unicharset,
+                                  GenericVector<UNICHAR_ID> *vec) const {
+  bool is_alpha = unicharset.get_isalpha(unichar_id);
+  if (is_alpha) {
+    vec->push_back(alpha_pattern_);
+    vec->push_back(alphanum_pattern_);
+    if (unicharset.get_islower(unichar_id)) {
+      vec->push_back(lower_pattern_);
+    } else if (unicharset.get_isupper(unichar_id)) {
+      vec->push_back(upper_pattern_);
+    }
+  }
+  if (unicharset.get_isdigit(unichar_id)) {
+    vec->push_back(digit_pattern_);
+    // avoid pushing alphanum_pattern_ twice for alphanumeric unichars
+    if (!is_alpha) vec->push_back(alphanum_pattern_);
+  }
+  if (unicharset.get_ispunctuation(unichar_id)) {
+    vec->push_back(punc_pattern_);
+  }
+}
+
+// Maps a user-pattern character-class letter (the character written after a
+// backslash in a pattern file) to the corresponding pattern unichar id:
+//   c=alpha, d=digit, n=alphanumeric, p=punctuation, a=lower, A=upper.
+// Any other character yields INVALID_UNICHAR_ID.
+UNICHAR_ID Trie::character_class_to_pattern(char ch) {
+  switch (ch) {
+    case 'c': return alpha_pattern_;
+    case 'd': return digit_pattern_;
+    case 'n': return alphanum_pattern_;
+    case 'p': return punc_pattern_;
+    case 'a': return lower_pattern_;
+    case 'A': return upper_pattern_;
+    default: return INVALID_UNICHAR_ID;
+  }
+}
+
+// Reads user-defined patterns from filename and inserts them into the trie.
+// Each line is a UTF-8 pattern; "\c \d \n \p \a \A" (after at least
+// kSaneNumConcreteChars concrete characters) denote character classes (see
+// character_class_to_pattern), "\\" escapes a literal backslash, and "\*"
+// following a unichar marks it as repeatable. Invalid lines are reported
+// and skipped. Returns false if initialize_patterns() was not called first
+// or the file cannot be opened.
+bool Trie::read_pattern_list(const char *filename,
+                             const UNICHARSET &unicharset) {
+  if (!initialized_patterns_) {
+    tprintf("please call initialize_patterns() before read_pattern_list()\n");
+    return false;
+  }
+
+  FILE *pattern_file = fopen(filename, "rb");
+  if (pattern_file == nullptr) {
+    tprintf("Error opening pattern file %s\n", filename);
+    return false;
+  }
+
+  int pattern_count = 0;
+  char string[CHARS_PER_LINE];
+  while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
+    chomp_string(string); // remove newline
+    // Parse the pattern and construct a unichar id vector.
+    // Record the number of repetitions of each unichar in the parallel vector.
+    WERD_CHOICE word(&unicharset);
+    GenericVector<bool> repetitions_vec;
+    const char *str_ptr = string;
+    int step = unicharset.step(str_ptr);
+    bool failed = false;
+    while (step > 0) {
+      UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
+      if (step == 1 && *str_ptr == '\\') {
+        ++str_ptr;
+        if (*str_ptr == '\\') { // regular '\' unichar that was escaped
+          curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
+        } else {
+          if (word.length() < kSaneNumConcreteChars) {
+            tprintf("Please provide at least %d concrete characters at the"
+                    " beginning of the pattern\n", kSaneNumConcreteChars);
+            failed = true;
+            break;
+          }
+          // Parse character class from expression.
+          curr_unichar_id = character_class_to_pattern(*str_ptr);
+        }
+      } else {
+        curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
+      }
+      if (curr_unichar_id == INVALID_UNICHAR_ID) {
+        failed = true;
+        break; // failed to parse this pattern
+      }
+      word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
+      repetitions_vec.push_back(false);
+      str_ptr += step;
+      step = unicharset.step(str_ptr);
+      // Check if there is a repetition pattern specified after this unichar.
+      if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
+        repetitions_vec[repetitions_vec.size()-1] = true;
+        str_ptr += 2;
+        step = unicharset.step(str_ptr);
+      }
+    }
+    if (failed) {
+      tprintf("Invalid user pattern %s\n", string);
+      continue;
+    }
+    // Insert the pattern into the trie.
+    if (debug_level_ > 2) {
+      tprintf("Inserting expanded user pattern %s\n",
+              word.debug_string().c_str());
+    }
+    if (!this->word_in_dawg(word)) {
+      this->add_word_to_dawg(word, &repetitions_vec);
+      if (!this->word_in_dawg(word)) {
+        tprintf("Error: failed to insert pattern '%s'\n", string);
+      }
+    }
+    ++pattern_count;
+  }
+  if (debug_level_) {
+    tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
+  }
+  fclose(pattern_file);
+  return true;
+}
+
+// Removes the edge from node1 matching (node2, direction, word_end,
+// unichar_id). Root backward edges are only zeroed (KillEdge) with their
+// slot recorded in root_back_freelist_, because erasing from that huge
+// vector would be O(n); all other edges are removed outright.
+void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
+                               bool word_end, UNICHAR_ID unichar_id) {
+  EDGE_RECORD *edge_ptr = nullptr;
+  EDGE_INDEX edge_index = 0;
+  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
+                           unichar_id, &edge_ptr, &edge_index));
+  if (debug_level_ > 1) {
+    tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
+    print_edge_rec(*edge_ptr);
+    tprintf("\n");
+  }
+  if (direction == FORWARD_EDGE) {
+    nodes_[node1]->forward_edges.remove(edge_index);
+  } else if (node1 == 0) {
+    KillEdge(&nodes_[node1]->backward_edges[edge_index]);
+    root_back_freelist_.push_back(edge_index);
+  } else {
+    nodes_[node1]->backward_edges.remove(edge_index);
+  }
+  --num_edges_;
+}
+
+// Some optimizations employed in add_word_to_dawg and trie_to_dawg:
+// 1 Avoid insertion sorting or bubble sorting the tail root node
+// (back links on node 0, a list of all the leaves.). The node is
+// huge, and sorting it with n^2 time is terrible.
+// 2 Avoid using GenericVector::remove on the tail root node.
+// (a) During add of words to the trie, zero-out the unichars and
+// keep a freelist of spaces to re-use.
+// (b) During reduction, just zero-out the unichars of deleted back
+// links, skipping zero entries while searching.
+// 3 Avoid linear search of the tail root node. This has to be done when
+// a suffix is added to an existing word. Adding words by decreasing
+// length avoids this problem entirely. Words can still be added in
+// any order, but it is faster to add the longest first.
+// Reduces the trie in place, then packs the surviving forward edges into a
+// newly allocated EDGE_ARRAY whose ownership passes to the returned
+// SquishedDawg. Backward edges and empty nodes are dropped; next-node
+// references are remapped through node_ref_map.
+SquishedDawg *Trie::trie_to_dawg() {
+  root_back_freelist_.clear(); // Will be invalidated by trie_to_dawg.
+  if (debug_level_ > 2) {
+    print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
+  }
+  auto reduced_nodes = new bool[nodes_.size()];
+  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = false;
+  this->reduce_node_input(0, reduced_nodes);
+  delete[] reduced_nodes;
+
+  if (debug_level_ > 2) {
+    print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
+  }
+  // Build a translation map from node indices in nodes_ vector to
+  // their target indices in EDGE_ARRAY.
+  auto *node_ref_map = new NODE_REF[nodes_.size() + 1];
+  int i, j;
+  node_ref_map[0] = 0;
+  for (i = 0; i < nodes_.size(); ++i) {
+    node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
+  }
+  // After the loop i == nodes_.size(), so this is the total edge count.
+  int num_forward_edges = node_ref_map[i];
+
+  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
+  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
+  auto edge_array = new EDGE_RECORD[num_forward_edges];
+  EDGE_ARRAY edge_array_ptr = edge_array;
+  for (i = 0; i < nodes_.size(); ++i) {
+    TRIE_NODE_RECORD *node_ptr = nodes_[i];
+    int end = node_ptr->forward_edges.size();
+    for (j = 0; j < end; ++j) {
+      EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
+      NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
+      ASSERT_HOST(node_ref < nodes_.size());
+      UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
+      link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
+                end_of_word_from_edge_rec(edge_rec), unichar_id);
+      // The last edge of each node carries the marker flag.
+      if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
+      ++edge_array_ptr;
+    }
+  }
+  delete[] node_ref_map;
+
+  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
+                          perm_, unicharset_size_, debug_level_);
+}
+
+bool Trie::eliminate_redundant_edges(NODE_REF node,
+ const EDGE_RECORD &edge1,
+ const EDGE_RECORD &edge2) {
+ if (debug_level_ > 1) {
+ tprintf("\nCollapsing node %" PRIi64 ":\n", node);
+ print_node(node, MAX_NODE_EDGES_DISPLAY);
+ tprintf("Candidate edges: ");
+ print_edge_rec(edge1);
+ tprintf(", ");
+ print_edge_rec(edge2);
+ tprintf("\n\n");
+ }
+ NODE_REF next_node1 = next_node_from_edge_rec(edge1);
+ NODE_REF next_node2 = next_node_from_edge_rec(edge2);
+ TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
+ // Translate all edges going to/from next_node2 to go to/from next_node1.
+ EDGE_RECORD *edge_ptr = nullptr;
+ EDGE_INDEX edge_index;
+ int i;
+ // The backward link in node to next_node2 will be zeroed out by the caller.
+ // Copy all the backward links in next_node2 to node next_node1
+ for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
+ const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
+ NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
+ UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
+ int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
+ bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
+ add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
+ curr_word_end, curr_unichar_id);
+ // Relocate the corresponding forward edge in curr_next_node
+ ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
+ curr_word_end, curr_unichar_id,
+ &edge_ptr, &edge_index));
+ set_next_node_in_edge_rec(edge_ptr, next_node1);
+ }
+ int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
+ next_node2_ptr->backward_edges.size());
+ if (debug_level_ > 1) {
+ tprintf("removed %d edges from node " REFFORMAT "\n",
+ next_node2_num_edges, next_node2);
+ }
+ next_node2_ptr->forward_edges.clear();
+ next_node2_ptr->backward_edges.clear();
+ num_edges_ -= next_node2_num_edges;
+ return true;
+}
+
+bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index,
+ UNICHAR_ID unichar_id,
+ NODE_REF node,
+ EDGE_VECTOR* backward_edges,
+ NODE_MARKER reduced_nodes) {
+ if (debug_level_ > 1)
+ tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
+ // Compare each of the edge pairs with the given unichar_id.
+ bool did_something = false;
+ for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
+ // Find the first edge that can be eliminated.
+ UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
+ while (i < backward_edges->size()) {
+ if (!DeadEdge((*backward_edges)[i])) {
+ curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
+ if (curr_unichar_id != unichar_id) return did_something;
+ if (can_be_eliminated((*backward_edges)[i])) break;
+ }
+ ++i;
+ }
+ if (i == backward_edges->size()) break;
+ const EDGE_RECORD &edge_rec = (*backward_edges)[i];
+ // Compare it to the rest of the edges with the given unichar_id.
+ for (int j = i + 1; j < backward_edges->size(); ++j) {
+ const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
+ if (DeadEdge(next_edge_rec)) continue;
+ UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
+ if (next_id != unichar_id) break;
+ if (end_of_word_from_edge_rec(next_edge_rec) ==
+ end_of_word_from_edge_rec(edge_rec) &&
+ can_be_eliminated(next_edge_rec) &&
+ eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
+ reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
+ did_something = true;
+ KillEdge(&(*backward_edges)[j]);
+ }
+ }
+ }
+ return did_something;
+}
+
+void Trie::sort_edges(EDGE_VECTOR *edges) {
+ int num_edges = edges->size();
+ if (num_edges <= 1) return;
+ GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD> > sort_vec;
+ sort_vec.reserve(num_edges);
+ for (int i = 0; i < num_edges; ++i) {
+ sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
+ unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
+ }
+ sort_vec.sort();
+ for (int i = 0; i < num_edges; ++i)
+ (*edges)[i] = sort_vec[i].data();
+}
+
+void Trie::reduce_node_input(NODE_REF node,
+ NODE_MARKER reduced_nodes) {
+ EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
+ sort_edges(&backward_edges);
+ if (debug_level_ > 1) {
+ tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
+ print_node(node, MAX_NODE_EDGES_DISPLAY);
+ }
+
+ EDGE_INDEX edge_index = 0;
+ while (edge_index < backward_edges.size()) {
+ if (DeadEdge(backward_edges[edge_index])) continue;
+ UNICHAR_ID unichar_id =
+ unichar_id_from_edge_rec(backward_edges[edge_index]);
+ while (reduce_lettered_edges(edge_index, unichar_id, node,
+ &backward_edges, reduced_nodes));
+ while (++edge_index < backward_edges.size()) {
+ UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
+ if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
+ }
+ }
+ reduced_nodes[node] = true; // mark as reduced
+
+ if (debug_level_ > 1) {
+ tprintf("Node " REFFORMAT " after reduction:\n", node);
+ print_node(node, MAX_NODE_EDGES_DISPLAY);
+ }
+
+ for (int i = 0; i < backward_edges.size(); ++i) {
+ if (DeadEdge(backward_edges[i])) continue;
+ NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
+ if (next_node != 0 && !reduced_nodes[next_node]) {
+ reduce_node_input(next_node, reduced_nodes);
+ }
+ }
+}
+
+void Trie::print_node(NODE_REF node, int max_num_edges) const {
+ if (node == NO_EDGE) return; // nothing to print
+ TRIE_NODE_RECORD *node_ptr = nodes_[node];
+ int num_fwd = node_ptr->forward_edges.size();
+ int num_bkw = node_ptr->backward_edges.size();
+ EDGE_VECTOR *vec;
+ for (int dir = 0; dir < 2; ++dir) {
+ if (dir == 0) {
+ vec = &(node_ptr->forward_edges);
+ tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
+ } else {
+ vec = &(node_ptr->backward_edges);
+ tprintf("\t");
+ }
+ int i;
+ for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
+ i < max_num_edges; ++i) {
+ if (DeadEdge((*vec)[i])) continue;
+ print_edge_rec((*vec)[i]);
+ tprintf(" ");
+ }
+ if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
+ tprintf("\n");
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/trie.h b/tesseract/src/dict/trie.h
new file mode 100644
index 00000000..f516b3c5
--- /dev/null
+++ b/tesseract/src/dict/trie.h
@@ -0,0 +1,432 @@
+/******************************************************************************
+ *
+ * File: trie.h
+ * Description: Functions to build a trie data structure.
+ * Author: Mark Seaman, SW Productivity
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+#ifndef TRIE_H
+#define TRIE_H
+
+#include "dawg.h"
+
+#include "genericvector.h"
+
+namespace tesseract {
+
+class UNICHARSET;
+
+// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
+// max int32, we will need to change GenericVector to use int64 for size
+// and address indices. This does not seem to be needed immediately,
+// since currently the largest number of edges limit used by tesseract
+// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
+// There are also int casts below to satisfy the WIN32 compiler that would
+// need to be changed.
+// It might be cleanest to change the types of most of the Trie/Dawg related
+// typedefs to int and restrict the casts to extracting these values from
+// the 64 bit EDGE_RECORD.
+using EDGE_INDEX = int64_t ; // index of an edge in a given node
+using NODE_MARKER = bool *;
+using EDGE_VECTOR = GenericVector<EDGE_RECORD> ;
+
+struct TRIE_NODE_RECORD {
+ EDGE_VECTOR forward_edges;
+ EDGE_VECTOR backward_edges;
+};
+using TRIE_NODES = GenericVector<TRIE_NODE_RECORD *> ;
+
+/**
+ * Concrete class for Trie data structure that allows to store a list of
+ * words (extends Dawg base class) as well as dynamically add new words.
+ * This class stores a vector of pointers to TRIE_NODE_RECORDs, each of
+ * which has a vector of forward and backward edges.
+ */
+class TESS_API Trie : public Dawg {
+ public:
+ enum RTLReversePolicy {
+ RRP_DO_NO_REVERSE,
+ RRP_REVERSE_IF_HAS_RTL,
+ RRP_FORCE_REVERSE,
+ };
+
+ // Minimum number of concrete characters at the beginning of user patterns.
+ static const int kSaneNumConcreteChars = 0;
+ // Various unicode whitespace characters are used to denote unichar patterns,
+ // (character classifier would never produce these whitespace characters as a
+ // valid classification).
+ static const char kAlphaPatternUnicode[];
+ static const char kDigitPatternUnicode[];
+ static const char kAlphanumPatternUnicode[];
+ static const char kPuncPatternUnicode[];
+ static const char kLowerPatternUnicode[];
+ static const char kUpperPatternUnicode[];
+
+ static const char *get_reverse_policy_name(
+ RTLReversePolicy reverse_policy);
+
+ // max_num_edges argument allows limiting the amount of memory this
+ // Trie can consume (if a new word insert would cause the Trie to
+ // contain more edges than max_num_edges, all the edges are cleared
+ // so that new inserts can proceed).
+ Trie(DawgType type, const STRING &lang, PermuterType perm,
+ int unicharset_size, int debug_level)
+ : Dawg(type, lang, perm, debug_level) {
+ init(unicharset_size);
+ num_edges_ = 0;
+ deref_node_index_mask_ = ~letter_mask_;
+ new_dawg_node(); // need to allocate node 0
+ initialized_patterns_ = false;
+ }
+ ~Trie() override { nodes_.delete_data_pointers(); }
+
+ // Reset the Trie to empty.
+ void clear();
+
+ /** Returns the edge that corresponds to the letter out of this node. */
+ EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id,
+ bool word_end) const override {
+ EDGE_RECORD *edge_ptr;
+ EDGE_INDEX edge_index;
+ if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
+ &edge_ptr, &edge_index)) return NO_EDGE;
+ return make_edge_ref(node_ref, edge_index);
+ }
+
+ /**
+ * Fills the given NodeChildVector with all the unichar ids (and the
+ * corresponding EDGE_REFs) for which there is an edge out of this node.
+ */
+ void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
+ bool word_end) const override {
+ const EDGE_VECTOR &forward_edges =
+ nodes_[static_cast<int>(node)]->forward_edges;
+ for (int i = 0; i < forward_edges.size(); ++i) {
+ if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
+ vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
+ make_edge_ref(node, i)));
+ }
+ }
+ }
+
+ /**
+ * Returns the next node visited by following the edge
+ * indicated by the given EDGE_REF.
+ */
+ NODE_REF next_node(EDGE_REF edge_ref) const override {
+ if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
+ return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
+ }
+
+ /**
+ * Returns true if the edge indicated by the given EDGE_REF
+ * marks the end of a word.
+ */
+ bool end_of_word(EDGE_REF edge_ref) const override {
+ if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
+ return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
+ }
+
+ /** Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. */
+ UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
+ if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
+ return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
+ }
+ // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
+ // the edge dead.
+ void KillEdge(EDGE_RECORD* edge_rec) const {
+ *edge_rec &= ~letter_mask_;
+ *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
+ }
+ bool DeadEdge(const EDGE_RECORD& edge_rec) const {
+ return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
+ }
+
+ // Prints the contents of the node indicated by the given NODE_REF.
+ // At most max_num_edges will be printed.
+ void print_node(NODE_REF node, int max_num_edges) const override;
+
+ // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
+ // Eliminates redundant edges and returns the pointer to the SquishedDawg.
+ // Note: the caller is responsible for deallocating memory associated
+ // with the returned SquishedDawg pointer.
+ SquishedDawg *trie_to_dawg();
+
+ // Reads a list of words from the given file and adds into the Trie.
+ // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
+ // policy and information in the unicharset.
+ // Returns false on error.
+ bool read_and_add_word_list(const char *filename,
+ const UNICHARSET &unicharset,
+ Trie::RTLReversePolicy reverse);
+
+ // Reads a list of words from the given file.
+ // Returns false on error.
+ bool read_word_list(const char *filename,
+ std::vector<STRING>* words);
+ // Adds a list of words previously read using read_word_list to the trie
+ // using the given unicharset and reverse_policy to convert to unichar-ids.
+ // Returns false on error.
+ bool add_word_list(const std::vector<STRING> &words,
+ const UNICHARSET &unicharset,
+ Trie::RTLReversePolicy reverse_policy);
+
+ // Inserts the list of patterns from the given file into the Trie.
+ // The pattern list file should contain one pattern per line in UTF-8 format.
+ //
+ // Each pattern can contain any non-whitespace characters, however only the
+ // patterns that contain characters from the unicharset of the corresponding
+ // language will be useful.
+ // The only meta character is '\'. To be used in a pattern as an ordinary
+ // string it should be escaped with '\' (e.g. string "C:\Documents" should
+ // be written in the patterns file as "C:\\Documents").
+ // This function supports a very limited regular expression syntax. One can
+ // express a character, a certain character class and a number of times the
+ // entity should be repeated in the pattern.
+ //
+ // To denote a character class use one of:
+ // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
+ // \d - unichar for which UNICHARSET::get_isdigit() is true
+ // \n - unichar for which UNICHARSET::get_isdigit() or
+ // UNICHARSET::get_isalpha() are true
+ // \p - unichar for which UNICHARSET::get_ispunct() is true
+ // \a - unichar for which UNICHARSET::get_islower() is true
+ // \A - unichar for which UNICHARSET::get_isupper() is true
+ //
+ // \* could be specified after each character or pattern to indicate that
+ // the character/pattern can be repeated any number of times before the next
+ // character/pattern occurs.
+ //
+ // Examples:
+ // 1-8\d\d-GOOG-411 will be expanded to strings:
+ // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
+ //
+ // http://www.\n\*.com will be expanded to strings like:
+ // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
+ //
+ // Note: In choosing which patterns to include please be aware of the fact
+ // providing very generic patterns will make tesseract run slower.
+ // For example \n\* at the beginning of the pattern will make Tesseract
+ // consider all the combinations of proposed character choices for each
+ // of the segmentations, which will be unacceptably slow.
+ // Because of potential problems with speed that could be difficult to
+ // identify, each user pattern has to have at least kSaneNumConcreteChars
+ // concrete characters from the unicharset at the beginning.
+ bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
+
+ // Initializes the values of *_pattern_ unichar ids.
+ // This function should be called before calling read_pattern_list().
+ void initialize_patterns(UNICHARSET *unicharset);
+
+ // Fills in the given unichar id vector with the unichar ids that represent
+ // the patterns of the character classes of the given unichar_id.
+ void unichar_id_to_patterns(UNICHAR_ID unichar_id,
+ const UNICHARSET &unicharset,
+ GenericVector<UNICHAR_ID> *vec) const override;
+
+ // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
+ // a self loop and the given unichar_id matches the unichar_id stored in the
+ // EDGE_RECORD, returns NO_EDGE otherwise.
+ EDGE_REF pattern_loop_edge(EDGE_REF edge_ref,
+ UNICHAR_ID unichar_id,
+ bool word_end) const override {
+ if (edge_ref == NO_EDGE) return NO_EDGE;
+ EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
+ return (marker_flag_from_edge_rec(*edge_rec) &&
+ unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
+ word_end == end_of_word_from_edge_rec(*edge_rec)) ?
+ edge_ref : NO_EDGE;
+ }
+
+ // Adds a word to the Trie (creates the necessary nodes and edges).
+ //
+ // If repetitions vector is not nullptr, each entry in the vector indicates
+ // whether the unichar id with the corresponding index in the word is allowed
+ // to repeat an unlimited number of times. For each entry that is true, MARKER
+ // flag of the corresponding edge created for this unichar id is set to true).
+ //
+ // Return true if add succeeded, false otherwise (e.g. when a word contained
+ // an invalid unichar id or the trie was getting too large and was cleared).
+ bool add_word_to_dawg(const WERD_CHOICE &word,
+ const GenericVector<bool> *repetitions);
+ bool add_word_to_dawg(const WERD_CHOICE &word) {
+ return add_word_to_dawg(word, nullptr);
+ }
+
+ protected:
+ // The structure of an EDGE_REF for Trie edges is as follows:
+ // [LETTER_START_BIT, flag_start_bit_):
+ // edge index in *_edges in a TRIE_NODE_RECORD
+ // [flag_start_bit_, 30th bit]: node index in nodes (TRIE_NODES vector)
+ //
+ // With this arrangement there are enough bits to represent edge indices
+ // (each node can have at most unicharset_size_ forward edges and
+ // the position of flag_start_bit is set to be log2(unicharset_size_)).
+ // It is also possible to accommodate a maximum number of nodes that is at
+ // least as large as that of the SquishedDawg representation (in SquishedDawg
+ // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
+ // the next node index).
+ //
+
+ // Returns the pointer to EDGE_RECORD after decoding the location
+ // of the edge from the information in the given EDGE_REF.
+ // This function assumes that EDGE_REF holds valid node/edge indices.
+ inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
+ int edge_index = static_cast<int>(
+ (edge_ref & letter_mask_) >> LETTER_START_BIT);
+ int node_index = static_cast<int>(
+ (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
+ TRIE_NODE_RECORD *node_rec = nodes_[node_index];
+ return &(node_rec->forward_edges[edge_index]);
+ }
+ /** Constructs EDGE_REF from the given node_index and edge_index. */
+ inline EDGE_REF make_edge_ref(NODE_REF node_index,
+ EDGE_INDEX edge_index) const {
+ return ((node_index << flag_start_bit_) |
+ (edge_index << LETTER_START_BIT));
+ }
+ /** Sets up this edge record to the requested values. */
+ inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats,
+ int direction, bool word_end, UNICHAR_ID unichar_id) {
+ EDGE_RECORD flags = 0;
+ if (repeats) flags |= MARKER_FLAG;
+ if (word_end) flags |= WERD_END_FLAG;
+ if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
+ *edge = ((nxt << next_node_start_bit_) |
+ (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
+ (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
+ }
+ /** Prints the given EDGE_RECORD. */
+ inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
+ tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
+ marker_flag_from_edge_rec(edge_rec) ? "R," : "",
+ (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
+ end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
+ unichar_id_from_edge_rec(edge_rec));
+ }
+ // Returns true if the next node recorded in the given EDGE_RECORD
+ // has exactly one forward edge.
+ inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
+ NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
+ return (node_ref != NO_EDGE &&
+ nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
+ }
+
+ // Prints the contents of the Trie.
+ // At most max_num_edges will be printed for each node.
+ void print_all(const char* msg, int max_num_edges) {
+ tprintf("\n__________________________\n%s\n", msg);
+ for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
+ tprintf("__________________________\n");
+ }
+
+ // Finds the edge with the given direction, word_end and unichar_id
+ // in the node indicated by node_ref. Fills in the pointer to the
+ // EDGE_RECORD and the index of the edge with the values
+ // corresponding to the edge found. Returns true if an edge was found.
+ bool edge_char_of(NODE_REF node_ref, NODE_REF next_node,
+ int direction, bool word_end, UNICHAR_ID unichar_id,
+ EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
+
+ // Adds a single edge linkage between node1 and node2 in the direction
+ // indicated by direction argument.
+ bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats,
+ int direction, bool word_end,
+ UNICHAR_ID unichar_id);
+
+ // Adds forward edge linkage from node1 to node2 and the corresponding
+ // backward edge linkage in the other direction.
+ bool add_new_edge(NODE_REF node1, NODE_REF node2,
+ bool repeats, bool word_end, UNICHAR_ID unichar_id) {
+ return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
+ word_end, unichar_id) &&
+ add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
+ word_end, unichar_id));
+ }
+
+ // Sets the word ending flags in an already existing edge pair.
+ // The edge record is updated in place via the given pointer.
+ void add_word_ending(EDGE_RECORD *edge,
+ NODE_REF the_next_node,
+ bool repeats,
+ UNICHAR_ID unichar_id);
+
+ // Allocates space for a new node in the Trie.
+ NODE_REF new_dawg_node();
+
+ // Removes a single edge linkage between node1 and node2 in the
+ // direction indicated by direction argument.
+ void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
+ bool word_end, UNICHAR_ID unichar_id);
+
+ // Removes forward edge linkage from node1 to node2 and the corresponding
+ // backward edge linkage in the other direction.
+ void remove_edge(NODE_REF node1, NODE_REF node2,
+ bool word_end, UNICHAR_ID unichar_id) {
+ remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
+ remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
+ }
+
+ // Compares edge1 and edge2 in the given node to see if they point to two
+ // next nodes that could be collapsed. If they do, performs the reduction
+ // and returns true.
+ bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
+ const EDGE_RECORD &edge2);
+
+ // Assuming that edge_index indicates the first edge in a group of edges
+ // in this node with a particular letter value, looks through these edges
+ // to see if any of them can be collapsed. If so does it. Returns to the
+ // caller when all edges with this letter have been reduced.
+ // Returns true if further reduction is possible with this same letter.
+ bool reduce_lettered_edges(EDGE_INDEX edge_index,
+ UNICHAR_ID unichar_id,
+ NODE_REF node,
+ EDGE_VECTOR* backward_edges,
+ NODE_MARKER reduced_nodes);
+
+ /**
+ * Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in
+ * increasing order of unichar ids. This function is normally called
+ * for all edges in a single node; the edges are copied into a vector of
+ * (unichar id, edge record) key/data pairs, which is then sorted.
+ */
+ void sort_edges(EDGE_VECTOR *edges);
+
+ /** Eliminates any redundant edges from this node in the Trie. */
+ void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
+
+ // Returns the pattern unichar id for the given character class code.
+ UNICHAR_ID character_class_to_pattern(char ch);
+
+ // Member variables
+ TRIE_NODES nodes_; // vector of nodes in the Trie
+ // Freelist of edges in the root backwards node that were previously zeroed.
+ GenericVector<EDGE_INDEX> root_back_freelist_;
+ uint64_t num_edges_; // sum of all edges (forward and backward)
+ uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction
+ uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index
+ // Variables for translating character class codes denoted in user patterns
+ // file to the unichar ids used to represent them in a Trie.
+ UNICHAR_ID alpha_pattern_;
+ UNICHAR_ID digit_pattern_;
+ UNICHAR_ID alphanum_pattern_;
+ UNICHAR_ID punc_pattern_;
+ UNICHAR_ID lower_pattern_;
+ UNICHAR_ID upper_pattern_;
+ bool initialized_patterns_;
+};
+
+} // namespace tesseract
+
+#endif