summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorThomas Deutschmann <whissi@gentoo.org>2021-03-30 10:59:39 +0200
committerThomas Deutschmann <whissi@gentoo.org>2021-04-01 00:04:14 +0200
commit5ff1d6955496b3cf9a35042c9ac35db43bc336b1 (patch)
tree6d470f7eb448f59f53e8df1010aec9dad8ce1f72 /tesseract/src/dict
parentImport Ghostscript 9.53.1 (diff)
downloadghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.tar.gz
ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.tar.bz2
ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.zip
Import Ghostscript 9.54ghostscript-9.54
Signed-off-by: Thomas Deutschmann <whissi@gentoo.org>
Diffstat (limited to 'tesseract/src/dict')
-rw-r--r--tesseract/src/dict/context.cpp76
-rw-r--r--tesseract/src/dict/dawg.cpp417
-rw-r--r--tesseract/src/dict/dawg.h554
-rw-r--r--tesseract/src/dict/dawg_cache.cpp96
-rw-r--r--tesseract/src/dict/dawg_cache.h53
-rw-r--r--tesseract/src/dict/dict.cpp888
-rw-r--r--tesseract/src/dict/dict.h651
-rw-r--r--tesseract/src/dict/hyphen.cpp61
-rw-r--r--tesseract/src/dict/matchdefs.h50
-rw-r--r--tesseract/src/dict/permdawg.cpp390
-rw-r--r--tesseract/src/dict/stopper.cpp515
-rw-r--r--tesseract/src/dict/stopper.h50
-rw-r--r--tesseract/src/dict/trie.cpp716
-rw-r--r--tesseract/src/dict/trie.h432
14 files changed, 4949 insertions, 0 deletions
diff --git a/tesseract/src/dict/context.cpp b/tesseract/src/dict/context.cpp
new file mode 100644
index 00000000..93cff5ff
--- /dev/null
+++ b/tesseract/src/dict/context.cpp
@@ -0,0 +1,76 @@
+/******************************************************************************
+ *
+ * File: context.cpp (Formerly context.c)
+ * Description: Context checking functions
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1990, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+#include "dict.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+// Thresholds used by Dict::absolute_garbage() below: minimum word length
+// before the garbage test applies, and the alphanumeric fraction under
+// which the word is deemed garbage.
+static const int kMinAbsoluteGarbageWordLength = 10;
+static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
+
+// State machine for validating the capitalization pattern of a word,
+// used by Dict::case_ok(). Rows are the current state (0..5); columns
+// are the class of the next character: P (other), U (upper-case),
+// L (lower-case), D (digit). An entry of -1 marks an invalid pattern.
+const int case_state_table[6][4] = {
+  {/* 0. Beginning of word */
+  /* P U L D */
+  /* -1. Error on case */
+  0, 1, 5, 4},
+  {/* 1. After initial capital */
+  0, 3, 2, 4},
+  {/* 2. After lower case */
+  0, -1, 2, -1},
+  {/* 3. After upper case */
+  0, 3, -1, 4},
+  {/* 4. After a digit */
+  0, -1, -1, 4},
+  {/* 5. After initial lower case */
+  5, -1, 2, -1},
+};
+
+// Checks that the capitalization pattern of the word is plausible by
+// feeding each character's case class through case_state_table.
+// Returns a boolean value (despite the int return type): false if the
+// state machine reaches -1 or ends in state 5 (a single initial
+// lower-case letter), true otherwise.
+int Dict::case_ok(const WERD_CHOICE &word) const {
+  int state = 0;
+  int x;
+  const UNICHARSET* unicharset = word.unicharset();
+  for (x = 0; x < word.length(); ++x) {
+    UNICHAR_ID ch_id = word.unichar_id(x);
+    // Advance the state machine on the character class: U, L, D or other.
+    if (unicharset->get_isupper(ch_id))
+      state = case_state_table[state][1];
+    else if (unicharset->get_islower(ch_id))
+      state = case_state_table[state][2];
+    else if (unicharset->get_isdigit(ch_id))
+      state = case_state_table[state][3];
+    else
+      state = case_state_table[state][0];
+    if (state == -1) return false;
+  }
+  return state != 5; // single lower is bad
+}
+
+// Returns true if the word looks like absolute garbage: it is at least
+// kMinAbsoluteGarbageWordLength unichars long and fewer than
+// kMinAbsoluteGarbageAlphanumFrac of its characters are alphanumeric.
+bool Dict::absolute_garbage(const WERD_CHOICE &word,
+                            const UNICHARSET &unicharset) {
+  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
+  int num_alphanum = 0;
+  // Count characters that are either alphabetic or digits.
+  for (int x = 0; x < word.length(); ++x) {
+    num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
+                     unicharset.get_isdigit(word.unichar_id(x)));
+  }
+  return (static_cast<float>(num_alphanum) /
+          static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/dawg.cpp b/tesseract/src/dict/dawg.cpp
new file mode 100644
index 00000000..aa339727
--- /dev/null
+++ b/tesseract/src/dict/dawg.cpp
@@ -0,0 +1,417 @@
+/********************************************************************************
+ *
+ * File: dawg.cpp (Formerly dawg.c)
+ * Description: Use a Directed Acyclic Word Graph
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *********************************************************************************/
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include "dawg.h"
+
+#include "dict.h"
+#include "helpers.h"
+#include "strngs.h"
+#include "tprintf.h"
+
+#include <memory>
+
+/*----------------------------------------------------------------------
+ F u n c t i o n s f o r D a w g
+----------------------------------------------------------------------*/
+namespace tesseract {
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+// (Dawg is an abstract interface; see the pure virtuals in dawg.h.)
+Dawg::~Dawg() = default;
+
+// Returns true if the given word prefix is not contraindicated by this
+// dawg: each unichar in turn must have an outgoing edge from the current
+// node. If requires_complete is true the last edge must additionally be
+// marked as a word end (i.e. the exact word must be present).
+bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
+                          bool requires_complete) const {
+  if (word.length() == 0) return !requires_complete;
+  NODE_REF node = 0;
+  int end_index = word.length() - 1;
+  // Follow an edge for every unichar except the last one.
+  for (int i = 0; i < end_index; i++) {
+    EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
+    if (edge == NO_EDGE) {
+      return false;
+    }
+    if ((node = next_node(edge)) == 0) {
+      // This only happens if all words following this edge terminate --
+      // there are no larger words. See Trie::add_word_to_dawg()
+      return false;
+    }
+  }
+  // Now check the last character.
+  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
+      NO_EDGE;
+}
+
+// Returns true if the exact complete word is present in the Dawg.
+bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
+  return prefix_in_dawg(word, true);
+}
+
+// Checks the Dawg for the words listed (one per line) in the given file.
+// Returns the number of words from the file that are missing from the
+// Dawg. If enable_wildcard is true, occurrences of kWildcard in a word
+// are matched against every outgoing edge (see match_words()).
+int Dawg::check_for_words(const char *filename,
+                          const UNICHARSET &unicharset,
+                          bool enable_wildcard) const {
+  if (filename == nullptr) return 0;
+
+  FILE *word_file;
+  char string [CHARS_PER_LINE];
+  int misses = 0;
+  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
+
+  word_file = fopen(filename, "r");
+  if (word_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", filename);
+    // word_file is null here, so this assertion is guaranteed to trip;
+    // the tprintf above makes sure the filename is logged first.
+    ASSERT_HOST(word_file);
+  }
+
+  while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
+    chomp_string(string); // remove newline
+    WERD_CHOICE word(string, unicharset);
+    if (word.length() > 0 &&
+        !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
+      if (!match_words(&word, 0, 0,
+                       enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
+        tprintf("Missing word: %s\n", string);
+        ++misses;
+      }
+    } else {
+      tprintf("Failed to create a valid word from %s\n", string);
+    }
+  }
+  fclose (word_file);
+  // Summarize the number of missing words when debug output is enabled.
+  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
+  return misses;
+}
+
+// Applies the given callback to every word in the Dawg, passed as a
+// WERD_CHOICE. Words are enumerated depth-first from the root node
+// (see iterate_words_rec()).
+void Dawg::iterate_words(const UNICHARSET& unicharset,
+                         std::function<void(const WERD_CHOICE*)> cb) const {
+  WERD_CHOICE word(&unicharset);
+  iterate_words_rec(word, 0, cb);
+}
+
+// Adapter shim: converts the WERD_CHOICE to its UTF-8 text form and
+// passes the resulting C string to the callback.
+static void CallWithUTF8(std::function<void(const char*)> cb,
+                         const WERD_CHOICE* wc) {
+  STRING s;
+  wc->string_and_lengths(&s, nullptr);
+  cb(s.c_str());
+}
+
+// Overload of iterate_words() that hands each word to the callback as a
+// UTF-8 C string, by routing the WERD_CHOICE through CallWithUTF8().
+void Dawg::iterate_words(const UNICHARSET& unicharset,
+                         std::function<void(const char*)> cb) const {
+  using namespace std::placeholders; // for _1
+  std::function<void(const WERD_CHOICE*)> shim(
+      std::bind(CallWithUTF8, cb, _1));
+  WERD_CHOICE word(&unicharset);
+  iterate_words_rec(word, 0, shim);
+}
+
+// Recursive word enumeration: for each child edge of to_explore, appends
+// the child's unichar to a copy of word_so_far, invokes cb if the edge
+// ends a word, then recurses into the child's target node (if non-zero).
+void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
+                             NODE_REF to_explore,
+                             std::function<void(const WERD_CHOICE*)> cb) const {
+  NodeChildVector children;
+  this->unichar_ids_of(to_explore, &children, false);
+  for (int i = 0; i < children.size(); i++) {
+    WERD_CHOICE next_word(word_so_far);
+    next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
+    if (this->end_of_word(children[i].edge_ref)) {
+      cb(&next_word);
+    }
+    NODE_REF next = next_node(children[i].edge_ref);
+    if (next != 0) {
+      iterate_words_rec(next_word, next, cb);
+    }
+  }
+}
+
+// Matches the word against the Dawg starting from the given node/index.
+// If wildcard is a valid unichar id, each occurrence of it in the word
+// is expanded to every unichar leaving the current node; the wildcard is
+// written back into the word afterwards so the caller's word is left
+// unchanged. Returns true if any expansion of the word matches.
+bool Dawg::match_words(WERD_CHOICE *word, int32_t index,
+                       NODE_REF node, UNICHAR_ID wildcard) const {
+  EDGE_REF edge;
+  int32_t word_end;
+
+  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
+    bool any_matched = false;
+    NodeChildVector vec;
+    this->unichar_ids_of(node, &vec, false);
+    // Try every unichar leaving this node in place of the wildcard.
+    for (int i = 0; i < vec.size(); ++i) {
+      word->set_unichar_id(vec[i].unichar_id, index);
+      if (match_words(word, index, node, wildcard))
+        any_matched = true;
+    }
+    // Restore the wildcard so the caller's word is unmodified.
+    word->set_unichar_id(wildcard, index);
+    return any_matched;
+  } else {
+    word_end = index == word->length() - 1;
+    edge = edge_char_of(node, word->unichar_id(index), word_end);
+    if (edge != NO_EDGE) { // normal edge in DAWG
+      node = next_node(edge);
+      if (word_end) {
+        if (debug_level_ > 1) word->print("match_words() found: ");
+        return true;
+      } else if (node != 0) {
+        return match_words(word, index+1, node, wildcard);
+      }
+    }
+  }
+  return false;
+}
+
+// Records the unicharset size and derives the EDGE_RECORD bit layout:
+// the low flag_start_bit_ bits hold the letter, the next NUM_FLAG_BITS
+// hold the flags, and the remaining high bits hold the next-node ref.
+void Dawg::init(int unicharset_size) {
+  ASSERT_HOST(unicharset_size > 0);
+  unicharset_size_ = unicharset_size;
+  // Set bit masks. We will use the value unicharset_size_ as a null char, so
+  // the actual number of unichars is unicharset_size_ + 1.
+  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
+  next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;
+  letter_mask_ = ~(~0ull << flag_start_bit_);
+  next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);
+  flags_mask_ = ~(letter_mask_ | next_node_mask_);
+}
+
+
+/*----------------------------------------------------------------------
+ F u n c t i o n s f o r S q u i s h e d D a w g
+----------------------------------------------------------------------*/
+
+// Releases the edge array owned by this SquishedDawg.
+SquishedDawg::~SquishedDawg() { delete[] edges_; }
+
+// Returns the edge out of the given node labeled with unichar_id (and,
+// if word_end is true, marked as a word end), or NO_EDGE if none exists.
+// Node 0's forward edges are kept sorted, so they are binary-searched;
+// any other node is scanned linearly up to its last edge.
+EDGE_REF SquishedDawg::edge_char_of(NODE_REF node,
+                                    UNICHAR_ID unichar_id,
+                                    bool word_end) const {
+  EDGE_REF edge = node;
+  if (node == 0) { // binary search
+    EDGE_REF start = 0;
+    EDGE_REF end = num_forward_edges_in_node0 - 1;
+    int compare;
+    while (start <= end) {
+      edge = (start + end) >> 1; // (start + end) / 2
+      compare = given_greater_than_edge_rec(NO_EDGE, word_end,
+                                            unichar_id, edges_[edge]);
+      if (compare == 0) { // given == vec[k]
+        return edge;
+      } else if (compare == 1) { // given > vec[k]
+        start = edge + 1;
+      } else { // given < vec[k]
+        end = edge - 1;
+      }
+    }
+  } else { // linear search
+    if (edge != NO_EDGE && edge_occupied(edge)) {
+      do {
+        if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
+            (!word_end || end_of_word_from_edge_rec(edges_[edge])))
+          return (edge);
+      } while (!last_edge(edge++));
+    }
+  }
+  return (NO_EDGE); // not found
+}
+
+// Counts the consecutive forward edges starting at the given node's
+// first edge slot, up to and including the node's last edge.
+int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
+  EDGE_REF edge = node;
+  int32_t num = 0;
+
+  if (forward_edge (edge)) {
+    do {
+      num++;
+    } while (!last_edge(edge++));
+  }
+
+  return (num);
+}
+
+// Prints (via tprintf) the forward edges of the given node, then its
+// backward edges if any follow, stopping after max_num_edges forward
+// edges (MAX_NODE_EDGES_DISPLAY for the backward ones).
+void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
+  if (node == NO_EDGE) return; // nothing to print
+
+  EDGE_REF edge = node;
+  const char *forward_string = "FORWARD";
+  const char *backward_string = " ";
+
+  const char *last_string = "LAST";
+  const char *not_last_string = " ";
+
+  const char *eow_string = "EOW";
+  const char *not_eow_string = " ";
+
+  const char *direction;
+  const char *is_last;
+  const char *eow;
+
+  UNICHAR_ID unichar_id;
+
+  if (edge_occupied(edge)) {
+    do {
+      direction =
+        forward_edge(edge) ? forward_string : backward_string;
+      is_last = last_edge(edge) ? last_string : not_last_string;
+      eow = end_of_word(edge) ? eow_string : not_eow_string;
+
+      unichar_id = edge_letter(edge);
+      tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
+              edge, next_node(edge), unichar_id,
+              direction, is_last, eow);
+
+      if (edge - node > max_num_edges) return;
+    } while (!last_edge(edge++));
+
+    // After the node's last forward edge, any backward edges follow.
+    if (edge < num_edges_ &&
+        edge_occupied(edge) && backward_edge(edge)) {
+      do {
+        direction =
+          forward_edge(edge) ? forward_string : backward_string;
+        is_last = last_edge(edge) ? last_string : not_last_string;
+        eow = end_of_word(edge) ? eow_string : not_eow_string;
+
+        unichar_id = edge_letter(edge);
+        tprintf(REFFORMAT " : next = " REFFORMAT
+                ", unichar_id = %d, %s %s %s\n",
+                edge, next_node(edge), unichar_id,
+                direction, is_last, eow);
+
+        if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
+      } while (!last_edge(edge++));
+    }
+  }
+  else {
+    tprintf(REFFORMAT " : no edges in this node\n", node);
+  }
+  tprintf("\n");
+}
+
+// Prints a one-line description of the given edge, or "NO_EDGE".
+void SquishedDawg::print_edge(EDGE_REF edge) const {
+  if (edge == NO_EDGE) {
+    tprintf("NO_EDGE\n");
+  } else {
+    tprintf(REFFORMAT " : next = " REFFORMAT
+            ", unichar_id = '%d', %s %s %s\n", edge,
+            next_node(edge), edge_letter(edge),
+            (forward_edge(edge) ? "FORWARD" : " "),
+            (last_edge(edge) ? "LAST" : " "),
+            (end_of_word(edge) ? "EOW" : ""));
+  }
+}
+
+// Reads a squished dawg from the given TFile: a 16-bit magic number,
+// the unicharset size, the edge count, then the edge array itself.
+// Returns false on any read failure or magic-number mismatch.
+bool SquishedDawg::read_squished_dawg(TFile *file) {
+  if (debug_level_) tprintf("Reading squished dawg\n");
+
+  // Read the magic number and check that it matches kDawgMagicNumber, as
+  // auto-endian fixing should make sure it is always correct.
+  int16_t magic;
+  if (!file->DeSerialize(&magic)) return false;
+  if (magic != kDawgMagicNumber) {
+    tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
+    return false;
+  }
+
+  int32_t unicharset_size;
+  if (!file->DeSerialize(&unicharset_size)) return false;
+  if (!file->DeSerialize(&num_edges_)) return false;
+  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
+  Dawg::init(unicharset_size);
+
+  // edges_ is owned by this object and released in ~SquishedDawg().
+  edges_ = new EDGE_RECORD[num_edges_];
+  if (!file->DeSerialize(&edges_[0], num_edges_)) return false;
+  if (debug_level_ > 2) {
+    tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
+            type_, lang_.c_str(), perm_, unicharset_size_, num_edges_);
+    for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge);
+  }
+  return true;
+}
+
+// Builds a map from each node's starting edge index in edges_ to the
+// sequential node number used when writing the dawg out; node 0 keeps
+// index 0 and subsequent nodes are numbered after node 0's forward
+// edges. Sets *num_nodes to the total node count and returns the map.
+std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
+    int32_t *num_nodes) const {
+  EDGE_REF edge;
+  std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
+  int32_t node_counter;
+  int32_t num_edges;
+
+  for (edge = 0; edge < num_edges_; edge++) // init all slots
+    node_map[edge] = -1;
+
+  node_counter = num_forward_edges(0);
+
+  *num_nodes = 0;
+  for (edge = 0; edge < num_edges_; edge++) { // search all slots
+
+    if (forward_edge(edge)) {
+      (*num_nodes)++; // count nodes links
+      node_map[edge] = (edge ? node_counter : 0);
+      num_edges = num_forward_edges(edge);
+      if (edge != 0) node_counter += num_edges;
+      edge += num_edges;
+      if (edge >= num_edges_) break;
+      // Skip this node's backward edges up to and including its last edge.
+      if (backward_edge(edge)) while (!last_edge(edge++));
+      edge--;
+    }
+  }
+  return node_map;
+}
+
+// Writes the dawg to the given TFile in the format read back by
+// read_squished_dawg(): magic, unicharset size, forward-edge count, then
+// each forward edge with its next-node link remapped through node_map.
+// The in-memory link is restored after each record is serialized.
+bool SquishedDawg::write_squished_dawg(TFile *file) {
+  EDGE_REF edge;
+  int32_t num_edges;
+  int32_t node_count = 0;
+  EDGE_REF old_index;
+  EDGE_RECORD temp_record;
+
+  if (debug_level_) tprintf("write_squished_dawg\n");
+
+  std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
+
+  // Write the magic number to help detecting a change in endianness.
+  int16_t magic = kDawgMagicNumber;
+  if (!file->Serialize(&magic)) return false;
+  if (!file->Serialize(&unicharset_size_)) return false;
+
+  // Count the number of edges in this Dawg.
+  num_edges = 0;
+  for (edge=0; edge < num_edges_; edge++)
+    if (forward_edge(edge))
+      num_edges++;
+
+  // Write edge count to file.
+  if (!file->Serialize(&num_edges)) return false;
+
+  if (debug_level_) {
+    tprintf("%d nodes in DAWG\n", node_count);
+    tprintf("%d edges in DAWG\n", num_edges);
+  }
+
+  for (edge = 0; edge < num_edges_; edge++) {
+    if (forward_edge(edge)) { // write forward edges
+      do {
+        old_index = next_node_from_edge_rec(edges_[edge]);
+        set_next_node(edge, node_map[old_index]);
+        temp_record = edges_[edge];
+        if (!file->Serialize(&temp_record)) return false;
+        // Restore the original in-memory next-node link.
+        set_next_node(edge, old_index);
+      } while (!last_edge(edge++));
+
+      if (edge >= num_edges_) break;
+      if (backward_edge(edge)) // skip back links
+        while (!last_edge(edge++));
+
+      edge--;
+    }
+  }
+  return true;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/dawg.h b/tesseract/src/dict/dawg.h
new file mode 100644
index 00000000..26d76d1e
--- /dev/null
+++ b/tesseract/src/dict/dawg.h
@@ -0,0 +1,554 @@
+/******************************************************************************
+ *
+ * File: dawg.h
+ * Description: Definition of a class that represents Directed Acyclic Word
+ * Graph (DAWG), functions to build and manipulate the DAWG.
+ * Author: Mark Seaman, SW Productivity
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+#ifndef DICT_DAWG_H_
+#define DICT_DAWG_H_
+
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include <cinttypes> // for PRId64
+#include <functional> // for std::function
+#include <memory>
+#include "elst.h"
+#include "params.h"
+#include "ratngs.h"
+
+#ifndef __GNUC__
+#ifdef _WIN32
+#define NO_EDGE (int64_t) 0xffffffffffffffffi64
+#endif /*_WIN32*/
+#else
+#define NO_EDGE (int64_t) 0xffffffffffffffffll
+#endif /*__GNUC__*/
+
+namespace tesseract {
+
+class UNICHARSET;
+
+using EDGE_RECORD = uint64_t;
+using EDGE_ARRAY = EDGE_RECORD *;
+using EDGE_REF = int64_t;
+using NODE_REF = int64_t;
+using NODE_MAP = EDGE_REF *;
+
+/// A (unichar id, edge reference) pair, as filled into a NodeChildVector
+/// by Dawg::unichar_ids_of(). Default-constructed instances are invalid
+/// (INVALID_UNICHAR_ID / NO_EDGE).
+struct NodeChild {
+  UNICHAR_ID unichar_id;
+  EDGE_REF edge_ref;
+  NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
+  NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
+};
+
+using NodeChildVector = GenericVector<NodeChild>;
+using SuccessorList = GenericVector<int>;
+using SuccessorListsVector = GenericVector<SuccessorList *>;
+
+/// The kind of content a Dawg holds; the type determines which dawg
+/// types may follow it (see kDawgSuccessors below).
+enum DawgType {
+  DAWG_TYPE_PUNCTUATION,
+  DAWG_TYPE_WORD,
+  DAWG_TYPE_NUMBER,
+  DAWG_TYPE_PATTERN,
+
+  DAWG_TYPE_COUNT // number of enum entries
+};
+
+/*----------------------------------------------------------------------
+ C o n s t a n t s
+----------------------------------------------------------------------*/
+
+#define FORWARD_EDGE (int32_t) 0
+#define BACKWARD_EDGE (int32_t) 1
+#define MAX_NODE_EDGES_DISPLAY (int64_t) 100
+#define MARKER_FLAG (int64_t) 1
+#define DIRECTION_FLAG (int64_t) 2
+#define WERD_END_FLAG (int64_t) 4
+#define LETTER_START_BIT 0
+#define NUM_FLAG_BITS 3
+#define REFFORMAT "%" PRId64
+
+// Successor matrix: row t lists, per dawg type, whether a dawg of that
+// type may follow a dawg of type t (row annotations below) -- presumably
+// consumed by the dict search code; confirm against dict.cpp usage.
+static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
+  { false, true, true, false }, // for DAWG_TYPE_PUNCTUATION
+  { true, false, false, false }, // for DAWG_TYPE_WORD
+  { true, false, false, false }, // for DAWG_TYPE_NUMBER
+  { false, false, false, false }, // for DAWG_TYPE_PATTERN
+};
+
+// The wildcard string recognized by check_for_words()/match_words().
+static const char kWildcard[] = "*";
+
+
+/*----------------------------------------------------------------------
+ C l a s s e s a n d S t r u c t s
+----------------------------------------------------------------------*/
+//
+/// Abstract class (an interface) that declares methods needed by the
+/// various tesseract classes to operate on SquishedDawg and Trie objects.
+///
+/// This class initializes all the edge masks (since their usage by
+/// SquishedDawg and Trie is identical) and implements simple accessors
+/// for each of the fields encoded in an EDGE_RECORD.
+/// This class also implements word_in_dawg() and check_for_words()
+/// (since they use only the public methods of SquishedDawg and Trie
+/// classes that are inherited from the Dawg base class).
+//
+class TESS_API Dawg {
+ public:
+  /// Magic number to determine endianness when reading the Dawg from file.
+  static const int16_t kDawgMagicNumber = 42;
+  /// A special unichar id that indicates that any appropriate pattern
+  /// (e.g. dictionary word, 0-9 digit, etc) can be inserted instead
+  /// Used for expressing patterns in punctuation and number Dawgs.
+  static const UNICHAR_ID kPatternUnicharID = 0;
+
+  inline DawgType type() const { return type_; }
+  inline const STRING &lang() const { return lang_; }
+  inline PermuterType permuter() const { return perm_; }
+
+  virtual ~Dawg();
+
+  /// Returns true if the given word is in the Dawg.
+  bool word_in_dawg(const WERD_CHOICE &word) const;
+
+  // Returns true if the given word prefix is not contraindicated by the dawg.
+  // If requires_complete is true, then the exact complete word must be present.
+  bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
+
+  /// Checks the Dawg for the words that are listed in the requested file.
+  /// Returns the number of words in the given file missing from the Dawg.
+  int check_for_words(const char *filename,
+                      const UNICHARSET &unicharset,
+                      bool enable_wildcard) const;
+
+  // For each word in the Dawg, call the given (permanent) callback with the
+  // text (UTF-8) version of the word.
+  void iterate_words(const UNICHARSET& unicharset,
+                     std::function<void(const WERD_CHOICE*)> cb) const;
+
+  // For each word in the Dawg, call the given (permanent) callback with the
+  // text (UTF-8) version of the word.
+  void iterate_words(const UNICHARSET& unicharset,
+                     std::function<void(const char*)> cb) const;
+
+  // Pure virtual function that should be implemented by the derived classes.
+
+  /// Returns the edge that corresponds to the letter out of this node.
+  virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
+                                bool word_end) const = 0;
+
+  /// Fills the given NodeChildVector with all the unichar ids (and the
+  /// corresponding EDGE_REFs) for which there is an edge out of this node.
+  virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
+                              bool word_end) const = 0;
+
+  /// Returns the next node visited by following the edge
+  /// indicated by the given EDGE_REF.
+  virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
+
+  /// Returns true if the edge indicated by the given EDGE_REF
+  /// marks the end of a word.
+  virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
+
+  /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
+  virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
+
+  /// Prints the contents of the node indicated by the given NODE_REF.
+  /// At most max_num_edges will be printed.
+  virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
+
+  /// Fills vec with unichar ids that represent the character classes
+  /// of the given unichar_id.
+  virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
+                                      const UNICHARSET &unicharset,
+                                      GenericVector<UNICHAR_ID> *vec) const {
+    (void)unichar_id;
+    (void)unicharset;
+    (void)vec;
+  }
+
+  /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
+  /// a self loop and the given unichar_id matches the unichar_id stored in the
+  /// EDGE_RECORD, returns NO_EDGE otherwise.
+  virtual EDGE_REF pattern_loop_edge(
+      EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
+    (void)edge_ref;
+    (void)unichar_id;
+    (void)word_end;
+    // NOTE(review): this returns false (0), not NO_EDGE (-1) as the
+    // comment above promises -- confirm callers compare against NO_EDGE.
+    return false;
+  }
+
+ protected:
+  Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
+      : lang_(lang),
+        type_(type),
+        perm_(perm),
+        unicharset_size_(0),
+        debug_level_(debug_level) {}
+
+  /// Returns the next node visited by following this edge.
+  inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
+    return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
+  }
+  /// Returns the marker flag of this edge.
+  inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
+    return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
+  }
+  /// Returns the direction flag of this edge.
+  inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
+    return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
+      BACKWARD_EDGE : FORWARD_EDGE;
+  }
+  /// Returns true if this edge marks the end of a word.
+  inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
+    return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
+  }
+  /// Returns UNICHAR_ID recorded in this edge.
+  inline UNICHAR_ID unichar_id_from_edge_rec(
+      const EDGE_RECORD &edge_rec) const {
+    return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
+  }
+  /// Sets the next node link for this edge in the Dawg.
+  inline void set_next_node_in_edge_rec(
+      EDGE_RECORD *edge_rec, EDGE_REF value) {
+    *edge_rec &= (~next_node_mask_);
+    *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
+  }
+  /// Sets this edge record to be the last one in a sequence of edges.
+  inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
+    *edge_rec |= (MARKER_FLAG << flag_start_bit_);
+  }
+  /// Sequentially compares the given values of unichar ID, next node
+  /// and word end marker with the values in the given EDGE_RECORD.
+  /// Returns: 1 if at any step the given input value exceeds
+  ///            that of edge_rec (and all the values already
+  ///            checked are the same)
+  ///          0 if edge_rec_match() returns true
+  ///         -1 otherwise
+  inline int given_greater_than_edge_rec(NODE_REF next_node,
+                                         bool word_end,
+                                         UNICHAR_ID unichar_id,
+                                         const EDGE_RECORD &edge_rec) const {
+    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
+    NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
+    bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
+    if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
+                       curr_word_end, curr_unichar_id)) return 0;
+    if (unichar_id > curr_unichar_id) return 1;
+    if (unichar_id == curr_unichar_id) {
+      if (next_node > curr_next_node) return 1;
+      if (next_node == curr_next_node) {
+        if (word_end > curr_word_end) return 1;
+      }
+    }
+    return -1;
+  }
+  /// Returns true if all the values are equal (any value matches
+  /// next_node if next_node == NO_EDGE, any value matches word_end
+  /// if word_end is false).
+  inline bool edge_rec_match(NODE_REF next_node,
+                             bool word_end,
+                             UNICHAR_ID unichar_id,
+                             NODE_REF other_next_node,
+                             bool other_word_end,
+                             UNICHAR_ID other_unichar_id) const {
+    return ((unichar_id == other_unichar_id) &&
+            (next_node == NO_EDGE || next_node == other_next_node) &&
+            (!word_end || (word_end == other_word_end)));
+  }
+
+  /// Sets unicharset_size_.
+  /// Initializes the values of various masks from unicharset_size_.
+  void init(int unicharset_size);
+
+  /// Matches all of the words that are represented by this string.
+  /// If wildcard is set to something other than INVALID_UNICHAR_ID,
+  /// the *'s in this string are interpreted as wildcards.
+  /// WERD_CHOICE param is not passed by const so that wildcard searches
+  /// can modify it and work without having to copy WERD_CHOICEs.
+  bool match_words(WERD_CHOICE *word, int32_t index,
+                   NODE_REF node, UNICHAR_ID wildcard) const;
+
+  // Recursively iterate over all words in a dawg (see public iterate_words).
+  void iterate_words_rec(const WERD_CHOICE& word_so_far,
+                         NODE_REF to_explore,
+                         std::function<void(const WERD_CHOICE*)> cb) const;
+
+  // Member Variables.
+  STRING lang_;
+  DawgType type_;
+  /// Permuter code that should be used if the word is found in this Dawg.
+  PermuterType perm_;
+  // Variables to construct various edge masks. Formerly:
+  // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64
+  // #define FLAGS_MASK (int64_t) 0x0000000700000000i64
+  // #define LETTER_MASK (int64_t) 0x00000000ffffffffi64
+  uint64_t next_node_mask_ = 0;
+  uint64_t flags_mask_ = 0;
+  uint64_t letter_mask_ = 0;
+  // Set by init(); not default-initialized here but assigned in the ctor.
+  int unicharset_size_;
+  int flag_start_bit_ = 0;
+  int next_node_start_bit_ = 0;
+  // Level of debug statements to print to stdout.
+  int debug_level_;
+};
+
+//
+// DawgPosition keeps track of where we are in the primary dawg we're searching
+// as well as where we may be in the "punctuation dawg" which may provide
+// surrounding context.
+//
+// Example:
+// punctuation dawg -- space is the "pattern character"
+// " " // no punctuation
+// "' '" // leading and trailing apostrophes
+// " '" // trailing apostrophe
+// word dawg:
+// "cat"
+// "cab"
+// "cat's"
+//
+// DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
+//
+// DawgPosition(-1, NO_EDGE, p, pe, false)
+// We're in the punctuation dawg, no other dawg has been started.
+// (1) If there's a pattern edge as a punc dawg child of us,
+// for each punc-following dawg starting with ch, produce:
+// Result: DawgPosition(k, w, p', false)
+// (2) If there's a valid continuation in the punc dawg, produce:
+// Result: DawgPosition(-k, NO_EDGE, p', false)
+//
+// DawgPosition(k, w, -1, NO_EDGE, false)
+// We're in dawg k. Going back to punctuation dawg is not an option.
+// Follow ch in dawg k.
+//
+// DawgPosition(k, w, p, pe, false)
+// We're in dawg k. Continue in dawg k and/or go back to the punc dawg.
+// If ending, check that the punctuation dawg is also ok to end here.
+//
+// DawgPosition(k, w, p, pe true)
+// We're back in the punctuation dawg. Continuing there is the only option.
+struct DawgPosition {
+  DawgPosition() = default;
+  DawgPosition(int dawg_idx, EDGE_REF dawgref,
+               int punc_idx, EDGE_REF puncref,
+               bool backtopunc)
+      : dawg_ref(dawgref), punc_ref(puncref),
+        dawg_index(dawg_idx), punc_index(punc_idx),
+        back_to_punc(backtopunc) {
+  }
+  // Memberwise equality over all five fields.
+  // NOTE(review): not declared const, so it cannot be invoked through a
+  // const reference -- confirm whether that is intentional.
+  bool operator==(const DawgPosition &other) {
+    return dawg_index == other.dawg_index &&
+        dawg_ref == other.dawg_ref &&
+        punc_index == other.punc_index &&
+        punc_ref == other.punc_ref &&
+        back_to_punc == other.back_to_punc;
+  }
+
+  // Current edge in the primary dawg and in the punctuation dawg.
+  EDGE_REF dawg_ref = NO_EDGE;
+  EDGE_REF punc_ref = NO_EDGE;
+  // Indices identifying which dawgs the refs above belong to; -1 = none.
+  int8_t dawg_index = -1;
+  int8_t punc_index = -1;
+  // Have we returned to the punc dawg at the end of the word?
+  bool back_to_punc = false;
+};
+
+/// Vector of DawgPositions offering duplicate-free insertion via
+/// add_unique() (linear scan over the existing entries).
+class DawgPositionVector : public GenericVector<DawgPosition> {
+ public:
+  /// Adds an entry for the given dawg_index with the given node to the vec.
+  /// Returns false if the same entry already exists in the vector,
+  /// true otherwise.
+  inline bool add_unique(const DawgPosition &new_pos,
+                         bool debug,
+                         const char *debug_msg) {
+    for (int i = 0; i < size(); ++i) {
+      if (data_[i] == new_pos) return false;
+    }
+    push_back(new_pos);
+    if (debug) {
+      tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n",
+              debug_msg, new_pos.dawg_index, new_pos.dawg_ref,
+              new_pos.punc_ref, new_pos.back_to_punc ? " returned" : "");
+    }
+    return true;
+  }
+};
+
+//
+/// Concrete class that can operate on a compacted (squished) Dawg (read,
+/// search and write to file). This class is read-only in the sense that
+/// new words can not be added to an instance of SquishedDawg.
+/// The underlying representation of the nodes and edges in SquishedDawg
+/// is stored as a contiguous EDGE_ARRAY (read from file or given as an
+/// argument to the constructor).
+//
+class TESS_API SquishedDawg : public Dawg {
+ public:
+ SquishedDawg(DawgType type, const STRING &lang, PermuterType perm,
+ int debug_level)
+ : Dawg(type, lang, perm, debug_level) {}
+ SquishedDawg(const char *filename, DawgType type, const STRING &lang,
+ PermuterType perm, int debug_level)
+ : Dawg(type, lang, perm, debug_level) {
+ TFile file;
+ ASSERT_HOST(file.Open(filename, nullptr));
+ ASSERT_HOST(read_squished_dawg(&file));
+ num_forward_edges_in_node0 = num_forward_edges(0);
+ }
+ SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
+ const STRING &lang, PermuterType perm, int unicharset_size,
+ int debug_level)
+ : Dawg(type, lang, perm, debug_level),
+ edges_(edges),
+ num_edges_(num_edges) {
+ init(unicharset_size);
+ num_forward_edges_in_node0 = num_forward_edges(0);
+ if (debug_level > 3) print_all("SquishedDawg:");
+ }
+ ~SquishedDawg() override;
+
+ // Loads using the given TFile. Returns false on failure.
+ bool Load(TFile *fp) {
+ if (!read_squished_dawg(fp)) return false;
+ num_forward_edges_in_node0 = num_forward_edges(0);
+ return true;
+ }
+
+ int NumEdges() { return num_edges_; }
+
+ /// Returns the edge that corresponds to the letter out of this node.
+ EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
+ bool word_end) const override;
+
+ /// Fills the given NodeChildVector with all the unichar ids (and the
+ /// corresponding EDGE_REFs) for which there is an edge out of this node.
+ void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
+ bool word_end) const override {
+ EDGE_REF edge = node;
+ if (!edge_occupied(edge) || edge == NO_EDGE) return;
+ assert(forward_edge(edge)); // we don't expect any backward edges to
+ do { // be present when this function is called
+ if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
+ vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
+ }
+ } while (!last_edge(edge++));
+ }
+
+ /// Returns the next node visited by following the edge
+ /// indicated by the given EDGE_REF.
+ NODE_REF next_node(EDGE_REF edge) const override {
+ return next_node_from_edge_rec((edges_[edge]));
+ }
+
+ /// Returns true if the edge indicated by the given EDGE_REF
+ /// marks the end of a word.
+ bool end_of_word(EDGE_REF edge_ref) const override {
+ return end_of_word_from_edge_rec((edges_[edge_ref]));
+ }
+
+ /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
+ UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
+ return unichar_id_from_edge_rec((edges_[edge_ref]));
+ }
+
+ /// Prints the contents of the node indicated by the given NODE_REF.
+ /// At most max_num_edges will be printed.
+ void print_node(NODE_REF node, int max_num_edges) const override;
+
+ /// Writes the squished/reduced Dawg to a file.
+ bool write_squished_dawg(TFile *file);
+
+ /// Opens the file with the given filename and writes the
+ /// squished/reduced Dawg to the file.
+ bool write_squished_dawg(const char *filename) {
+ TFile file;
+ file.OpenWrite(nullptr);
+ if (!this->write_squished_dawg(&file)) {
+ tprintf("Error serializing %s\n", filename);
+ return false;
+ }
+ if (!file.CloseWrite(filename, nullptr)) {
+ tprintf("Error writing file %s\n", filename);
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ /// Sets the next node link for this edge.
+ inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
+ set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
+ }
+ /// Sets the edge to be empty.
+ inline void set_empty_edge(EDGE_REF edge_ref) {
+ (edges_[edge_ref] = next_node_mask_);
+ }
+ /// Goes through all the edges and clears each one out.
+ inline void clear_all_edges() {
+ for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
+ }
+ /// Clears the last flag of this edge.
+ inline void clear_marker_flag(EDGE_REF edge_ref) {
+ (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
+ }
+ /// Returns true if this edge is in the forward direction.
+ inline bool forward_edge(EDGE_REF edge_ref) const {
+ return (edge_occupied(edge_ref) &&
+ (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
+ }
+ /// Returns true if this edge is in the backward direction.
+ inline bool backward_edge(EDGE_REF edge_ref) const {
+ return (edge_occupied(edge_ref) &&
+ (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
+ }
+ /// Returns true if the edge spot in this location is occupied.
+ inline bool edge_occupied(EDGE_REF edge_ref) const {
+ return (edges_[edge_ref] != next_node_mask_);
+ }
+ /// Returns true if this edge is the last edge in a sequence.
+ inline bool last_edge(EDGE_REF edge_ref) const {
+ return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
+ }
+
+ /// Counts and returns the number of forward edges in this node.
+ int32_t num_forward_edges(NODE_REF node) const;
+
+ /// Reads SquishedDawg from a file.
+ bool read_squished_dawg(TFile *file);
+
+ /// Prints the contents of an edge indicated by the given EDGE_REF.
+ void print_edge(EDGE_REF edge) const;
+
+ /// Prints the contents of the SquishedDawg.
+ void print_all(const char* msg) {
+ tprintf("\n__________________________\n%s\n", msg);
+ for (int i = 0; i < num_edges_; ++i) print_edge(i);
+ tprintf("__________________________\n");
+ }
+ /// Constructs a mapping from the memory node indices to disk node indices.
+ std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes) const;
+
+ // Member variables.
+ EDGE_ARRAY edges_ = nullptr;
+ int32_t num_edges_ = 0;
+ int num_forward_edges_in_node0 = 0;
+};
+
+} // namespace tesseract
+
+#endif // DICT_DAWG_H_
diff --git a/tesseract/src/dict/dawg_cache.cpp b/tesseract/src/dict/dawg_cache.cpp
new file mode 100644
index 00000000..0ade27e6
--- /dev/null
+++ b/tesseract/src/dict/dawg_cache.cpp
@@ -0,0 +1,96 @@
+///////////////////////////////////////////////////////////////////////
+// File: dawg_cache.cpp
+// Description: A class that knows about loading and caching dawgs.
+// Author: David Eger
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "dawg_cache.h"
+
+#include "dawg.h"
+#include "object_cache.h"
+#include "strngs.h"
+#include "tessdatamanager.h"
+
+namespace tesseract {
+
+struct DawgLoader {
+ DawgLoader(const STRING &lang, TessdataType tessdata_dawg_type,
+ int dawg_debug_level, TessdataManager *data_file)
+ : lang_(lang),
+ data_file_(data_file),
+ tessdata_dawg_type_(tessdata_dawg_type),
+ dawg_debug_level_(dawg_debug_level) {}
+
+ Dawg *Load();
+
+ STRING lang_;
+ TessdataManager *data_file_;
+ TessdataType tessdata_dawg_type_;
+ int dawg_debug_level_;
+};
+
+Dawg *DawgCache::GetSquishedDawg(const STRING &lang,
+ TessdataType tessdata_dawg_type,
+ int debug_level, TessdataManager *data_file) {
+ std::string data_id = data_file->GetDataFileName();
+ data_id += kTessdataFileSuffixes[tessdata_dawg_type];
+ DawgLoader loader(lang, tessdata_dawg_type, debug_level, data_file);
+ return dawgs_.Get(data_id, std::bind(&DawgLoader::Load, &loader));
+}
+
+Dawg *DawgLoader::Load() {
+ TFile fp;
+ if (!data_file_->GetComponent(tessdata_dawg_type_, &fp)) return nullptr;
+ DawgType dawg_type;
+ PermuterType perm_type;
+ switch (tessdata_dawg_type_) {
+ case TESSDATA_PUNC_DAWG:
+ case TESSDATA_LSTM_PUNC_DAWG:
+ dawg_type = DAWG_TYPE_PUNCTUATION;
+ perm_type = PUNC_PERM;
+ break;
+ case TESSDATA_SYSTEM_DAWG:
+ case TESSDATA_LSTM_SYSTEM_DAWG:
+ dawg_type = DAWG_TYPE_WORD;
+ perm_type = SYSTEM_DAWG_PERM;
+ break;
+ case TESSDATA_NUMBER_DAWG:
+ case TESSDATA_LSTM_NUMBER_DAWG:
+ dawg_type = DAWG_TYPE_NUMBER;
+ perm_type = NUMBER_PERM;
+ break;
+ case TESSDATA_BIGRAM_DAWG:
+ dawg_type = DAWG_TYPE_WORD; // doesn't actually matter
+ perm_type = COMPOUND_PERM; // doesn't actually matter
+ break;
+ case TESSDATA_UNAMBIG_DAWG:
+ dawg_type = DAWG_TYPE_WORD;
+ perm_type = SYSTEM_DAWG_PERM;
+ break;
+ case TESSDATA_FREQ_DAWG:
+ dawg_type = DAWG_TYPE_WORD;
+ perm_type = FREQ_DAWG_PERM;
+ break;
+ default:
+ return nullptr;
+ }
+ auto *retval =
+ new SquishedDawg(dawg_type, lang_, perm_type, dawg_debug_level_);
+ if (retval->Load(&fp)) return retval;
+ delete retval;
+ return nullptr;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/dawg_cache.h b/tesseract/src/dict/dawg_cache.h
new file mode 100644
index 00000000..83233802
--- /dev/null
+++ b/tesseract/src/dict/dawg_cache.h
@@ -0,0 +1,53 @@
+///////////////////////////////////////////////////////////////////////
+// File: dawg_cache.h
+// Description: A class that knows about loading and caching dawgs.
+// Author: David Eger
+// Created: Fri Jan 27 12:08:00 PST 2012
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_DICT_DAWG_CACHE_H_
+#define TESSERACT_DICT_DAWG_CACHE_H_
+
+#include "dawg.h"
+#include "object_cache.h"
+#include "strngs.h"
+#include "tessdatamanager.h"
+
+namespace tesseract {
+
+class DawgCache {
+ public:
+ Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type,
+ int debug_level, TessdataManager *data_file);
+
+ // If we manage the given dawg, decrement its count,
+ // and possibly delete it if the count reaches zero.
+ // If dawg is unknown to us, return false.
+ bool FreeDawg(Dawg *dawg) {
+ return dawgs_.Free(dawg);
+ }
+
+ // Free up any currently unused dawgs.
+ void DeleteUnusedDawgs() {
+ dawgs_.DeleteUnusedObjects();
+ }
+
+ private:
+ ObjectCache<Dawg> dawgs_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_DICT_DAWG_CACHE_H_
diff --git a/tesseract/src/dict/dict.cpp b/tesseract/src/dict/dict.cpp
new file mode 100644
index 00000000..eb69b569
--- /dev/null
+++ b/tesseract/src/dict/dict.cpp
@@ -0,0 +1,888 @@
+///////////////////////////////////////////////////////////////////////
+// File: dict.cpp
+// Description: dict class.
+// Author: Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "dict.h"
+
+#include "tprintf.h"
+
+#include <cstdio>
+
+namespace tesseract {
+
+class Image;
+
+Dict::Dict(CCUtil* ccutil)
+ : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
+ probability_in_context_(&tesseract::Dict::def_probability_in_context),
+ ccutil_(ccutil),
+ wildcard_unichar_id_(INVALID_UNICHAR_ID),
+ apostrophe_unichar_id_(INVALID_UNICHAR_ID),
+ question_unichar_id_(INVALID_UNICHAR_ID),
+ slash_unichar_id_(INVALID_UNICHAR_ID),
+ hyphen_unichar_id_(INVALID_UNICHAR_ID),
+ STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
+ getCCUtil()->params()),
+ STRING_INIT_MEMBER(user_words_suffix, "",
+ "A suffix of user-provided words located in tessdata.",
+ getCCUtil()->params()),
+ STRING_MEMBER(user_patterns_file, "",
+ "A filename of user-provided patterns.",
+ getCCUtil()->params()),
+ STRING_INIT_MEMBER(user_patterns_suffix, "",
+ "A suffix of user-provided patterns located in "
+ "tessdata.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_punc_dawg, true,
+ "Load dawg with punctuation"
+ " patterns.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_number_dawg, true,
+ "Load dawg with number"
+ " patterns.",
+ getCCUtil()->params()),
+ BOOL_INIT_MEMBER(load_bigram_dawg, true,
+ "Load dawg with special word "
+ "bigrams.",
+ getCCUtil()->params()),
+ double_MEMBER(xheight_penalty_subscripts, 0.125,
+ "Score penalty (0.1 = 10%) added if there are subscripts "
+ "or superscripts in a word, but it is otherwise OK.",
+ getCCUtil()->params()),
+ double_MEMBER(xheight_penalty_inconsistent, 0.25,
+ "Score penalty (0.1 = 10%) added if an xheight is "
+ "inconsistent.",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
+ "Score multiplier for word matches which have good case and"
+ " are frequent in the given language (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_case_ok, 1.1,
+ "Score multiplier for word matches that have good case "
+ "(lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
+ "Default score multiplier for word matches, which may have "
+ "case issues (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_dict_nonword, 1.25,
+ "Score multiplier for glyph fragment segmentations which "
+ "do not match a dictionary word (lower is better).",
+ getCCUtil()->params()),
+ double_MEMBER(segment_penalty_garbage, 1.50,
+ "Score multiplier for poorly cased strings that are not in"
+ " the dictionary and generally look like garbage (lower is"
+ " better).",
+ getCCUtil()->params()),
+ STRING_MEMBER(output_ambig_words_file, "",
+ "Output file for ambiguities found in the dictionary",
+ getCCUtil()->params()),
+ INT_MEMBER(dawg_debug_level, 0,
+ "Set to 1 for general debug info"
+ ", to 2 for more details, to 3 to see all the debug messages",
+ getCCUtil()->params()),
+ INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
+ getCCUtil()->params()),
+ BOOL_MEMBER(use_only_first_uft8_step, false,
+ "Use only the first UTF8 step of the given string"
+ " when computing log probabilities.",
+ getCCUtil()->params()),
+ double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_nondict_certainty_base, -2.50,
+ "Certainty threshold for non-dict words",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
+ "Reject certainty offset", getCCUtil()->params()),
+ INT_MEMBER(stopper_smallword_size, 2,
+ "Size of dict word to be treated as non-dict word",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_certainty_per_char, -0.50,
+ "Certainty to add"
+ " for each dict char above small word size.",
+ getCCUtil()->params()),
+ double_MEMBER(stopper_allowable_character_badness, 3.0,
+ "Max certaintly variation allowed in a word (in sigma)",
+ getCCUtil()->params()),
+ INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
+ getCCUtil()->params()),
+ BOOL_MEMBER(stopper_no_acceptable_choices, false,
+ "Make AcceptableChoice() always return false. Useful"
+ " when there is a need to explore all segmentations",
+ getCCUtil()->params()),
+ INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
+ "Max words to keep in list", getCCUtil()->params()),
+ STRING_MEMBER(word_to_debug, "",
+ "Word for which stopper debug"
+ " information should be printed to stdout",
+ getCCUtil()->params()),
+ BOOL_MEMBER(segment_nonalphabetic_script, false,
+ "Don't use any alphabetic-specific tricks."
+ " Set to true in the traineddata config file for"
+ " scripts that are cursive or inherently fixed-pitch",
+ getCCUtil()->params()),
+ BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
+ getCCUtil()->params()),
+ double_MEMBER(doc_dict_pending_threshold, 0.0,
+ "Worst certainty for using pending dictionary",
+ getCCUtil()->params()),
+ double_MEMBER(doc_dict_certainty_threshold, -2.25,
+ "Worst certainty for words that can be inserted into the"
+ " document dictionary",
+ getCCUtil()->params()),
+ INT_MEMBER(max_permuter_attempts, 10000,
+ "Maximum number of different"
+ " character choices to consider during permutation."
+ " This limit is especially useful when user patterns"
+ " are specified, since overly generic patterns can result in"
+ " dawg search exploring an overly large number of options.",
+ getCCUtil()->params()) {
+ reject_offset_ = 0.0;
+ go_deeper_fxn_ = nullptr;
+ hyphen_word_ = nullptr;
+ last_word_on_line_ = false;
+ document_words_ = nullptr;
+ dawg_cache_ = nullptr;
+ dawg_cache_is_ours_ = false;
+ pending_words_ = nullptr;
+ bigram_dawg_ = nullptr;
+ freq_dawg_ = nullptr;
+ punc_dawg_ = nullptr;
+ unambig_dawg_ = nullptr;
+ wordseg_rating_adjust_factor_ = -1.0f;
+ output_ambig_words_file_ = nullptr;
+}
+
+Dict::~Dict() {
+ End();
+ delete hyphen_word_;
+ if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
+}
+
+DawgCache* Dict::GlobalDawgCache() {
+ // This global cache (a singleton) will outlive every Tesseract instance
+ // (even those that someone else might declare as global statics).
+ static DawgCache cache;
+ return &cache;
+}
+
+// Sets up ready for a Load or LoadLSTM.
+void Dict::SetupForLoad(DawgCache* dawg_cache) {
+ if (dawgs_.size() != 0) this->End();
+
+ apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
+ question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
+ slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
+ hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
+
+ if (dawg_cache != nullptr) {
+ dawg_cache_ = dawg_cache;
+ dawg_cache_is_ours_ = false;
+ } else {
+ dawg_cache_ = new DawgCache();
+ dawg_cache_is_ours_ = true;
+ }
+}
+
+// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
+void Dict::Load(const STRING& lang, TessdataManager* data_file) {
+ // Load dawgs_.
+ if (load_punc_dawg) {
+ punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
+ dawg_debug_level, data_file);
+ if (punc_dawg_) dawgs_ += punc_dawg_;
+ }
+ if (load_system_dawg) {
+ Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
+ lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
+ if (system_dawg) dawgs_ += system_dawg;
+ }
+ if (load_number_dawg) {
+ Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
+ lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
+ if (number_dawg) dawgs_ += number_dawg;
+ }
+ if (load_bigram_dawg) {
+ bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
+ dawg_debug_level, data_file);
+ // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
+ // dawgs_!!
+ }
+ if (load_freq_dawg) {
+ freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
+ dawg_debug_level, data_file);
+ if (freq_dawg_) dawgs_ += freq_dawg_;
+ }
+ if (load_unambig_dawg) {
+ unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
+ dawg_debug_level, data_file);
+ if (unambig_dawg_) dawgs_ += unambig_dawg_;
+ }
+
+ STRING name;
+ if (!user_words_suffix.empty() || !user_words_file.empty()) {
+ Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ if (!user_words_file.empty()) {
+ name = user_words_file;
+ } else {
+ name = getCCUtil()->language_data_path_prefix;
+ name += user_words_suffix;
+ }
+ if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
+ Trie::RRP_REVERSE_IF_HAS_RTL)) {
+ tprintf("Error: failed to load %s\n", name.c_str());
+ delete trie_ptr;
+ } else {
+ dawgs_ += trie_ptr;
+ }
+ }
+
+ if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
+ Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ trie_ptr->initialize_patterns(&(getUnicharset()));
+ if (!user_patterns_file.empty()) {
+ name = user_patterns_file;
+ } else {
+ name = getCCUtil()->language_data_path_prefix;
+ name += user_patterns_suffix;
+ }
+ if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
+ tprintf("Error: failed to load %s\n", name.c_str());
+ delete trie_ptr;
+ } else {
+ dawgs_ += trie_ptr;
+ }
+ }
+
+ document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ dawgs_ += document_words_;
+
+ // This dawg is temporary and should not be searched by letter_is_ok.
+ pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
+ getUnicharset().size(), dawg_debug_level);
+}
+
+// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
+void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) {
+ // Load dawgs_.
+ if (load_punc_dawg) {
+ punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
+ dawg_debug_level, data_file);
+ if (punc_dawg_) dawgs_ += punc_dawg_;
+ }
+ if (load_system_dawg) {
+ Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
+ lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
+ if (system_dawg) dawgs_ += system_dawg;
+ }
+ if (load_number_dawg) {
+ Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
+ lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
+ if (number_dawg) dawgs_ += number_dawg;
+ }
+
+ // stolen from Dict::Load (but needs params_ from Tesseract
+ // langdata/config/api):
+ STRING name;
+ if (!user_words_suffix.empty() || !user_words_file.empty()) {
+ Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ if (!user_words_file.empty()) {
+ name = user_words_file;
+ } else {
+ name = getCCUtil()->language_data_path_prefix;
+ name += user_words_suffix;
+ }
+ if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
+ Trie::RRP_REVERSE_IF_HAS_RTL)) {
+ tprintf("Error: failed to load %s\n", name.c_str());
+ delete trie_ptr;
+ } else {
+ dawgs_ += trie_ptr;
+ }
+ }
+
+ if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
+ Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
+ getUnicharset().size(), dawg_debug_level);
+ trie_ptr->initialize_patterns(&(getUnicharset()));
+ if (!user_patterns_file.empty()) {
+ name = user_patterns_file;
+ } else {
+ name = getCCUtil()->language_data_path_prefix;
+ name += user_patterns_suffix;
+ }
+ if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
+ tprintf("Error: failed to load %s\n", name.c_str());
+ delete trie_ptr;
+ } else {
+ dawgs_ += trie_ptr;
+ }
+ }
+}
+
+// Completes the loading process after Load() and/or LoadLSTM().
+// Returns false if no dictionaries were loaded.
+bool Dict::FinishLoad() {
+ if (dawgs_.empty()) return false;
+ // Construct a list of corresponding successors for each dawg. Each entry, i,
+ // in the successors_ vector is a vector of integers that represent the
+ // indices into the dawgs_ vector of the successors for dawg i.
+ successors_.reserve(dawgs_.size());
+ for (int i = 0; i < dawgs_.size(); ++i) {
+ const Dawg* dawg = dawgs_[i];
+ auto* lst = new SuccessorList();
+ for (int j = 0; j < dawgs_.size(); ++j) {
+ const Dawg* other = dawgs_[j];
+ if (dawg != nullptr && other != nullptr &&
+ (dawg->lang() == other->lang()) &&
+ kDawgSuccessors[dawg->type()][other->type()])
+ *lst += j;
+ }
+ successors_ += lst;
+ }
+ return true;
+}
+
+void Dict::End() {
+ if (dawgs_.size() == 0) return; // Not safe to call twice.
+ for (int i = 0; i < dawgs_.size(); i++) {
+ if (!dawg_cache_->FreeDawg(dawgs_[i])) {
+ delete dawgs_[i];
+ }
+ }
+ dawg_cache_->FreeDawg(bigram_dawg_);
+ if (dawg_cache_is_ours_) {
+ delete dawg_cache_;
+ dawg_cache_ = nullptr;
+ }
+ successors_.delete_data_pointers();
+ dawgs_.clear();
+ successors_.clear();
+ document_words_ = nullptr;
+ delete pending_words_;
+ pending_words_ = nullptr;
+}
+
+// Returns true if in light of the current state unichar_id is allowed
+// according to at least one of the dawgs in the dawgs_ vector.
+// See more extensive comments in dict.h where this function is declared.
+int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const {
+ auto* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
+
+ ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+
+ if (dawg_debug_level >= 3) {
+ tprintf(
+ "def_letter_is_okay: current unichar=%s word_end=%d"
+ " num active dawgs=%d\n",
+ getUnicharset().debug_str(unichar_id).c_str(), word_end,
+ dawg_args->active_dawgs->size());
+ }
+
+ // Do not accept words that contain kPatternUnicharID.
+ // (otherwise pattern dawgs would not function correctly).
+ // Do not accept words containing INVALID_UNICHAR_IDs.
+ if (unichar_id == Dawg::kPatternUnicharID ||
+ unichar_id == INVALID_UNICHAR_ID) {
+ dawg_args->permuter = NO_PERM;
+ return NO_PERM;
+ }
+
+ // Initialization.
+ PermuterType curr_perm = NO_PERM;
+ dawg_args->updated_dawgs->clear();
+ dawg_args->valid_end = false;
+
+ // Go over the active_dawgs vector and insert DawgPosition records
+ // with the updated ref (an edge with the corresponding unichar id) into
+ // dawg_args->updated_pos.
+ for (int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
+ const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
+ const Dawg* punc_dawg =
+ pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
+ const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
+
+ if (!dawg && !punc_dawg) {
+ // shouldn't happen.
+ tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
+ continue;
+ }
+ if (!dawg) {
+ // We're in the punctuation dawg. A core dawg has not been chosen.
+ NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
+ EDGE_REF punc_transition_edge =
+ punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
+ if (punc_transition_edge != NO_EDGE) {
+ // Find all successors, and see which can transition.
+ const SuccessorList& slist = *(successors_[pos.punc_index]);
+ for (int s = 0; s < slist.size(); ++s) {
+ int sdawg_index = slist[s];
+ const Dawg* sdawg = dawgs_[sdawg_index];
+ UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
+ EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
+ if (dawg_edge != NO_EDGE) {
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in dawg %d\n", sdawg_index);
+ }
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
+ punc_transition_edge, false),
+ dawg_debug_level > 0,
+ "Append transition from punc dawg to current dawgs: ");
+ if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
+ if (sdawg->end_of_word(dawg_edge) &&
+ punc_dawg->end_of_word(punc_transition_edge))
+ dawg_args->valid_end = true;
+ }
+ }
+ }
+ EDGE_REF punc_edge =
+ punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
+ if (punc_edge != NO_EDGE) {
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in punctuation dawg\n");
+ }
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
+ dawg_debug_level > 0, "Extend punctuation dawg: ");
+ if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
+ if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
+ }
+ continue;
+ }
+
+ if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
+ // We can end the main word here.
+ // If we can continue on the punc ref, add that possibility.
+ NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
+ EDGE_REF punc_edge =
+ punc_node == NO_EDGE
+ ? NO_EDGE
+ : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
+ if (punc_edge != NO_EDGE) {
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
+ punc_edge, true),
+ dawg_debug_level > 0, "Return to punctuation dawg: ");
+ if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
+ if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
+ }
+ }
+
+ if (pos.back_to_punc) continue;
+
+ // If we are dealing with the pattern dawg, look up all the
+ // possible edges, not only for the exact unichar_id, but also
+ // for all its character classes (alpha, digit, etc).
+ if (dawg->type() == DAWG_TYPE_PATTERN) {
+ ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
+ &curr_perm);
+ // There can't be any successors to dawg that is of type
+ // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
+ continue;
+ }
+
+ // Find the edge out of the node for the unichar_id.
+ NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
+ EDGE_REF edge =
+ (node == NO_EDGE)
+ ? NO_EDGE
+ : dawg->edge_char_of(
+ node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
+
+ if (dawg_debug_level >= 3) {
+ tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
+ pos.dawg_index, node, edge);
+ }
+
+ if (edge != NO_EDGE) { // the unichar was found in the current dawg
+ if (dawg_debug_level >= 3) {
+ tprintf("Letter found in dawg %d\n", pos.dawg_index);
+ }
+ if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
+ if (dawg_debug_level >= 3) {
+ tprintf("Punctuation constraint not satisfied at end of word.\n");
+ }
+ continue;
+ }
+ if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
+ if (dawg->end_of_word(edge) &&
+ (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref)))
+ dawg_args->valid_end = true;
+ dawg_args->updated_dawgs->add_unique(
+ DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
+ false),
+ dawg_debug_level > 0,
+ "Append current dawg to updated active dawgs: ");
+ }
+ } // end for
+ // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
+ // or if we found the current letter in a non-punctuation dawg. This
+ // allows preserving information on which dawg the "core" word came from.
+ // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
+ if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
+ (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
+ dawg_args->permuter = curr_perm;
+ }
+ if (dawg_debug_level >= 2) {
+ tprintf("Returning %d for permuter code for this character.\n",
+ dawg_args->permuter);
+ }
+ return dawg_args->permuter;
+}
+
+// For unichar_id, and every character-class pattern it maps to, looks up the
+// matching outgoing edge (k == 0) and self-loop edge (k == 1) in the given
+// pattern dawg. Each valid edge is recorded in dawg_args->updated_dawgs;
+// *curr_perm is raised to the dawg's permuter code on any match, and
+// dawg_args->valid_end is set when a matched edge ends a word.
+void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos,
+                               UNICHAR_ID unichar_id, bool word_end,
+                               DawgArgs* dawg_args,
+                               PermuterType* curr_perm) const {
+  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
+  // Try to find the edge corresponding to the exact unichar_id and to all the
+  // edges corresponding to the character class of unichar_id.
+  GenericVector<UNICHAR_ID> unichar_id_patterns;
+  unichar_id_patterns.push_back(unichar_id);
+  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
+                               &unichar_id_patterns);
+  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
+    // On the first iteration check all the outgoing edges.
+    // On the second iteration check all self-loops.
+    for (int k = 0; k < 2; ++k) {
+      EDGE_REF edge =
+          (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
+                   : dawg->pattern_loop_edge(pos.dawg_ref,
+                                             unichar_id_patterns[i], word_end);
+      if (edge == NO_EDGE) continue;  // no match for this pattern / edge kind
+      if (dawg_debug_level >= 3) {
+        tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
+                pos.dawg_index, node, edge);
+        tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
+      }
+      if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
+      if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
+      dawg_args->updated_dawgs->add_unique(
+          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
+                       pos.back_to_punc),
+          dawg_debug_level > 0,
+          "Append current dawg to updated active dawgs: ");
+    }
+  }
+}
+
+// Fill the given active_dawgs vector with dawgs that could contain the
+// beginning of the word. If hyphenated() returns true, copy the entries
+// from hyphen_active_dawgs_ instead.
+// ambigs_mode is forwarded as the suppress_patterns flag of default_dawgs(),
+// i.e. pattern dawgs are excluded when it is true.
+void Dict::init_active_dawgs(DawgPositionVector* active_dawgs,
+                             bool ambigs_mode) const {
+  int i;
+  if (hyphenated()) {
+    // Resume from the dawg positions saved for the first half of the
+    // hyphenated word.
+    *active_dawgs = hyphen_active_dawgs_;
+    if (dawg_debug_level >= 3) {
+      for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
+        tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
+                hyphen_active_dawgs_[i].dawg_index,
+                hyphen_active_dawgs_[i].dawg_ref);
+      }
+    }
+  } else {
+    default_dawgs(active_dawgs, ambigs_mode);
+  }
+}
+
+// Fills dawg_pos_vec with the default beginning-of-word positions: one entry
+// per punctuation dawg, plus one entry for every other dawg that is NOT
+// reachable from the punctuation dawg (or for every dawg when no usable
+// punctuation dawg is loaded). Pattern dawgs are skipped when
+// suppress_patterns is true.
+void Dict::default_dawgs(DawgPositionVector* dawg_pos_vec,
+                         bool suppress_patterns) const {
+  // A punc dawg is "available" only if it can start with the empty pattern.
+  bool punc_dawg_available =
+      (punc_dawg_ != nullptr) &&
+      punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
+
+  for (int i = 0; i < dawgs_.size(); i++) {
+    if (dawgs_[i] != nullptr &&
+        !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
+      int dawg_ty = dawgs_[i]->type();
+      // Dawg types listed as successors of the punc dawg are reached through
+      // it, so they need no standalone entry when the punc dawg is usable.
+      bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
+      if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
+        *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
+        if (dawg_debug_level >= 3) {
+          tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
+                  NO_EDGE);
+        }
+      } else if (!punc_dawg_available || !subsumed_by_punc) {
+        *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
+        if (dawg_debug_level >= 3) {
+          tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
+        }
+      }
+    }
+  }
+}
+
+// Adds a word found on this document to the document-specific dictionary
+// (document_words_). Words that are already valid dictionary words, shorter
+// than 2 unichars, or dominated by a single repeated unichar are discarded.
+// Marginal words (low certainty, or exactly two letters) are staged in
+// pending_words_ on first sighting and only promoted to document_words_
+// when seen again.
+void Dict::add_document_word(const WERD_CHOICE& best_choice) {
+  // Do not add hyphenated word parts to the document dawg.
+  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
+  // called when the first part of the hyphenated word is
+  // discovered and while the second part of the word is recognized.
+  // hyphen_word_ is cleared in cc_recg() before the next word on
+  // the line is recognized.
+  if (hyphen_word_) return;
+
+  int stringlen = best_choice.length();
+
+  if (valid_word(best_choice) || stringlen < 2) return;
+
+  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
+  if (best_choice.length() >= kDocDictMaxRepChars) {
+    int num_rep_chars = 1;
+    UNICHAR_ID uch_id = best_choice.unichar_id(0);
+    for (int i = 1; i < best_choice.length(); ++i) {
+      if (best_choice.unichar_id(i) != uch_id) {
+        num_rep_chars = 1;
+        uch_id = best_choice.unichar_id(i);
+      } else {
+        ++num_rep_chars;
+        if (num_rep_chars == kDocDictMaxRepChars) return;
+      }
+    }
+  }
+
+  // Marginal words: on the first sighting, stage in pending_words_ (two-letter
+  // words only when both letters are uppercase) and return. A word that is
+  // already pending falls through and gets added to document_words_ below.
+  if (best_choice.certainty() < doc_dict_certainty_threshold ||
+      stringlen == 2) {
+    if (best_choice.certainty() < doc_dict_pending_threshold) return;
+
+    if (!pending_words_->word_in_dawg(best_choice)) {
+      if (stringlen > 2 ||
+          (stringlen == 2 &&
+           getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
+           getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
+        pending_words_->add_word_to_dawg(best_choice);
+      }
+      return;
+    }
+  }
+
+  // Optionally append each accepted document word to <imagefile>.doc.
+  if (save_doc_words) {
+    STRING filename(getCCUtil()->imagefile);
+    filename += ".doc";
+    FILE* doc_word_file = fopen(filename.c_str(), "a");
+    if (doc_word_file == nullptr) {
+      tprintf("Error: Could not open file %s\n", filename.c_str());
+      ASSERT_HOST(doc_word_file);
+    }
+    fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
+    fclose(doc_word_file);
+  }
+  document_words_->add_word_to_dawg(best_choice);
+}
+
+// Adjusts the rating of the given word. Penalty factors for x-height
+// inconsistency, bad case, bad punctuation, and dictionary/non-dictionary
+// status are accumulated into adjust_factor (starting from
+// additional_adjust); the rating, padded by kRatingPad before and unpadded
+// after, is multiplied by the factor. The new rating is written back only
+// when modify_rating is true; adjust_factor is always recorded on the word.
+void Dict::adjust_word(WERD_CHOICE* word, bool nonword,
+                       XHeightConsistencyEnum xheight_consistency,
+                       float additional_adjust, bool modify_rating,
+                       bool debug) {
+  // Words whose top script is Han are exempt from case/punctuation checks.
+  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
+                 word->GetTopScriptID() == getUnicharset().han_sid());
+  bool case_is_ok = (is_han || case_ok(*word));
+  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
+
+  float adjust_factor = additional_adjust;
+  float new_rating = word->rating();
+  new_rating += kRatingPad;
+  const char* xheight_triggered = "";
+  if (word->length() > 1) {
+    // Calculate x-height and y-offset consistency penalties.
+    switch (xheight_consistency) {
+      case XH_INCONSISTENT:
+        adjust_factor += xheight_penalty_inconsistent;
+        xheight_triggered = ", xhtBAD";
+        break;
+      case XH_SUBNORMAL:
+        adjust_factor += xheight_penalty_subscripts;
+        xheight_triggered = ", xhtSUB";
+        break;
+      case XH_GOOD:
+        // leave the factor alone - all good!
+        break;
+    }
+    // TODO(eger): if nonword is true, but there is a "core" that is a dict
+    // word, negate nonword status.
+  } else {
+    if (debug) {
+      tprintf("Consistency could not be calculated.\n");
+    }
+  }
+  if (debug) {
+    tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
+            word->unichar_string().c_str(), word->rating(), xheight_triggered);
+  }
+
+  if (nonword) { // non-dictionary word
+    if (case_is_ok && punc_is_ok) {
+      adjust_factor += segment_penalty_dict_nonword;
+      new_rating *= adjust_factor;
+      if (debug) tprintf(", W");
+    } else {
+      // Bad case or punctuation: treat as garbage (heavier penalty).
+      adjust_factor += segment_penalty_garbage;
+      new_rating *= adjust_factor;
+      if (debug) {
+        if (!case_is_ok) tprintf(", C");
+        if (!punc_is_ok) tprintf(", P");
+      }
+    }
+  } else { // dictionary word
+    if (case_is_ok) {
+      if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
+        // Frequent-word dawg hit: upgrade permuter and apply lightest penalty.
+        word->set_permuter(FREQ_DAWG_PERM);
+        adjust_factor += segment_penalty_dict_frequent_word;
+        new_rating *= adjust_factor;
+        if (debug) tprintf(", F");
+      } else {
+        adjust_factor += segment_penalty_dict_case_ok;
+        new_rating *= adjust_factor;
+        if (debug) tprintf(", ");
+      }
+    } else {
+      adjust_factor += segment_penalty_dict_case_bad;
+      new_rating *= adjust_factor;
+      if (debug) tprintf(", C");
+    }
+  }
+  new_rating -= kRatingPad;
+  if (modify_rating) word->set_rating(new_rating);
+  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
+  word->set_adjust_factor(adjust_factor);
+}
+
+// Checks the word (prefixed with any saved hyphen-word state when the
+// unicharsets match) against all loaded dawgs by feeding each letter to
+// letter_is_okay_. Returns the resulting permuter code, or NO_PERM when the
+// word is empty or not accepted; NUMBER_PERM results are accepted only when
+// numbers_ok is true (see valid_word_permuter).
+int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const {
+  const WERD_CHOICE* word_ptr = &word;
+  WERD_CHOICE temp_word(word.unicharset());
+  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
+    copy_hyphen_info(&temp_word);
+    temp_word += word;
+    word_ptr = &temp_word;
+  }
+  if (word_ptr->length() == 0) return NO_PERM;
+  // Allocate vectors for holding current and updated
+  // active_dawgs and initialize them.
+  DawgPositionVector active_dawgs[2];
+  init_active_dawgs(&(active_dawgs[0]), false);
+  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
+  int last_index = word_ptr->length() - 1;
+  // Call letter_is_okay for each letter in the word.
+  // (Starts at hyphen_base_size() to skip the already-processed base word.)
+  for (int i = hyphen_base_size(); i <= last_index; ++i) {
+    if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
+                                   word_ptr->unichar_id(i), i == last_index)))
+      break;
+    // Swap active_dawgs, constraints with the corresponding updated vector.
+    // Double-buffering: the vector just filled becomes the active one.
+    if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
+      dawg_args.updated_dawgs = &(active_dawgs[0]);
+      ++(dawg_args.active_dawgs);
+    } else {
+      ++(dawg_args.updated_dawgs);
+      dawg_args.active_dawgs = &(active_dawgs[0]);
+    }
+  }
+  return valid_word_permuter(dawg_args.permuter, numbers_ok)
+             ? dawg_args.permuter
+             : NO_PERM;
+}
+
+// Do the two WERD_CHOICEs form a meaningful bigram? Strips surrounding
+// punctuation from each word, replaces digit unichars with the cached '?'
+// unichar, joins the two cores with a space, and looks the normalized pair
+// up in bigram_dawg_. Returns false if no bigram dawg is loaded.
+bool Dict::valid_bigram(const WERD_CHOICE& word1,
+                        const WERD_CHOICE& word2) const {
+  if (bigram_dawg_ == nullptr) return false;
+
+  // Extract the core word from the middle of each word with any digits
+  // replaced with question marks.
+  int w1start, w1end, w2start, w2end;
+  word1.punct_stripped(&w1start, &w1end);
+  word2.punct_stripped(&w2start, &w2end);
+
+  // We don't want to penalize a single guillemet, hyphen, etc.
+  // But our bigram list doesn't have any information about punctuation.
+  if (w1start >= w1end) return word1.length() < 3;
+  if (w2start >= w2end) return word2.length() < 3;
+
+  const UNICHARSET& uchset = getUnicharset();
+  std::vector<UNICHAR_ID> bigram_string;
+  bigram_string.reserve(w1end + w2end + 1);
+  for (int i = w1start; i < w1end; i++) {
+    const auto &normed_ids =
+        getUnicharset().normed_ids(word1.unichar_id(i));
+    if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
+      bigram_string.push_back(question_unichar_id_);
+    else
+      bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
+  }
+  bigram_string.push_back(UNICHAR_SPACE);
+  for (int i = w2start; i < w2end; i++) {
+    const auto &normed_ids =
+        getUnicharset().normed_ids(word2.unichar_id(i));
+    if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
+      bigram_string.push_back(question_unichar_id_);
+    else
+      bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
+  }
+  // Build a WERD_CHOICE from the normalized unichar sequence for the lookup.
+  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
+  for (int i = 0; i < bigram_string.size(); ++i) {
+    normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f,
+                                                      0.0f);
+  }
+  return bigram_dawg_->word_in_dawg(normalized_word);
+}
+
+// Returns true if the word, with each run of alphanumeric unichars collapsed
+// to a single Dawg::kPatternUnicharID, is found in one of the loaded
+// punctuation dawgs. Returns false for an empty word, or if any unichar is
+// neither punctuation, alpha, nor digit.
+bool Dict::valid_punctuation(const WERD_CHOICE& word) {
+  // Fix: this function returns bool; the original returned the PermuterType
+  // constant NO_PERM here, relying on its implicit conversion to false.
+  if (word.length() == 0) return false;
+  int i;
+  WERD_CHOICE new_word(word.unicharset());
+  int last_index = word.length() - 1;
+  int new_len = 0;
+  for (i = 0; i <= last_index; ++i) {
+    UNICHAR_ID unichar_id = (word.unichar_id(i));
+    if (getUnicharset().get_ispunctuation(unichar_id)) {
+      new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
+    } else if (!getUnicharset().get_isalpha(unichar_id) &&
+               !getUnicharset().get_isdigit(unichar_id)) {
+      return false; // neither punc, nor alpha, nor digit
+    } else if ((new_len = new_word.length()) == 0 ||
+               new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
+      // Collapse a run of alphanumerics into a single pattern unichar.
+      new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
+    }
+  }
+  // Accept if any punctuation dawg contains the collapsed pattern word.
+  for (i = 0; i < dawgs_.size(); ++i) {
+    if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
+        dawgs_[i]->word_in_dawg(new_word))
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if the language is space-delimited, i.e. its unicharset
+/// contains no Han, Katakana, or Thai script (non-positive script ids).
+bool Dict::IsSpaceDelimitedLang() const {
+  const UNICHARSET& charset = getUnicharset();
+  return charset.han_sid() <= 0 && charset.katakana_sid() <= 0 &&
+         charset.thai_sid() <= 0;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/dict.h b/tesseract/src/dict/dict.h
new file mode 100644
index 00000000..e8ec2e37
--- /dev/null
+++ b/tesseract/src/dict/dict.h
@@ -0,0 +1,651 @@
+///////////////////////////////////////////////////////////////////////
+// File: dict.h
+// Description: dict class.
+// Author: Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_DICT_DICT_H_
+#define TESSERACT_DICT_DICT_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+
+#ifndef DISABLED_LEGACY_ENGINE
+#include "ambigs.h"
+#endif
+#include "dawg.h"
+#include "dawg_cache.h"
+#include "ratngs.h"
+#include "stopper.h"
+#include "trie.h"
+#include "unicharset.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "params_training_featdef.h"
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+namespace tesseract {
+
+// Forward declarations (defined elsewhere in ccstruct) to avoid includes.
+class MATRIX;
+class WERD_RES;
+
+#define CHARS_PER_LINE 500              // line buffer size -- TODO confirm use
+#define MAX_WERD_LENGTH (int64_t) 128   // max unichars per word -- TODO confirm
+#define NO_RATING -1                    // sentinel for an absent rating
+
+/** Struct used to hold temporary information about fragments. */
+struct CHAR_FRAGMENT_INFO {
+ UNICHAR_ID unichar_id;
+ const CHAR_FRAGMENT *fragment;
+ int num_fragments;
+ float rating;
+ float certainty;
+};
+
+using DawgVector = GenericVector<Dawg *>;
+
+//
+// Constants
+//
+static const int kRatingPad = 4;  // pad around ratings in Dict::adjust_word
+static const int kDictMaxWildcards = 2; // max wildcards for a word
+// TODO(daria): If hyphens are different in different languages and can be
+// inferred from training data we should load their values dynamically.
+static const char kHyphenSymbol[] = "-";
+static const char kSlashSymbol[] = "/";
+static const char kQuestionSymbol[] = "?";
+static const char kApostropheSymbol[] = "'";
+static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
+static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
+static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
+// Repetition limit used by Dict::add_document_word to reject words made of
+// kDocDictMaxRepChars (or more) identical consecutive unichars.
+static const int kDocDictMaxRepChars = 4;
+
+// Enum for describing whether the x-height for the word is consistent:
+// 0 - everything is good.
+// 1 - there are one or two secondary (but consistent) baselines
+//     [think subscript and superscript], or there is an oversized
+//     first character.
+// 2 - the word is inconsistent.
+enum XHeightConsistencyEnum {XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT};
+
+// State threaded through dawg traversal (see def_letter_is_okay): the dawg
+// positions active before the current letter, the positions still active
+// after it, and the best permuter code found so far.
+struct DawgArgs {
+  DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
+      : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
+
+  DawgPositionVector *active_dawgs;   // positions valid before current letter
+  DawgPositionVector *updated_dawgs;  // filled with positions after the letter
+  PermuterType permuter;              // best permuter code seen so far
+  // True if the current position is a valid word end.
+  bool valid_end;
+};
+
+class TESS_API Dict {
+ public:
+ Dict(CCUtil* image_ptr);
+ ~Dict();
+ const CCUtil* getCCUtil() const {
+ return ccutil_;
+ }
+ CCUtil* getCCUtil() {
+ return ccutil_;
+ }
+ const UNICHARSET& getUnicharset() const {
+ return getCCUtil()->unicharset;
+ }
+ UNICHARSET& getUnicharset() {
+ return getCCUtil()->unicharset;
+ }
+#ifndef DISABLED_LEGACY_ENGINE
+ const UnicharAmbigs &getUnicharAmbigs() const {
+ return getCCUtil()->unichar_ambigs;
+ }
+#endif
+ // Returns true if unichar_id is a word compounding character like - or /.
+ inline bool compound_marker(UNICHAR_ID unichar_id) {
+ const UNICHARSET& unicharset = getUnicharset();
+ ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+ const auto &normed_ids =
+ unicharset.normed_ids(unichar_id);
+ return normed_ids.size() == 1 &&
+ (normed_ids[0] == hyphen_unichar_id_ ||
+ normed_ids[0] == slash_unichar_id_);
+ }
+ // Returns true if unichar_id is an apostrophe-like character that may
+ // separate prefix/suffix words from a main body word.
+ inline bool is_apostrophe(UNICHAR_ID unichar_id) {
+ const UNICHARSET& unicharset = getUnicharset();
+ ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+ const auto &normed_ids =
+ unicharset.normed_ids(unichar_id);
+ return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
+ }
+
+ /* hyphen.cpp ************************************************************/
+
+ /// Returns true if we've recorded the beginning of a hyphenated word.
+ inline bool hyphenated() const { return
+ !last_word_on_line_ && hyphen_word_;
+ }
+ /// Size of the base word (the part on the line before) of a hyphenated word.
+ inline int hyphen_base_size() const {
+ return this->hyphenated() ? hyphen_word_->length() : 0;
+ }
+ /// If this word is hyphenated copy the base word (the part on
+ /// the line before) of a hyphenated word into the given word.
+ /// This function assumes that word is not nullptr.
+ inline void copy_hyphen_info(WERD_CHOICE *word) const {
+ if (this->hyphenated()) {
+ *word = *hyphen_word_;
+ if (hyphen_debug_level) word->print("copy_hyphen_info: ");
+ }
+ }
+ /// Check whether the word has a hyphen at the end.
+ inline bool has_hyphen_end(const UNICHARSET* unicharset,
+ UNICHAR_ID unichar_id, bool first_pos) const {
+ if (!last_word_on_line_ || first_pos)
+ return false;
+ ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
+ const auto &normed_ids =
+ unicharset->normed_ids(unichar_id);
+ return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
+ }
+ /// Same as above, but check the unichar at the end of the word.
+ inline bool has_hyphen_end(const WERD_CHOICE &word) const {
+ int word_index = word.length() - 1;
+ return has_hyphen_end(word.unicharset(), word.unichar_id(word_index),
+ word_index == 0);
+ }
+ /// Unless the previous word was the last one on the line, and the current
+ /// one is not (thus it is the first one on the line), erase hyphen_word_,
+ /// clear hyphen_active_dawgs_, update last_word_on_line_.
+ void reset_hyphen_vars(bool last_word_on_line);
+ /// Update hyphen_word_, and copy the given DawgPositionVectors into
+ /// hyphen_active_dawgs_ .
+ void set_hyphen_word(const WERD_CHOICE &word,
+ const DawgPositionVector &active_dawgs);
+
+ /* permdawg.cpp ************************************************************/
+ // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
+ // When this function is refactored, permdawg.cpp can be removed.
+
+ /// Copies word into best_choice if its rating is smaller
+ /// than that of best_choice.
+ inline void update_best_choice(const WERD_CHOICE &word,
+ WERD_CHOICE *best_choice) {
+ if (word.rating() < best_choice->rating()) {
+ *best_choice = word;
+ }
+ }
+ /// Fill the given active_dawgs vector with dawgs that could contain the
+ /// beginning of the word. If hyphenated() returns true, copy the entries
+ /// from hyphen_active_dawgs_ instead.
+ void init_active_dawgs(DawgPositionVector *active_dawgs,
+ bool ambigs_mode) const;
+ // Fill the given vector with the default collection of any-length dawgs
+ void default_dawgs(DawgPositionVector *anylength_dawgs,
+ bool suppress_patterns) const;
+
+
+ /// Recursively explore all the possible character combinations in
+ /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
+ /// dawgs in the dawgs_ vector in parallel and discard invalid words.
+ ///
+ /// Allocate and return a WERD_CHOICE with the best valid word found.
+ WERD_CHOICE *dawg_permute_and_select(
+ const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
+ /// If the choice being composed so far could be a dictionary word
+ /// and we have not reached the end of the word keep exploring the
+ /// char_choices further.
+ void go_deeper_dawg_fxn(
+ const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ bool word_ending, WERD_CHOICE *word, float certainties[],
+ float *limit, WERD_CHOICE *best_choice, int *attempts_left,
+ void *void_more_args);
+
+ /// Pointer to go_deeper function.
+ void (Dict::*go_deeper_fxn_)(const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ bool word_ending, WERD_CHOICE *word,
+ float certainties[], float *limit,
+ WERD_CHOICE *best_choice, int *attempts_left,
+ void *void_more_args);
+ //
+ // Helper functions for dawg_permute_and_select().
+ //
+ void permute_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args);
+
+ void append_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ const BLOB_CHOICE &blob_choice,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args);
+
+ bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
+ float curr_rating, float curr_certainty,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ const char *debug, int word_ending,
+ CHAR_FRAGMENT_INFO *char_frag_info);
+
+ /* stopper.cpp *************************************************************/
+#if !defined(DISABLED_LEGACY_ENGINE)
+ bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
+ DANGERR *fixpt,
+ bool fix_replaceable,
+ MATRIX* ratings);
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+ // Replaces the corresponding wrong ngram in werd_choice with the correct
+ // one. The whole correct n-gram is inserted into the ratings matrix and
+ // the werd_choice: no more fragments!. Rating and certainty of new entries
+ // in matrix and werd_choice are the sum and mean of the wrong ngram
+ // respectively.
+ // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
+ // mystring", with a new entry in the ratings matrix for ".
+ void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
+ UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
+ MATRIX *ratings);
+
+ /// Returns the length of the shortest alpha run in WordChoice.
+ int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
+ /// Returns true if the certainty of the BestChoice word is within a
+ /// reasonable range of the average certainties for the best choices for
+ /// each character in the segmentation. This test is used to catch words
+ /// in which one character is much worse than the other characters in the
+ /// word (i.e. false will be returned in that case). The algorithm computes
+ /// the mean and std deviation of the certainties in the word with the worst
+ /// certainty thrown out.
+ int UniformCertainties(const WERD_CHOICE& word);
+ /// Returns true if the given best_choice is good enough to stop.
+ bool AcceptableChoice(const WERD_CHOICE& best_choice,
+ XHeightConsistencyEnum xheight_consistency);
+ /// Returns false if the best choice for the current word is questionable
+ /// and should be tried again on the second pass or should be flagged to
+ /// the user.
+ bool AcceptableResult(WERD_RES *word) const;
+#if !defined(DISABLED_LEGACY_ENGINE)
+ void EndDangerousAmbigs();
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+ /// Prints the current choices for this word to stdout.
+ void DebugWordChoices();
+ /// Sets up stopper variables in preparation for the first pass.
+ void SettupStopperPass1();
+ /// Sets up stopper variables in preparation for the second pass.
+ void SettupStopperPass2();
+ /* context.cpp *************************************************************/
+ /// Check a string to see if it matches a set of lexical rules.
+ int case_ok(const WERD_CHOICE& word) const;
+ /// Returns true if the word looks like an absolute garbage
+ /// (e.g. image mistakenly recognized as text).
+ bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
+
+ /* dict.cpp ****************************************************************/
+
+ /// Initialize Dict class - load dawgs from [lang].traineddata and
+  /// user-specified wordlist and pattern list.
+ static DawgCache *GlobalDawgCache();
+ // Sets up ready for a Load or LoadLSTM.
+ void SetupForLoad(DawgCache *dawg_cache);
+ // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
+ void Load(const STRING &lang, TessdataManager *data_file);
+ // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
+ void LoadLSTM(const STRING &lang, TessdataManager *data_file);
+ // Completes the loading process after Load() and/or LoadLSTM().
+ // Returns false if no dictionaries were loaded.
+ bool FinishLoad();
+ void End();
+
+ // Resets the document dictionary analogous to ResetAdaptiveClassifier.
+ void ResetDocumentDictionary() {
+ if (pending_words_ != nullptr)
+ pending_words_->clear();
+ if (document_words_ != nullptr)
+ document_words_->clear();
+ }
+
+ /**
+ * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
+ * of the current state the letter at word_index in the given word
+ * is allowed according to at least one of the dawgs in dawgs_,
+ * otherwise returns NO_PERM.
+ *
+ * The state is described by void_dawg_args, which are interpreted as
+ * DawgArgs and contain relevant active dawg positions.
+ * Each entry in the active_dawgs vector contains an index
+ * into the dawgs_ vector and an EDGE_REF that indicates the last edge
+ * followed in the dawg. It also may contain a position in the punctuation
+ * dawg which describes surrounding punctuation (see struct DawgPosition).
+ *
+ * Input:
+ * At word_index 0 dawg_args->active_dawgs should contain an entry for each
+ * dawg that may start at the beginning of a word, with punc_ref and edge_ref
+ * initialized to NO_EDGE. Since the punctuation dawg includes the empty
+ * pattern " " (meaning anything without surrounding punctuation), having a
+ * single entry for the punctuation dawg will cover all dawgs reachable
+ * therefrom -- that includes all number and word dawgs. The only dawg
+ * non-reachable from the punctuation_dawg is the pattern dawg.
+ * If hyphen state needs to be applied, initial dawg_args->active_dawgs can
+ * be copied from the saved hyphen state (maintained by Dict).
+ * For word_index > 0 the corresponding state (active_dawgs and punc position)
+ * can be obtained from dawg_args->updated_dawgs passed to
+ * def_letter_is_okay for word_index-1.
+ * Note: the function assumes that active_dawgs, and updated_dawgs
+ * member variables of dawg_args are not nullptr.
+ *
+ * Output:
+ * The function fills in dawg_args->updated_dawgs vector with the
+ * entries for dawgs that contain the word up to the letter at word_index.
+ *
+ */
+
+ //
+ int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const;
+
+ int (Dict::*letter_is_okay_)(void* void_dawg_args,
+ const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const;
+ /// Calls letter_is_okay_ member function.
+ int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
+ UNICHAR_ID unichar_id, bool word_end) const {
+ return (this->*letter_is_okay_)(void_dawg_args,
+ unicharset, unichar_id, word_end);
+ }
+
+
+ /// Probability in context function used by the ngram permuter.
+ double (Dict::*probability_in_context_)(const char* lang,
+ const char* context,
+ int context_bytes,
+ const char* character,
+ int character_bytes);
+ /// Calls probability_in_context_ member function.
+ double ProbabilityInContext(const char* context,
+ int context_bytes,
+ const char* character,
+ int character_bytes) {
+ return (this->*probability_in_context_)(
+ getCCUtil()->lang.c_str(),
+ context, context_bytes,
+ character, character_bytes);
+ }
+
+ /// Default (no-op) implementation of probability in context function.
+ double def_probability_in_context(
+ const char* lang, const char* context, int context_bytes,
+ const char* character, int character_bytes) {
+ (void)lang;
+ (void)context;
+ (void)context_bytes;
+ (void)character;
+ (void)character_bytes;
+ return 0.0;
+ }
+
+ inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
+ inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; }
+ /// Return the number of dawgs in the dawgs_ vector.
+ inline int NumDawgs() const { return dawgs_.size(); }
+ /// Return i-th dawg pointer recorded in the dawgs_ vector.
+ inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
+ /// Return the points to the punctuation dawg.
+ inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
+ /// Return the points to the unambiguous words dawg.
+ inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
+ /// Returns the appropriate next node given the EDGE_REF.
+ static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
+ if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
+ NODE_REF node = dawg->next_node(edge_ref);
+ if (node == 0) node = NO_EDGE; // end of word
+ return node;
+ }
+
+ // Given a unichar from a string and a given dawg, return the unichar
+ // we should use to match in that dawg type. (for example, in the number
+ // dawg, all numbers are transformed to kPatternUnicharId).
+ UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch,
+ const Dawg *dawg) const {
+ if (!dawg) return ch;
+ switch (dawg->type()) {
+ case DAWG_TYPE_NUMBER:
+ return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
+ default:
+ return ch;
+ }
+ }
+
+ /// For each of the character classes of the given unichar_id (and the
+ /// unichar_id itself) finds the corresponding outgoing node or self-loop
+ /// in the given dawg and (after checking that it is valid) records it in
+  /// dawg_args->updated_dawgs. Updates current_permuter if any valid
+ /// edges were found.
+ void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
+ UNICHAR_ID unichar_id, bool word_end,
+ DawgArgs *dawg_args,
+ PermuterType *current_permuter) const;
+
+ /// Read/Write/Access special purpose dawgs which contain words
+ /// only of a certain length (used for phrase search for
+ /// non-space-delimited languages).
+
+ /// Check all the DAWGs to see if this word is in any of them.
+ inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
+ return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
+ perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
+ perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
+ (numbers_ok && perm == NUMBER_PERM));
+ }
+ int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
+ int valid_word(const WERD_CHOICE &word) const {
+ return valid_word(word, false); // return NO_PERM for words with digits
+ }
+ int valid_word_or_number(const WERD_CHOICE &word) const {
+ return valid_word(word, true); // return NUMBER_PERM for valid numbers
+ }
+ /// This function is used by api/tesseract_cube_combiner.cpp
+ int valid_word(const char *string) const {
+ WERD_CHOICE word(string, getUnicharset());
+ return valid_word(word);
+ }
+ // Do the two WERD_CHOICEs form a meaningful bigram?
+ bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
+ /// Returns true if the word contains a valid punctuation pattern.
+  /// Note: Since the domains of punctuation symbols and symbols
+ /// used in numbers are not disjoint, a valid number might contain
+ /// an invalid punctuation pattern (e.g. .99).
+ bool valid_punctuation(const WERD_CHOICE &word);
+ /// Returns true if a good answer is found for the unknown blob rating.
+ int good_choice(const WERD_CHOICE &choice);
+ /// Adds a word found on this document to the document specific dictionary.
+ void add_document_word(const WERD_CHOICE &best_choice);
+ /// Adjusts the rating of the given word.
+ void adjust_word(WERD_CHOICE *word,
+ bool nonword, XHeightConsistencyEnum xheight_consistency,
+ float additional_adjust,
+ bool modify_rating,
+ bool debug);
+ /// Set wordseg_rating_adjust_factor_ to the given value.
+ inline void SetWordsegRatingAdjustFactor(float f) {
+ wordseg_rating_adjust_factor_ = f;
+ }
+  /// Returns true if the language is space-delimited
+  /// (not Chinese, Japanese, or Thai).
+ bool IsSpaceDelimitedLang() const;
+
+ private:
+ /** Private member variables. */
+ CCUtil* ccutil_;
+ /**
+ * Table that stores ambiguities computed during training
+ * (loaded when NoDangerousAmbigs() is called for the first time).
+   * Each entry i in the table stores a set of ambiguities whose
+ * wrong ngram starts with unichar id i.
+ */
+#ifndef DISABLED_LEGACY_ENGINE
+ UnicharAmbigs* dang_ambigs_table_ = nullptr;
+ /** Same as above, but for ambiguities with replace flag set. */
+ UnicharAmbigs* replace_ambigs_table_ = nullptr;
+#endif
+ /** Additional certainty padding allowed before a word is rejected. */
+ float reject_offset_;
+ // Cached UNICHAR_IDs:
+ UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
+ UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
+ UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
+ UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
+ UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
+ // Hyphen-related variables.
+ WERD_CHOICE *hyphen_word_;
+ DawgPositionVector hyphen_active_dawgs_;
+ bool last_word_on_line_;
+ // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
+ // matching. The first member of each list is taken as canonical. For
+ // example, the first list contains hyphens and dashes with the first symbol
+ // being the ASCII hyphen minus.
+ std::vector<GenericVector<UNICHAR_ID> > equivalent_symbols_;
+ // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
+ DawgCache *dawg_cache_;
+ bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
+ // Dawgs.
+ DawgVector dawgs_;
+ SuccessorListsVector successors_;
+ Trie *pending_words_;
+ /// The following pointers are only cached for convenience.
+ /// The dawgs will be deleted when dawgs_ vector is destroyed.
+ // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
+ // any of them are present on the best choices list for a word pair.
+ // the bigrams are stored as space-separated words where:
+ // (1) leading and trailing punctuation has been removed from each word and
+ // (2) any digits have been replaced with '?' marks.
+ Dawg *bigram_dawg_;
+ // TODO(daria): need to support multiple languages in the future,
+ // so maybe will need to maintain a list of dawgs of each kind.
+ Dawg *freq_dawg_;
+ Dawg *unambig_dawg_;
+ Dawg *punc_dawg_;
+ Trie *document_words_;
+ /// Current segmentation cost adjust factor for word rating.
+ /// See comments in incorporate_segcost.
+ float wordseg_rating_adjust_factor_;
+ // File for recording ambiguities discovered during dictionary search.
+ FILE *output_ambig_words_file_;
+
+ public:
+ /// Variable members.
+ /// These have to be declared and initialized after image_ptr_, which contains
+ /// the pointer to the params vector - the member of its base CCUtil class.
+ STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
+ STRING_VAR_H(user_words_suffix, "",
+ "A suffix of user-provided words located in tessdata.");
+ STRING_VAR_H(user_patterns_file, "",
+ "A filename of user-provided patterns.");
+ STRING_VAR_H(user_patterns_suffix, "",
+ "A suffix of user-provided patterns located in tessdata.");
+ BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
+ BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
+ BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
+ BOOL_VAR_H(load_punc_dawg, true,
+ "Load dawg with punctuation patterns.");
+ BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
+ BOOL_VAR_H(load_bigram_dawg, true,
+ "Load dawg with special word bigrams.");
+ double_VAR_H(xheight_penalty_subscripts, 0.125,
+ "Score penalty (0.1 = 10%) added if there are subscripts "
+ "or superscripts in a word, but it is otherwise OK.");
+ double_VAR_H(xheight_penalty_inconsistent, 0.25,
+ "Score penalty (0.1 = 10%) added if an xheight is "
+ "inconsistent.");
+ double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
+ "Score multiplier for word matches which have good case and"
+ "are frequent in the given language (lower is better).");
+
+ double_VAR_H(segment_penalty_dict_case_ok, 1.1,
+ "Score multiplier for word matches that have good case "
+ "(lower is better).");
+
+ double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
+ "Default score multiplier for word matches, which may have "
+ "case issues (lower is better).");
+
+ double_VAR_H(segment_penalty_dict_nonword, 1.25,
+ "Score multiplier for glyph fragment segmentations which "
+ "do not match a dictionary word (lower is better).");
+
+ double_VAR_H(segment_penalty_garbage, 1.50,
+ "Score multiplier for poorly cased strings that are not in"
+ " the dictionary and generally look like garbage (lower is"
+ " better).");
+ STRING_VAR_H(output_ambig_words_file, "",
+ "Output file for ambiguities found in the dictionary");
+ INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
+ ", to 2 for more details, to 3 to see all the debug messages");
+ INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
+ BOOL_VAR_H(use_only_first_uft8_step, false,
+ "Use only the first UTF8 step of the given string"
+ " when computing log probabilities.");
+ double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
+ double_VAR_H(stopper_nondict_certainty_base, -2.50,
+ "Certainty threshold for non-dict words");
+ double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
+ "Reject certainty offset");
+ INT_VAR_H(stopper_smallword_size, 2,
+ "Size of dict word to be treated as non-dict word");
+ double_VAR_H(stopper_certainty_per_char, -0.50,
+ "Certainty to add for each dict char above small word size.");
+ double_VAR_H(stopper_allowable_character_badness, 3.0,
+ "Max certaintly variation allowed in a word (in sigma)");
+ INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
+ BOOL_VAR_H(stopper_no_acceptable_choices, false,
+ "Make AcceptableChoice() always return false. Useful"
+ " when there is a need to explore all segmentations");
+ INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
+ STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
+ " should be printed to stdout");
+ BOOL_VAR_H(segment_nonalphabetic_script, false,
+ "Don't use any alphabetic-specific tricks."
+ "Set to true in the traineddata config file for"
+ " scripts that are cursive or inherently fixed-pitch");
+ BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
+ double_VAR_H(doc_dict_pending_threshold, 0.0,
+ "Worst certainty for using pending dictionary");
+ double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
+ " for words that can be inserted into the document dictionary");
+ INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
+ " character choices to consider during permutation."
+ " This limit is especially useful when user patterns"
+ " are specified, since overly generic patterns can result in"
+ " dawg search exploring an overly large number of options.");
+};
+
+} // namespace tesseract
+
+#endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_
diff --git a/tesseract/src/dict/hyphen.cpp b/tesseract/src/dict/hyphen.cpp
new file mode 100644
index 00000000..7a0a622c
--- /dev/null
+++ b/tesseract/src/dict/hyphen.cpp
@@ -0,0 +1,61 @@
+/******************************************************************************
+ * File: hyphen.cpp (Formerly hyphen.c)
+ * Description: Functions for maintaining information about hyphenated words.
+ * Author: Mark Seaman, OCR Technology
+ * Status: Reusable Software Component
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+
+#include "dict.h"
+
+namespace tesseract {
+
+// Unless the previous word was the last one on the line, and the current
+// one is not (thus it is the first one on the line), erase hyphen_word_,
+// clear hyphen_active_dawgs_ and hyphen_constraints_, and update last_word_on_line_.
+void Dict::reset_hyphen_vars(bool last_word_on_line) {
+ if (!(last_word_on_line_ == true && last_word_on_line == false)) {
+ if (hyphen_word_ != nullptr) {
+ delete hyphen_word_;
+ hyphen_word_ = nullptr;
+ hyphen_active_dawgs_.clear();
+ }
+ }
+ if (hyphen_debug_level) {
+ tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n",
+ last_word_on_line_, last_word_on_line);
+ }
+ last_word_on_line_ = last_word_on_line;
+}
+
+// Update hyphen_word_, and copy the given DawgPositionVectors into
+// hyphen_active_dawgs_.
+void Dict::set_hyphen_word(const WERD_CHOICE &word,
+ const DawgPositionVector &active_dawgs) {
+ if (hyphen_word_ == nullptr) {
+ hyphen_word_ = new WERD_CHOICE(word.unicharset());
+ hyphen_word_->make_bad();
+ }
+ if (hyphen_word_->rating() > word.rating()) {
+ *hyphen_word_ = word;
+ // Remove the last unichar id as it is a hyphen, and remove
+ // any unichar_string/lengths that are present.
+ hyphen_word_->remove_last_unichar_id();
+ hyphen_active_dawgs_ = active_dawgs;
+ }
+ if (hyphen_debug_level) {
+ hyphen_word_->print("set_hyphen_word: ");
+ }
+}
+} // namespace tesseract
diff --git a/tesseract/src/dict/matchdefs.h b/tesseract/src/dict/matchdefs.h
new file mode 100644
index 00000000..b95dbc2b
--- /dev/null
+++ b/tesseract/src/dict/matchdefs.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ ** Filename: matchdefs.h
+ ** Purpose: Generic interface definitions for feature matchers.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#ifndef MATCHDEFS_H
+#define MATCHDEFS_H
+
+#include <tesseract/unichar.h>
+
+#include <climits> // INT16_MAX
+#include <cstdint> // int16_t
+
+namespace tesseract {
+
+/* define the maximum number of classes defined for any matcher
+ and the maximum class id for any matcher. This must be changed
+ if more different classes need to be classified */
+#define MAX_NUM_CLASSES INT16_MAX
+
+/** a CLASS_ID is the ascii character to be associated with a class */
+using CLASS_ID = UNICHAR_ID;
+#define NO_CLASS (0)
+
+/** a PROTO_ID is the index of a prototype within its class. Valid proto
+ id's are 0 to N-1 where N is the number of prototypes that make up the
+ class. */
+using PROTO_ID = int16_t;
+#define NO_PROTO (-1)
+
+/** FEATURE_ID is the index of a feature within a character description
+ The feature id ranges from 0 to N-1 where N is the number
+ of features in a character description. */
+using FEATURE_ID = uint8_t;
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/dict/permdawg.cpp b/tesseract/src/dict/permdawg.cpp
new file mode 100644
index 00000000..33f6b697
--- /dev/null
+++ b/tesseract/src/dict/permdawg.cpp
@@ -0,0 +1,390 @@
+/******************************************************************************
+ *
+ * File: permdawg.cpp (Formerly permdawg.c)
+ * Description: Scale word choices by a dictionary
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include "dawg.h"
+#include "stopper.h"
+#include "tprintf.h"
+#include "params.h"
+
+#include <algorithm>
+#include <cctype>
+#include "dict.h"
+
+/*----------------------------------------------------------------------
+ F u n c t i o n s
+----------------------------------------------------------------------*/
+namespace tesseract {
+
+/**
+ * @name go_deeper_dawg_fxn
+ *
+ * If the choice being composed so far could be a dictionary word
+ * keep exploring choices.
+ */
+void Dict::go_deeper_dawg_fxn(
+ const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
+ WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
+ auto *more_args = static_cast<DawgArgs *>(void_more_args);
+ word_ending = (char_choice_index == char_choices.size()-1);
+ int word_index = word->length() - 1;
+ if (best_choice->rating() < *limit) return;
+ // Look up char in DAWG
+
+ // If the current unichar is an ngram first try calling
+ // letter_is_okay() for each unigram it contains separately.
+ UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
+ bool checked_unigrams = false;
+ if (getUnicharset().get_isngram(orig_uch_id)) {
+ if (dawg_debug_level) {
+ tprintf("checking unigrams in an ngram %s\n",
+ getUnicharset().debug_str(orig_uch_id).c_str());
+ }
+ int num_unigrams = 0;
+ word->remove_last_unichar_id();
+ std::vector<UNICHAR_ID> encoding;
+ const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
+ // Since the string came out of the unicharset, failure is impossible.
+ ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
+ nullptr));
+ bool unigrams_ok = true;
+ // Construct DawgArgs that reflect the current state.
+ DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
+ DawgPositionVector unigram_updated_dawgs;
+ DawgArgs unigram_dawg_args(&unigram_active_dawgs,
+ &unigram_updated_dawgs,
+ more_args->permuter);
+ // Check unigrams in the ngram with letter_is_okay().
+ for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
+ UNICHAR_ID uch_id = encoding[i];
+ ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
+ ++num_unigrams;
+ word->append_unichar_id(uch_id, 1, 0.0, 0.0);
+ unigrams_ok = (this->*letter_is_okay_)(
+ &unigram_dawg_args, *word->unicharset(),
+ word->unichar_id(word_index+num_unigrams-1),
+ word_ending && i == encoding.size() - 1);
+ (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
+ if (dawg_debug_level) {
+ tprintf("unigram %s is %s\n",
+ getUnicharset().debug_str(uch_id).c_str(),
+ unigrams_ok ? "OK" : "not OK");
+ }
+ }
+ // Restore the word and copy the updated dawg state if needed.
+ while (num_unigrams-- > 0) word->remove_last_unichar_id();
+ word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
+ if (unigrams_ok) {
+ checked_unigrams = true;
+ more_args->permuter = unigram_dawg_args.permuter;
+ *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
+ }
+ }
+
+ // Check which dawgs from the dawgs_ vector contain the word
+ // up to and including the current unichar.
+ if (checked_unigrams || (this->*letter_is_okay_)(
+ more_args, *word->unicharset(), word->unichar_id(word_index),
+ word_ending)) {
+ // Add a new word choice
+ if (word_ending) {
+ if (dawg_debug_level) {
+ tprintf("found word = %s\n", word->debug_string().c_str());
+ }
+ if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
+ if (output_ambig_words_file_ == nullptr) {
+ output_ambig_words_file_ =
+ fopen(output_ambig_words_file.c_str(), "wb+");
+ if (output_ambig_words_file_ == nullptr) {
+ tprintf("Failed to open output_ambig_words_file %s\n",
+ output_ambig_words_file.c_str());
+ exit(1);
+ }
+ STRING word_str;
+ word->string_and_lengths(&word_str, nullptr);
+ word_str += " ";
+ fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+ }
+ STRING word_str;
+ word->string_and_lengths(&word_str, nullptr);
+ word_str += " ";
+ fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+ }
+ WERD_CHOICE *adjusted_word = word;
+ adjusted_word->set_permuter(more_args->permuter);
+ update_best_choice(*adjusted_word, best_choice);
+ } else { // search the next letter
+ // Make updated_* point to the next entries in the DawgPositionVector
+ // arrays (that were originally created in dawg_permute_and_select)
+ ++(more_args->updated_dawgs);
+ // Make active_dawgs and constraints point to the updated ones.
+ ++(more_args->active_dawgs);
+ permute_choices(debug, char_choices, char_choice_index + 1,
+ prev_char_frag_info, word, certainties, limit,
+ best_choice, attempts_left, more_args);
+ // Restore previous state to explore another letter in this position.
+ --(more_args->updated_dawgs);
+ --(more_args->active_dawgs);
+ }
+ } else {
+ if (dawg_debug_level) {
+ tprintf("last unichar not OK at index %d in %s\n",
+ word_index, word->debug_string().c_str());
+ }
+ }
+}
+
+
+/**
+ * dawg_permute_and_select
+ *
+ * Recursively explore all the possible character combinations in
+ * the given char_choices. Use go_deeper_dawg_fxn() to search all the
+ * dawgs in the dawgs_ vector in parallel and discard invalid words.
+ *
+ * Allocate and return a WERD_CHOICE with the best valid word found.
+ */
+WERD_CHOICE *Dict::dawg_permute_and_select(
+ const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
+ auto *best_choice = new WERD_CHOICE(&getUnicharset());
+ best_choice->make_bad();
+ best_choice->set_rating(rating_limit);
+ if (char_choices.size() == 0 || char_choices.size() > MAX_WERD_LENGTH)
+ return best_choice;
+ auto *active_dawgs =
+ new DawgPositionVector[char_choices.size() + 1];
+ init_active_dawgs(&(active_dawgs[0]), true);
+ DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
+ WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
+
+ float certainties[MAX_WERD_LENGTH];
+ this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
+ int attempts_left = max_permuter_attempts;
+ permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
+ char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
+ &attempts_left, &dawg_args);
+ delete[] active_dawgs;
+ return best_choice;
+}
+
+/**
+ * permute_choices
+ *
+ * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
+ * with the given char_choice_index in char_choices.
+ */
+void Dict::permute_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args) {
+ if (debug) {
+ tprintf("%s permute_choices: char_choice_index=%d"
+ " limit=%g rating=%g, certainty=%g word=%s\n",
+ debug, char_choice_index, *limit, word->rating(),
+ word->certainty(), word->debug_string().c_str());
+ }
+ if (char_choice_index < char_choices.size()) {
+ BLOB_CHOICE_IT blob_choice_it;
+ blob_choice_it.set_to_list(char_choices.get(char_choice_index));
+ for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
+ blob_choice_it.forward()) {
+ (*attempts_left)--;
+ append_choices(debug, char_choices, *(blob_choice_it.data()),
+ char_choice_index, prev_char_frag_info, word,
+ certainties, limit, best_choice, attempts_left, more_args);
+ if (*attempts_left <= 0) {
+ if (debug) tprintf("permute_choices(): attempts_left is 0\n");
+ break;
+ }
+ }
+ }
+}
+
+/**
+ * append_choices
+ *
+ * Checks to see whether or not the next choice is worth appending to
+ * the word being generated. If so then keeps going deeper into the word.
+ *
+ * This function assumes that Dict::go_deeper_fxn_ is set.
+ */
+void Dict::append_choices(
+ const char *debug,
+ const BLOB_CHOICE_LIST_VECTOR &char_choices,
+ const BLOB_CHOICE &blob_choice,
+ int char_choice_index,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ WERD_CHOICE *word,
+ float certainties[],
+ float *limit,
+ WERD_CHOICE *best_choice,
+ int *attempts_left,
+ void *more_args) {
+ int word_ending = (char_choice_index == char_choices.size() - 1);
+
+ // Deal with fragments.
+ CHAR_FRAGMENT_INFO char_frag_info;
+ if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
+ blob_choice.certainty(), prev_char_frag_info, debug,
+ word_ending, &char_frag_info)) {
+ return; // blob_choice must be an invalid fragment
+ }
+ // Search the next letter if this character is a fragment.
+ if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
+ permute_choices(debug, char_choices, char_choice_index + 1,
+ &char_frag_info, word, certainties, limit,
+ best_choice, attempts_left, more_args);
+ return;
+ }
+
+ // Add the next unichar.
+ float old_rating = word->rating();
+ float old_certainty = word->certainty();
+ uint8_t old_permuter = word->permuter();
+ certainties[word->length()] = char_frag_info.certainty;
+ word->append_unichar_id_space_allocated(
+ char_frag_info.unichar_id, char_frag_info.num_fragments,
+ char_frag_info.rating, char_frag_info.certainty);
+
+ // Explore the next unichar.
+ (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
+ &char_frag_info, word_ending, word, certainties,
+ limit, best_choice, attempts_left, more_args);
+
+ // Remove the unichar we added to explore other choices in its place.
+ word->remove_last_unichar_id();
+ word->set_rating(old_rating);
+ word->set_certainty(old_certainty);
+ word->set_permuter(old_permuter);
+}
+
+/**
+ * @name fragment_state_okay
+ *
+ * Given the current char choice and information about previously seen
+ * fragments, determines whether adjacent character fragments are
+ * present and whether they can be concatenated.
+ *
+ * The given prev_char_frag_info contains:
+ * - fragment: if not nullptr contains information about immediately
+ * preceding fragmented character choice
+ * - num_fragments: number of fragments that have been used so far
+ * to construct a character
+ * - certainty: certainty of the current choice or minimum
+ * certainty of all fragments concatenated so far
+ * - rating: rating of the current choice or sum of fragment
+ * ratings concatenated so far
+ *
+ * The output char_frag_info is filled in as follows:
+ * - character: is set to be nullptr if the choice is a non-matching
+ * or non-ending fragment piece; is set to unichar of the given choice
+ * if it represents a regular character or a matching ending fragment
+ * - fragment,num_fragments,certainty,rating are set as described above
+ *
+ * @returns false if a non-matching fragment is discovered, true otherwise.
+ */
+bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
+ float curr_rating, float curr_certainty,
+ const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+ const char *debug, int word_ending,
+ CHAR_FRAGMENT_INFO *char_frag_info) {
+ const CHAR_FRAGMENT *this_fragment =
+ getUnicharset().get_fragment(curr_unichar_id);
+ const CHAR_FRAGMENT *prev_fragment =
+ prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
+
+ // Print debug info for fragments.
+ if (debug && (prev_fragment || this_fragment)) {
+ tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
+ getUnicharset().debug_str(curr_unichar_id).c_str(),
+ word_ending);
+ if (prev_fragment) {
+ tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
+ }
+ if (this_fragment) {
+ tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
+ }
+ }
+
+ char_frag_info->unichar_id = curr_unichar_id;
+ char_frag_info->fragment = this_fragment;
+ char_frag_info->rating = curr_rating;
+ char_frag_info->certainty = curr_certainty;
+ char_frag_info->num_fragments = 1;
+ if (prev_fragment && !this_fragment) {
+ if (debug) tprintf("Skip choice with incomplete fragment\n");
+ return false;
+ }
+ if (this_fragment) {
+ // We are dealing with a fragment.
+ char_frag_info->unichar_id = INVALID_UNICHAR_ID;
+ if (prev_fragment) {
+ if (!this_fragment->is_continuation_of(prev_fragment)) {
+ if (debug) tprintf("Non-matching fragment piece\n");
+ return false;
+ }
+ if (this_fragment->is_ending()) {
+ char_frag_info->unichar_id =
+ getUnicharset().unichar_to_id(this_fragment->get_unichar());
+ char_frag_info->fragment = nullptr;
+ if (debug) {
+ tprintf("Built character %s from fragments\n",
+ getUnicharset().debug_str(
+ char_frag_info->unichar_id).c_str());
+ }
+ } else {
+ if (debug) tprintf("Record fragment continuation\n");
+ char_frag_info->fragment = this_fragment;
+ }
+ // Update certainty and rating.
+ char_frag_info->rating =
+ prev_char_frag_info->rating + curr_rating;
+ char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
+ char_frag_info->certainty =
+ std::min(curr_certainty, prev_char_frag_info->certainty);
+ } else {
+ if (this_fragment->is_beginning()) {
+ if (debug) tprintf("Record fragment beginning\n");
+ } else {
+ if (debug) {
+ tprintf("Non-starting fragment piece with no prev_fragment\n");
+ }
+ return false;
+ }
+ }
+ }
+ if (word_ending && char_frag_info->fragment) {
+ if (debug) tprintf("Word can not end with a fragment\n");
+ return false;
+ }
+ return true;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/stopper.cpp b/tesseract/src/dict/stopper.cpp
new file mode 100644
index 00000000..36207d00
--- /dev/null
+++ b/tesseract/src/dict/stopper.cpp
@@ -0,0 +1,515 @@
+/******************************************************************************
+ ** Filename: stopper.c
+ ** Purpose: Stopping criteria for word classifier.
+ ** Author: Dan Johnson
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+
+#include <cstdio>
+#include <cstring>
+#include <cctype>
+#include <cmath>
+
+#include "stopper.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "ambigs.h"
+#endif
+#include "ccutil.h"
+#include "dict.h"
+#include "helpers.h"
+#include "matchdefs.h"
+#include "pageres.h"
+#include "params.h"
+#include "ratngs.h"
+#include <tesseract/unichar.h>
+
+/*----------------------------------------------------------------------------
+ Private Code
+----------------------------------------------------------------------------*/
+
+namespace tesseract {
+
+bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
+ XHeightConsistencyEnum xheight_consistency) {
+ float CertaintyThreshold = stopper_nondict_certainty_base;
+ int WordSize;
+
+ if (stopper_no_acceptable_choices) return false;
+
+ if (best_choice.length() == 0) return false;
+
+ bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
+ bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
+ bool is_case_ok = case_ok(best_choice);
+
+ if (stopper_debug_level >= 1) {
+ const char *xht = "UNKNOWN";
+ switch (xheight_consistency) {
+ case XH_GOOD: xht = "NORMAL"; break;
+ case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
+ case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
+ default: xht = "UNKNOWN";
+ }
+ tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
+ best_choice.unichar_string().c_str(),
+ (is_valid_word ? 'y' : 'n'),
+ (is_case_ok ? 'y' : 'n'),
+ xht,
+ best_choice.min_x_height(),
+ best_choice.max_x_height());
+ }
+ // Do not accept invalid words in PASS1.
+ if (reject_offset_ <= 0.0f && !is_valid_word) return false;
+ if (is_valid_word && is_case_ok) {
+ WordSize = LengthOfShortestAlphaRun(best_choice);
+ WordSize -= stopper_smallword_size;
+ if (WordSize < 0)
+ WordSize = 0;
+ CertaintyThreshold += WordSize * stopper_certainty_per_char;
+ }
+
+ if (stopper_debug_level >= 1)
+ tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
+ best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
+
+ if (no_dang_ambigs &&
+ best_choice.certainty() > CertaintyThreshold &&
+ xheight_consistency < XH_INCONSISTENT &&
+ UniformCertainties(best_choice)) {
+ return true;
+ } else {
+ if (stopper_debug_level >= 1) {
+ tprintf("AcceptableChoice() returned false"
+ " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
+ no_dang_ambigs, best_choice.certainty(),
+ CertaintyThreshold,
+ UniformCertainties(best_choice));
+ }
+ return false;
+ }
+}
+
+bool Dict::AcceptableResult(WERD_RES *word) const {
+ if (word->best_choice == nullptr) return false;
+ float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
+ int WordSize;
+
+ if (stopper_debug_level >= 1) {
+ tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
+ word->best_choice->debug_string().c_str(),
+ (valid_word(*word->best_choice) ? 'y' : 'n'),
+ (case_ok(*word->best_choice) ? 'y' : 'n'),
+ word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
+ word->best_choices.singleton() ? 'n' : 'y');
+ }
+
+ if (word->best_choice->length() == 0 || !word->best_choices.singleton())
+ return false;
+ if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) {
+ WordSize = LengthOfShortestAlphaRun(*word->best_choice);
+ WordSize -= stopper_smallword_size;
+ if (WordSize < 0)
+ WordSize = 0;
+ CertaintyThreshold += WordSize * stopper_certainty_per_char;
+ }
+
+ if (stopper_debug_level >= 1)
+ tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
+ word->best_choice->certainty(), CertaintyThreshold);
+
+ if (word->best_choice->certainty() > CertaintyThreshold &&
+ !stopper_no_acceptable_choices) {
+ if (stopper_debug_level >= 1)
+ tprintf("ACCEPTED\n");
+ return true;
+ } else {
+ if (stopper_debug_level >= 1)
+ tprintf("REJECTED\n");
+ return false;
+ }
+}
+
+#if !defined(DISABLED_LEGACY_ENGINE)
+
+// Returns true if best_choice is free of dangerous ambiguities, i.e. if no
+// alternative dictionary word can be assembled from the dangerous-ambiguity
+// table entries that match pieces of best_choice.
+// When fix_replaceable is true, a first pass applies the "replace" ambiguity
+// table, rewriting best_choice (and the ratings matrix) in place.
+// Each ambiguity found is appended to fixpt when fixpt is non-null.
+bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
+                            DANGERR *fixpt,
+                            bool fix_replaceable,
+                            MATRIX *ratings) {
+  if (stopper_debug_level > 2) {
+    tprintf("\nRunning NoDangerousAmbig() for %s\n",
+            best_choice->debug_string().c_str());
+  }
+
+  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
+  // for each unichar id in BestChoice.
+  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
+  int i;
+  bool ambigs_found = false;
+  // For each position in best_choice:
+  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
+  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
+  // -- look for ambiguities corresponding to wrong_ngram in the list while
+  //    adding the following unichar_ids from best_choice to wrong_ngram
+  //
+  // Repeat the above procedure twice: first time look through
+  // ambigs to be replaced and replace all the ambiguities found;
+  // second time look through dangerous ambiguities and construct
+  // ambig_blob_choices with a fake blob choice for each ambiguity
+  // and pass them to dawg_permute_and_select() to search for
+  // ambiguous words in the dictionaries.
+  //
+  // Note that during the execution of the for loop (on the first pass)
+  // if replacements are made the length of best_choice might change.
+  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
+    bool replace = (fix_replaceable && pass == 0);
+    const UnicharAmbigsVector &table = replace ?
+      getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
+    if (!replace) {
+      // Initialize ambig_blob_choices with lists containing a single
+      // unichar id for the corresponding position in best_choice.
+      // best_choice consisting from only the original letters will
+      // have a rating of 0.0.
+      for (i = 0; i < best_choice->length(); ++i) {
+        auto *lst = new BLOB_CHOICE_LIST();
+        BLOB_CHOICE_IT lst_it(lst);
+        // TODO(rays/antonova) Put real xheights and y shifts here.
+        lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
+                                          0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
+        ambig_blob_choices.push_back(lst);
+      }
+    }
+    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
+    int wrong_ngram_index;
+    int next_index;
+    int blob_index = 0;
+    for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
+         ++i) {
+      UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
+      if (stopper_debug_level > 2) {
+        tprintf("Looking for %s ngrams starting with %s:\n",
+                replace ? "replaceable" : "ambiguous",
+                getUnicharset().debug_str(curr_unichar_id).c_str());
+      }
+      int num_wrong_blobs = best_choice->state(i);
+      wrong_ngram_index = 0;
+      wrong_ngram[wrong_ngram_index] = curr_unichar_id;
+      if (curr_unichar_id == INVALID_UNICHAR_ID ||
+          curr_unichar_id >= table.size() ||
+          table[curr_unichar_id] == nullptr) {
+        continue; // there is no ambig spec for this unichar id
+      }
+      AmbigSpec_IT spec_it(table[curr_unichar_id]);
+      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
+        const AmbigSpec *ambig_spec = spec_it.data();
+        wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
+        int compare = UnicharIdArrayUtils::compare(wrong_ngram,
+                                                   ambig_spec->wrong_ngram);
+        if (stopper_debug_level > 2) {
+          tprintf("candidate ngram: ");
+          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
+          tprintf("current ngram from spec: ");
+          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
+          tprintf("comparison result: %d\n", compare);
+        }
+        if (compare == 0) {
+          // Record the place where we found an ambiguity.
+          if (fixpt != nullptr) {
+            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
+            fixpt->push_back(DANGERR_INFO(
+                blob_index, blob_index + num_wrong_blobs, replace,
+                getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
+                leftmost_id));
+            // NOTE(review): the debug output below prints a hardcoded 0
+            // ('false') for the 'replace' field actually stored in the
+            // DANGERR_INFO above.
+            if (stopper_debug_level > 1) {
+              tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
+                      blob_index + num_wrong_blobs, false,
+                      getUnicharset().get_isngram(
+                          ambig_spec->correct_ngram_id),
+                      getUnicharset().id_to_unichar(leftmost_id));
+            }
+          }
+
+          if (replace) {
+            if (stopper_debug_level > 2) {
+              tprintf("replace ambiguity with %s : ",
+                      getUnicharset().id_to_unichar(
+                          ambig_spec->correct_ngram_id));
+              UnicharIdArrayUtils::print(
+                  ambig_spec->correct_fragments, getUnicharset());
+            }
+            ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
+                         ambig_spec->correct_ngram_id,
+                         best_choice, ratings);
+          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
+            // We found dang ambig - update ambig_blob_choices.
+            if (stopper_debug_level > 2) {
+              tprintf("found ambiguity: ");
+              UnicharIdArrayUtils::print(
+                  ambig_spec->correct_fragments, getUnicharset());
+            }
+            ambigs_found = true;
+            for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
+                 ++tmp_index) {
+              // Add a blob choice for the corresponding fragment of the
+              // ambiguity. These fake blob choices are initialized with
+              // negative ratings (which are not possible for real blob
+              // choices), so that dawg_permute_and_select() considers any
+              // word not consisting of only the original letters a better
+              // choice and stops searching for alternatives once such a
+              // choice is found.
+              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
+              bc_it.add_to_end(new BLOB_CHOICE(
+                  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
+                  -1, 0, 1, 0, BCC_AMBIG));
+            }
+          }
+          spec_it.forward();
+        } else if (compare == -1) {
+          if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
+              ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
+            // Add the next unichar id to wrong_ngram and keep looking for
+            // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
+            wrong_ngram[++wrong_ngram_index] =
+              best_choice->unichar_id(next_index);
+            num_wrong_blobs += best_choice->state(next_index);
+          } else {
+            break; // no more matching ambigs in this AMBIG_SPEC_LIST
+          }
+        } else {
+          spec_it.forward();
+        }
+      }  // end searching AmbigSpec_LIST
+    }  // end searching best_choice
+  }  // end searching replace and dangerous ambigs
+
+  // If any ambiguities were found permute the constructed ambig_blob_choices
+  // to see if an alternative dictionary word can be found.
+  if (ambigs_found) {
+    if (stopper_debug_level > 2) {
+      tprintf("\nResulting ambig_blob_choices:\n");
+      for (i = 0; i < ambig_blob_choices.size(); ++i) {
+        print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
+        tprintf("\n");
+      }
+    }
+    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
+    // A negative rating is only reachable through the fake (negative-rated)
+    // blob choices above, so it means the permuter used at least one of them.
+    ambigs_found = (alt_word->rating() < 0.0);
+    if (ambigs_found) {
+      if (stopper_debug_level >= 1) {
+        tprintf ("Stopper: Possible ambiguous word = %s\n",
+                 alt_word->debug_string().c_str());
+      }
+      if (fixpt != nullptr) {
+        // Note: Currently character choices combined from fragments can only
+        // be generated by NoDangrousAmbigs(). This code should be updated if
+        // the capability to produce classifications combined from character
+        // fragments is added to other functions.
+        int orig_i = 0;
+        for (i = 0; i < alt_word->length(); ++i) {
+          const UNICHARSET &uchset = getUnicharset();
+          bool replacement_is_ngram =
+            uchset.get_isngram(alt_word->unichar_id(i));
+          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
+          if (replacement_is_ngram) {
+            // we have to extract the leftmost unichar from the ngram.
+            const char *str = uchset.id_to_unichar(leftmost_id);
+            int step = uchset.step(str);
+            if (step) leftmost_id = uchset.unichar_to_id(str, step);
+          }
+          int end_i = orig_i + alt_word->state(i);
+          if (alt_word->state(i) > 1 ||
+              (orig_i + 1 == end_i && replacement_is_ngram)) {
+            // Compute proper blob indices.
+            int blob_start = 0;
+            for (int j = 0; j < orig_i; ++j)
+              blob_start += best_choice->state(j);
+            int blob_end = blob_start;
+            for (int j = orig_i; j < end_i; ++j)
+              blob_end += best_choice->state(j);
+            fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
+                                          replacement_is_ngram, leftmost_id));
+            if (stopper_debug_level > 1) {
+              tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
+                      true, replacement_is_ngram,
+                      uchset.id_to_unichar(leftmost_id));
+            }
+          }
+          orig_i += alt_word->state(i);
+        }
+      }
+    }
+    delete alt_word;
+  }
+  if (output_ambig_words_file_ != nullptr) {
+    fprintf(output_ambig_words_file_, "\n");
+  }
+
+  ambig_blob_choices.delete_data_pointers();
+  return !ambigs_found;
+}
+
+void Dict::EndDangerousAmbigs() {}
+
+#endif // !defined(DISABLED_LEGACY_ENGINE)
+
+// Sets up stopping criteria for recognition pass 1: no certainty
+// rejection offset is applied.
+void Dict::SettupStopperPass1() {
+  reject_offset_ = 0.0;
+}
+
+// Sets up stopping criteria for recognition pass 2: certainties are
+// shifted by the configured phase-2 rejection offset.
+void Dict::SettupStopperPass2() {
+  reject_offset_ = stopper_phase2_certainty_rejection_offset;
+}
+
+// Replaces the wrong_ngram_size unichars of werd_choice starting at
+// wrong_ngram_begin_index with the single unichar correct_ngram_id.
+// The merged BLOB_CHOICE gets the sum of the replaced choices' ratings and
+// the average of their certainties, and is inserted into (or upgraded in)
+// the ratings MATRIX cell spanning the merged blobs.
+void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
+                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
+                        MATRIX *ratings) {
+  int num_blobs_to_replace = 0;
+  int begin_blob_index = 0;
+  int i;
+  // Rating and certainty for the new BLOB_CHOICE are derived from the
+  // replaced choices.
+  float new_rating = 0.0f;
+  float new_certainty = 0.0f;
+  BLOB_CHOICE* old_choice = nullptr;
+  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
+    if (i >= wrong_ngram_begin_index) {
+      int num_blobs = werd_choice->state(i);
+      int col = begin_blob_index + num_blobs_to_replace;
+      int row = col + num_blobs - 1;
+      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
+      ASSERT_HOST(choices != nullptr);
+      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
+      ASSERT_HOST(old_choice != nullptr);
+      new_rating += old_choice->rating();
+      new_certainty += old_choice->certainty();
+      num_blobs_to_replace += num_blobs;
+    } else {
+      begin_blob_index += werd_choice->state(i);
+    }
+  }
+  // Ratings are summed, certainty is averaged over the replaced unichars.
+  new_certainty /= wrong_ngram_size;
+  // If there is no entry in the ratings matrix, add it.
+  MATRIX_COORD coord(begin_blob_index,
+                     begin_blob_index + num_blobs_to_replace - 1);
+  if (!coord.Valid(*ratings)) {
+    ratings->IncreaseBandSize(coord.row - coord.col + 1);
+  }
+  if (ratings->get(coord.col, coord.row) == nullptr)
+    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
+  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
+  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
+  if (choice != nullptr) {
+    // Already there. Upgrade if new rating better.
+    if (new_rating < choice->rating())
+      choice->set_rating(new_rating);
+    if (new_certainty < choice->certainty())
+      choice->set_certainty(new_certainty);
+    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
+  } else {
+    // Need a new choice with the correct_ngram_id.
+    choice = new BLOB_CHOICE(*old_choice);
+    choice->set_unichar_id(correct_ngram_id);
+    choice->set_rating(new_rating);
+    choice->set_certainty(new_certainty);
+    choice->set_classifier(BCC_AMBIG);
+    choice->set_matrix_cell(coord.col, coord.row);
+    BLOB_CHOICE_IT it (new_choices);
+    it.add_to_end(choice);
+  }
+  // Remove current unichar from werd_choice. On the last iteration
+  // set the correct replacement unichar instead of removing a unichar.
+  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
+       ++replaced_count) {
+    if (replaced_count + 1 == wrong_ngram_size) {
+      werd_choice->set_blob_choice(wrong_ngram_begin_index,
+                                   num_blobs_to_replace, choice);
+    } else {
+      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
+    }
+  }
+  if (stopper_debug_level >= 1) {
+    werd_choice->print("ReplaceAmbig() ");
+    tprintf("Modified blob_choices: ");
+    print_ratings_list("\n", new_choices, getUnicharset());
+  }
+}
+
+// Returns the length of the shortest run of consecutive alphabetic unichars
+// in WordChoice, or 0 if the word contains no alphabetic unichar at all.
+int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
+  int min_run = INT32_MAX;
+  int run_len = 0;
+  const int num_unichars = WordChoice.length();
+  for (int pos = 0; pos < num_unichars; ++pos) {
+    if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(pos))) {
+      ++run_len;
+    } else if (run_len > 0) {
+      // A non-alpha unichar closes the current run.
+      if (run_len < min_run) min_run = run_len;
+      run_len = 0;
+    }
+  }
+  if (run_len > 0 && run_len < min_run) {
+    min_run = run_len;  // the word ends inside an alpha run
+  } else if (min_run == INT32_MAX) {
+    min_run = 0;  // no alpha run was ever seen
+  }
+  return min_run;
+}
+
+// Returns true (as an int) if the per-unichar certainties of word are
+// sufficiently uniform. Mean and standard deviation are computed with the
+// single worst certainty excluded; the word fails if its overall certainty
+// is more than stopper_allowable_character_badness standard deviations
+// below the mean (threshold additionally capped at
+// stopper_nondict_certainty_base). Words shorter than 3 unichars pass.
+int Dict::UniformCertainties(const WERD_CHOICE& word) {
+  float Certainty;
+  float WorstCertainty = FLT_MAX;
+  float CertaintyThreshold;
+  double TotalCertainty;
+  double TotalCertaintySquared;
+  double Variance;
+  float Mean, StdDev;
+  int word_length = word.length();
+
+  if (word_length < 3)
+    return true;
+
+  TotalCertainty = TotalCertaintySquared = 0.0;
+  for (int i = 0; i < word_length; ++i) {
+    Certainty = word.certainty(i);
+    TotalCertainty += Certainty;
+    TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
+    if (Certainty < WorstCertainty)
+      WorstCertainty = Certainty;
+  }
+
+  // Subtract off worst certainty from statistics.
+  word_length--;
+  TotalCertainty -= WorstCertainty;
+  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
+
+  Mean = TotalCertainty / word_length;
+  Variance = ((word_length * TotalCertaintySquared -
+               TotalCertainty * TotalCertainty) /
+              (word_length * (word_length - 1)));
+  // Guard against tiny negative variance from floating-point rounding.
+  if (Variance < 0.0)
+    Variance = 0.0;
+  StdDev = sqrt(Variance);
+
+  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
+  if (CertaintyThreshold > stopper_nondict_certainty_base)
+    CertaintyThreshold = stopper_nondict_certainty_base;
+
+  if (word.certainty() < CertaintyThreshold) {
+    if (stopper_debug_level >= 1)
+      tprintf("Stopper: Non-uniform certainty = %4.1f"
+              " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
+              word.certainty(), Mean, StdDev, CertaintyThreshold);
+    return false;
+  } else {
+    return true;
+  }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/stopper.h b/tesseract/src/dict/stopper.h
new file mode 100644
index 00000000..d7df3988
--- /dev/null
+++ b/tesseract/src/dict/stopper.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ ** Filename: stopper.h
+ ** Purpose: Stopping criteria for word classifier.
+ ** Author: Dan Johnson
+ ** History: Wed May 1 09:42:57 1991, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+#ifndef STOPPER_H
+#define STOPPER_H
+
+#include "params.h"
+#include "ratngs.h"
+
+#include "genericvector.h"
+#include <tesseract/unichar.h>
+
+namespace tesseract {
+
+class WERD_CHOICE;
+
+using BLOB_WIDTH = uint8_t;
+
+// Description of one ambiguity match recorded by Dict::NoDangerousAmbig().
+struct DANGERR_INFO {
+  DANGERR_INFO() :
+    begin(-1), end(-1), dangerous(false), correct_is_ngram(false),
+    leftmost(INVALID_UNICHAR_ID) {}
+  DANGERR_INFO(int b, int e, bool d, bool n, UNICHAR_ID l) :
+    begin(b), end(e), dangerous(d), correct_is_ngram(n), leftmost(l) {}
+  int begin;  // first blob index covered by the ambiguity
+  int end;    // one past the last blob index covered
+  bool dangerous;  // NOTE(review): call sites pass both 'replace' and 'true'
+                   // here -- confirm the intended polarity before relying on it
+  bool correct_is_ngram;  // true if the replacement is a single ngram unichar
+  UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?
+};
+
+using DANGERR = GenericVector<DANGERR_INFO>;
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/dict/trie.cpp b/tesseract/src/dict/trie.cpp
new file mode 100644
index 00000000..e9edbbaf
--- /dev/null
+++ b/tesseract/src/dict/trie.cpp
@@ -0,0 +1,716 @@
+/******************************************************************************
+ *
+ * File: trie.cpp (Formerly trie.c)
+ * Description: Functions to build a trie data structure.
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+/*----------------------------------------------------------------------
+ I n c l u d e s
+----------------------------------------------------------------------*/
+
+#include "trie.h"
+
+#include "dawg.h"
+#include "dict.h"
+#include "genericvector.h"
+#include "helpers.h"
+#include "kdpair.h"
+
+namespace tesseract {
+
+// Printable names for the RTLReversePolicy enum, indexed by enum value.
+const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
+const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
+const char kForceReverse[] = "RRP_FORCE_REVERSE";
+
+const char * const RTLReversePolicyNames[] = {
+  kDoNotReverse,
+  kReverseIfHasRTL,
+  kForceReverse
+};
+
+// Reserved Unicode code points (U+2000..U+2005) used as stand-in unichars
+// for user-pattern character classes; registered via initialize_patterns().
+const char Trie::kAlphaPatternUnicode[] = "\u2000";
+const char Trie::kDigitPatternUnicode[] = "\u2001";
+const char Trie::kAlphanumPatternUnicode[] = "\u2002";
+const char Trie::kPuncPatternUnicode[] = "\u2003";
+const char Trie::kLowerPatternUnicode[] = "\u2004";
+const char Trie::kUpperPatternUnicode[] = "\u2005";
+
+// Returns the string name of the given RTL reverse policy.
+// Note: reverse_policy is used directly as an array index; it must be a
+// valid RTLReversePolicy value.
+const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
+  return RTLReversePolicyNames[reverse_policy];
+}
+
+// Reset the Trie to empty: free all node storage, drop the root backward
+// freelist and edge count, then re-create node 0 so the trie is usable.
+void Trie::clear() {
+  nodes_.delete_data_pointers();
+  nodes_.clear();
+  root_back_freelist_.clear();
+  num_edges_ = 0;
+  new_dawg_node(); // Need to allocate node 0.
+}
+
+// Searches the edges of node_ref (forward_edges or backward_edges according
+// to direction) for an edge whose next-node / word-end / unichar-id fields
+// match the given values. On success fills *edge_ptr and *edge_index and
+// returns true. The root's forward edges are kept sorted, so they are
+// binary-searched; every other edge vector is scanned linearly.
+// Returns false for NO_EDGE or when no matching edge exists.
+bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
+                        int direction, bool word_end, UNICHAR_ID unichar_id,
+                        EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const {
+  if (debug_level_ == 3) {
+    tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
+            " direction %d word_end %d unichar_id %d, exploring node:\n",
+            node_ref, next_node, direction, word_end, unichar_id);
+    if (node_ref != NO_EDGE) {
+      print_node(node_ref, nodes_[node_ref]->forward_edges.size());
+    }
+  }
+  if (node_ref == NO_EDGE) return false;
+  assert(node_ref < nodes_.size());
+  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
+    nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
+  int vec_size = vec.size();
+  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
+    EDGE_INDEX start = 0;
+    EDGE_INDEX end = vec_size - 1;
+    EDGE_INDEX k;
+    int compare;
+    while (start <= end) {
+      k = (start + end) >> 1; // (start + end) / 2
+      compare = given_greater_than_edge_rec(next_node, word_end,
+                                            unichar_id, vec[k]);
+      if (compare == 0) { // given == vec[k]
+        *edge_ptr = &(vec[k]);
+        *edge_index = k;
+        return true;
+      } else if (compare == 1) { // given > vec[k]
+        start = k + 1;
+      } else { // given < vec[k]
+        end = k - 1;
+      }
+    }
+  } else { // linear search
+    for (int i = 0; i < vec_size; ++i) {
+      EDGE_RECORD &edge_rec = vec[i];
+      if (edge_rec_match(next_node, word_end, unichar_id,
+                         next_node_from_edge_rec(edge_rec),
+                         end_of_word_from_edge_rec(edge_rec),
+                         unichar_id_from_edge_rec(edge_rec))) {
+        *edge_ptr = &(edge_rec);
+        *edge_index = i;
+        return true;
+      }
+    }
+  }
+  return false; // not found
+}
+
+// Creates an edge record (node1 -> node2) with the given flags and adds it
+// to node1's edge vector for the given direction. The root's forward edges
+// stay sorted via insertion at the computed position; the root's backward
+// edges re-use slots from root_back_freelist_ when available; all other
+// edges are appended unsorted. Always returns true.
+bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
+                            int direction, bool word_end,
+                            UNICHAR_ID unichar_id) {
+  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
+    &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
+  int search_index;
+  if (node1 == 0 && direction == FORWARD_EDGE) {
+    search_index = 0; // find the index to make the add sorted
+    while (search_index < vec->size() &&
+           given_greater_than_edge_rec(node2, word_end, unichar_id,
+                                       (*vec)[search_index]) == 1) {
+      search_index++;
+    }
+  } else {
+    search_index = vec->size(); // add is unsorted, so index does not matter
+  }
+  EDGE_RECORD edge_rec;
+  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
+  if (node1 == 0 && direction == BACKWARD_EDGE &&
+      !root_back_freelist_.empty()) {
+    EDGE_INDEX edge_index = root_back_freelist_.pop_back();
+    (*vec)[edge_index] = edge_rec;
+  } else if (search_index < vec->size()) {
+    vec->insert(edge_rec, search_index);
+  } else {
+    vec->push_back(edge_rec);
+  }
+  if (debug_level_ > 1) {
+    tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
+    print_edge_rec(edge_rec);
+    tprintf("\n");
+  }
+  num_edges_++;
+  return true;
+}
+
+// Marks the given forward edge and the matching backward edge leaving
+// the_next_node as end-of-word, propagating marker_flag to both records.
+void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
+                           NODE_REF the_next_node,
+                           bool marker_flag,
+                           UNICHAR_ID unichar_id) {
+  EDGE_RECORD *back_edge_ptr;
+  EDGE_INDEX back_edge_index;
+  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
+                           unichar_id, &back_edge_ptr, &back_edge_index));
+  if (marker_flag) {
+    *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
+    *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
+  }
+  // Mark both directions as end of word.
+  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
+  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
+}
+
+// Adds a word (with optional parallel per-unichar repetition flags) to the
+// trie. Existing edges are followed while the prefix is already present;
+// new nodes/edges are created for the remainder; the final unichar is
+// marked as end-of-word (or linked back to the root for a brand-new leaf).
+// Returns false without modifying the trie if the word is empty or contains
+// out-of-range unichar ids; if an internal add fails, the whole trie is
+// cleared and false is returned.
+bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
+                            const GenericVector<bool> *repetitions) {
+  if (word.length() <= 0) return false; // can't add empty words
+  if (repetitions != nullptr) ASSERT_HOST(repetitions->size() == word.length());
+  // Make sure the word does not contain invalid unichar ids.
+  for (int i = 0; i < word.length(); ++i) {
+    if (word.unichar_id(i) < 0 ||
+        word.unichar_id(i) >= unicharset_size_) return false;
+  }
+
+  EDGE_RECORD *edge_ptr;
+  NODE_REF last_node = 0;
+  NODE_REF the_next_node;
+  bool marker_flag = false;
+  EDGE_INDEX edge_index;
+  int i;
+  int32_t still_finding_chars = true;
+  int32_t word_end = false;
+  bool add_failed = false;
+  bool found;
+
+  if (debug_level_ > 1) word.print("\nAdding word: ");
+
+  UNICHAR_ID unichar_id;
+  for (i = 0; i < word.length() - 1; ++i) {
+    unichar_id = word.unichar_id(i);
+    marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
+    if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
+    if (still_finding_chars) {
+      found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
+                           unichar_id, &edge_ptr, &edge_index);
+      if (found && debug_level_ > 1) {
+        tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
+                edge_index, last_node);
+      }
+      if (!found) {
+        still_finding_chars = false;
+      } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
+        // We hit the end of an existing word, but the new word is longer.
+        // In this case we have to disconnect the existing word from the
+        // backwards root node, mark the current position as end-of-word
+        // and add new nodes for the increased length. Disconnecting the
+        // existing word from the backwards root node requires a linear
+        // search, so it is much faster to add the longest words first,
+        // to avoid having to come here.
+        word_end = true;
+        still_finding_chars = false;
+        remove_edge(last_node, 0, word_end, unichar_id);
+      } else {
+        // We have to add a new branch here for the new word.
+        if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
+        last_node = next_node_from_edge_rec(*edge_ptr);
+      }
+    }
+    if (!still_finding_chars) {
+      the_next_node = new_dawg_node();
+      if (debug_level_ > 1)
+        tprintf("adding node " REFFORMAT "\n", the_next_node);
+      if (the_next_node == 0) {
+        add_failed = true;
+        break;
+      }
+      if (!add_new_edge(last_node, the_next_node,
+                        marker_flag, word_end, unichar_id)) {
+        add_failed = true;
+        break;
+      }
+      word_end = false;
+      last_node = the_next_node;
+    }
+  }
+  // Handle the last unichar of the word separately.
+  the_next_node = 0;
+  unichar_id = word.unichar_id(i);
+  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
+  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
+  if (still_finding_chars &&
+      edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
+                   unichar_id, &edge_ptr, &edge_index)) {
+    // An extension of this word already exists in the trie, so we
+    // only have to add the ending flags in both directions.
+    add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
+                    marker_flag, unichar_id);
+  } else {
+    // Add a link to node 0. All leaves connect to node 0 so the back links can
+    // be used in reduction to a dawg. This root backward node has one edge
+    // entry for every word, (except prefixes of longer words) so it is huge.
+    if (!add_failed &&
+        !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
+      add_failed = true;
+  }
+  if (add_failed) {
+    tprintf("Re-initializing document dictionary...\n");
+    clear();
+    return false;
+  } else {
+    return true;
+  }
+}
+
+// Allocates a fresh empty node, appends it to nodes_ and returns its index,
+// which serves as the NODE_REF of the new node.
+NODE_REF Trie::new_dawg_node() {
+  auto *node = new TRIE_NODE_RECORD();
+  nodes_.push_back(node);
+  return nodes_.size() - 1;
+}
+
+// Reads the word list from filename, sorts it by decreasing byte length
+// (adding longer words before their prefixes avoids the expensive root-node
+// surgery described in add_word_to_dawg) and adds all words.
+bool Trie::read_and_add_word_list(const char *filename,
+                                  const UNICHARSET &unicharset,
+                                  Trie::RTLReversePolicy reverse_policy) {
+  std::vector<STRING> word_list;
+  if (!read_word_list(filename, &word_list)) return false;
+  std::sort(word_list.begin(), word_list.end(), [](auto &s1, auto &s2) {
+    return s1.size() > s2.size();
+  });
+  return add_word_list(word_list, unicharset, reverse_policy);
+}
+
+// Reads one word per line from filename into words. Returns false only when
+// the file cannot be opened; otherwise reads to EOF and returns true.
+// Progress is logged every 10000 words when debug_level_ is non-zero.
+bool Trie::read_word_list(const char *filename,
+                          std::vector<STRING>* words) {
+  FILE *word_file = fopen(filename, "rb");
+  if (word_file == nullptr) return false;
+
+  char line_str[CHARS_PER_LINE];
+  int word_count = 0;
+  while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
+    chomp_string(line_str);  // strip the trailing newline
+    ++word_count;
+    if (debug_level_ && word_count % 10000 == 0)
+      tprintf("Read %d words so far\n", word_count);
+    words->push_back(STRING(line_str));
+  }
+  if (debug_level_)
+    tprintf("Read %d words total.\n", word_count);
+  fclose(word_file);
+  return true;
+}
+
+// Adds every parseable word from words to the dawg, reversing unichar ids
+// according to reverse_policy. Words that are empty or contain unichars
+// missing from unicharset are skipped. Returns false (stopping early) if a
+// word is still absent from the dawg right after being added.
+bool Trie::add_word_list(const std::vector<STRING> &words,
+                         const UNICHARSET &unicharset,
+                         Trie::RTLReversePolicy reverse_policy) {
+  for (const STRING &entry : words) {
+    WERD_CHOICE word(entry.c_str(), unicharset);
+    if (word.length() == 0 || word.contains_unichar_id(INVALID_UNICHAR_ID))
+      continue;
+    const bool reverse =
+        reverse_policy == RRP_FORCE_REVERSE ||
+        (reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
+         word.has_rtl_unichar_id());
+    if (reverse) word.reverse_and_mirror_unichar_ids();
+    if (word_in_dawg(word)) continue;  // already present; nothing to do
+    add_word_to_dawg(word);
+    if (!word_in_dawg(word)) {
+      tprintf("Error: word '%s' not in DAWG after adding it\n",
+              entry.c_str());
+      return false;
+    }
+  }
+  return true;
+}
+
+// Registers the six reserved pattern code points in the unicharset and
+// caches the unichar id assigned to each in the matching member field.
+// Must be called before read_pattern_list().
+void Trie::initialize_patterns(UNICHARSET *unicharset) {
+  struct PatternEntry {
+    const char *unicode;  // reserved code point to insert
+    UNICHAR_ID *id;       // member that receives the assigned id
+  };
+  // Same registration order as the original one-by-one inserts.
+  const PatternEntry entries[] = {
+      {kAlphaPatternUnicode, &alpha_pattern_},
+      {kDigitPatternUnicode, &digit_pattern_},
+      {kAlphanumPatternUnicode, &alphanum_pattern_},
+      {kPuncPatternUnicode, &punc_pattern_},
+      {kLowerPatternUnicode, &lower_pattern_},
+      {kUpperPatternUnicode, &upper_pattern_},
+  };
+  for (const PatternEntry &entry : entries) {
+    unicharset->unichar_insert(entry.unicode);
+    *entry.id = unicharset->unichar_to_id(entry.unicode);
+  }
+  initialized_patterns_ = true;
+  unicharset_size_ = unicharset->size();
+}
+
+// Appends to vec every pattern unichar id matching unichar_id's character
+// classes in unicharset: alpha adds alpha + alphanum (+ lower or upper);
+// digit adds digit (+ alphanum unless already added as alpha);
+// punctuation adds punc.
+void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
+                                  const UNICHARSET &unicharset,
+                                  GenericVector<UNICHAR_ID> *vec) const {
+  bool is_alpha = unicharset.get_isalpha(unichar_id);
+  if (is_alpha) {
+    vec->push_back(alpha_pattern_);
+    vec->push_back(alphanum_pattern_);
+    if (unicharset.get_islower(unichar_id)) {
+      vec->push_back(lower_pattern_);
+    } else if (unicharset.get_isupper(unichar_id)) {
+      vec->push_back(upper_pattern_);
+    }
+  }
+  if (unicharset.get_isdigit(unichar_id)) {
+    vec->push_back(digit_pattern_);
+    // avoid pushing alphanum_pattern_ twice for alphanumeric unichars
+    if (!is_alpha) vec->push_back(alphanum_pattern_);
+  }
+  if (unicharset.get_ispunctuation(unichar_id)) {
+    vec->push_back(punc_pattern_);
+  }
+}
+
+// Maps a user-pattern character-class letter (the character written after a
+// backslash in a pattern file) to the corresponding pattern unichar id:
+//   c=alpha, d=digit, n=alphanumeric, p=punctuation, a=lower, A=upper.
+// Any other character yields INVALID_UNICHAR_ID.
+UNICHAR_ID Trie::character_class_to_pattern(char ch) {
+  switch (ch) {
+    case 'c': return alpha_pattern_;
+    case 'd': return digit_pattern_;
+    case 'n': return alphanum_pattern_;
+    case 'p': return punc_pattern_;
+    case 'a': return lower_pattern_;
+    case 'A': return upper_pattern_;
+    default: return INVALID_UNICHAR_ID;
+  }
+}
+
+// Reads user-defined patterns from filename and inserts them into the trie.
+// Each line is a UTF-8 pattern; "\c \d \n \p \a \A" (after at least
+// kSaneNumConcreteChars concrete characters) denote character classes (see
+// character_class_to_pattern), "\\" escapes a literal backslash, and "\*"
+// following a unichar marks it as repeatable. Invalid lines are reported
+// and skipped. Returns false if initialize_patterns() was not called first
+// or the file cannot be opened.
+bool Trie::read_pattern_list(const char *filename,
+                             const UNICHARSET &unicharset) {
+  if (!initialized_patterns_) {
+    tprintf("please call initialize_patterns() before read_pattern_list()\n");
+    return false;
+  }
+
+  FILE *pattern_file = fopen(filename, "rb");
+  if (pattern_file == nullptr) {
+    tprintf("Error opening pattern file %s\n", filename);
+    return false;
+  }
+
+  int pattern_count = 0;
+  char string[CHARS_PER_LINE];
+  while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
+    chomp_string(string); // remove newline
+    // Parse the pattern and construct a unichar id vector.
+    // Record the number of repetitions of each unichar in the parallel vector.
+    WERD_CHOICE word(&unicharset);
+    GenericVector<bool> repetitions_vec;
+    const char *str_ptr = string;
+    int step = unicharset.step(str_ptr);
+    bool failed = false;
+    while (step > 0) {
+      UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
+      if (step == 1 && *str_ptr == '\\') {
+        ++str_ptr;
+        if (*str_ptr == '\\') { // regular '\' unichar that was escaped
+          curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
+        } else {
+          if (word.length() < kSaneNumConcreteChars) {
+            tprintf("Please provide at least %d concrete characters at the"
+                    " beginning of the pattern\n", kSaneNumConcreteChars);
+            failed = true;
+            break;
+          }
+          // Parse character class from expression.
+          curr_unichar_id = character_class_to_pattern(*str_ptr);
+        }
+      } else {
+        curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
+      }
+      if (curr_unichar_id == INVALID_UNICHAR_ID) {
+        failed = true;
+        break; // failed to parse this pattern
+      }
+      word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
+      repetitions_vec.push_back(false);
+      str_ptr += step;
+      step = unicharset.step(str_ptr);
+      // Check if there is a repetition pattern specified after this unichar.
+      if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
+        repetitions_vec[repetitions_vec.size()-1] = true;
+        str_ptr += 2;
+        step = unicharset.step(str_ptr);
+      }
+    }
+    if (failed) {
+      tprintf("Invalid user pattern %s\n", string);
+      continue;
+    }
+    // Insert the pattern into the trie.
+    if (debug_level_ > 2) {
+      tprintf("Inserting expanded user pattern %s\n",
+              word.debug_string().c_str());
+    }
+    if (!this->word_in_dawg(word)) {
+      this->add_word_to_dawg(word, &repetitions_vec);
+      if (!this->word_in_dawg(word)) {
+        tprintf("Error: failed to insert pattern '%s'\n", string);
+      }
+    }
+    ++pattern_count;
+  }
+  if (debug_level_) {
+    tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
+  }
+  fclose(pattern_file);
+  return true;
+}
+
+// Removes the edge from node1 matching (node2, direction, word_end,
+// unichar_id). Root backward edges are only zeroed (KillEdge) with their
+// slot recorded in root_back_freelist_, because erasing from that huge
+// vector would be O(n); all other edges are removed outright.
+void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
+                               bool word_end, UNICHAR_ID unichar_id) {
+  EDGE_RECORD *edge_ptr = nullptr;
+  EDGE_INDEX edge_index = 0;
+  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
+                           unichar_id, &edge_ptr, &edge_index));
+  if (debug_level_ > 1) {
+    tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
+    print_edge_rec(*edge_ptr);
+    tprintf("\n");
+  }
+  if (direction == FORWARD_EDGE) {
+    nodes_[node1]->forward_edges.remove(edge_index);
+  } else if (node1 == 0) {
+    KillEdge(&nodes_[node1]->backward_edges[edge_index]);
+    root_back_freelist_.push_back(edge_index);
+  } else {
+    nodes_[node1]->backward_edges.remove(edge_index);
+  }
+  --num_edges_;
+}
+
+// Some optimizations employed in add_word_to_dawg and trie_to_dawg:
+// 1 Avoid insertion sorting or bubble sorting the tail root node
+// (back links on node 0, a list of all the leaves.). The node is
+// huge, and sorting it with n^2 time is terrible.
+// 2 Avoid using GenericVector::remove on the tail root node.
+// (a) During add of words to the trie, zero-out the unichars and
+// keep a freelist of spaces to re-use.
+// (b) During reduction, just zero-out the unichars of deleted back
+// links, skipping zero entries while searching.
+// 3 Avoid linear search of the tail root node. This has to be done when
+// a suffix is added to an existing word. Adding words by decreasing
+// length avoids this problem entirely. Words can still be added in
+// any order, but it is faster to add the longest first.
+// Reduces the trie in place, then packs the surviving forward edges into a
+// newly allocated EDGE_ARRAY whose ownership passes to the returned
+// SquishedDawg. Backward edges and empty nodes are dropped; next-node
+// references are remapped through node_ref_map.
+SquishedDawg *Trie::trie_to_dawg() {
+  root_back_freelist_.clear(); // Will be invalidated by trie_to_dawg.
+  if (debug_level_ > 2) {
+    print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
+  }
+  auto reduced_nodes = new bool[nodes_.size()];
+  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = false;
+  this->reduce_node_input(0, reduced_nodes);
+  delete[] reduced_nodes;
+
+  if (debug_level_ > 2) {
+    print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
+  }
+  // Build a translation map from node indices in nodes_ vector to
+  // their target indices in EDGE_ARRAY.
+  auto *node_ref_map = new NODE_REF[nodes_.size() + 1];
+  int i, j;
+  node_ref_map[0] = 0;
+  for (i = 0; i < nodes_.size(); ++i) {
+    node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
+  }
+  // After the loop i == nodes_.size(), so this is the total edge count.
+  int num_forward_edges = node_ref_map[i];
+
+  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
+  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
+  auto edge_array = new EDGE_RECORD[num_forward_edges];
+  EDGE_ARRAY edge_array_ptr = edge_array;
+  for (i = 0; i < nodes_.size(); ++i) {
+    TRIE_NODE_RECORD *node_ptr = nodes_[i];
+    int end = node_ptr->forward_edges.size();
+    for (j = 0; j < end; ++j) {
+      EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
+      NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
+      ASSERT_HOST(node_ref < nodes_.size());
+      UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
+      link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
+                end_of_word_from_edge_rec(edge_rec), unichar_id);
+      // The last edge of each node carries the marker flag.
+      if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
+      ++edge_array_ptr;
+    }
+  }
+  delete[] node_ref_map;
+
+  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
+                          perm_, unicharset_size_, debug_level_);
+}
+
+bool Trie::eliminate_redundant_edges(NODE_REF node,
+ const EDGE_RECORD &edge1,
+ const EDGE_RECORD &edge2) {
+ if (debug_level_ > 1) {
+ tprintf("\nCollapsing node %" PRIi64 ":\n", node);
+ print_node(node, MAX_NODE_EDGES_DISPLAY);
+ tprintf("Candidate edges: ");
+ print_edge_rec(edge1);
+ tprintf(", ");
+ print_edge_rec(edge2);
+ tprintf("\n\n");
+ }
+ NODE_REF next_node1 = next_node_from_edge_rec(edge1);
+ NODE_REF next_node2 = next_node_from_edge_rec(edge2);
+ TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
+ // Translate all edges going to/from next_node2 to go to/from next_node1.
+ EDGE_RECORD *edge_ptr = nullptr;
+ EDGE_INDEX edge_index;
+ int i;
+ // The backward link in node to next_node2 will be zeroed out by the caller.
+ // Copy all the backward links in next_node2 to node next_node1
+ for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
+ const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
+ NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
+ UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
+ int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
+ bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
+ add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
+ curr_word_end, curr_unichar_id);
+ // Relocate the corresponding forward edge in curr_next_node
+ ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
+ curr_word_end, curr_unichar_id,
+ &edge_ptr, &edge_index));
+ set_next_node_in_edge_rec(edge_ptr, next_node1);
+ }
+ int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
+ next_node2_ptr->backward_edges.size());
+ if (debug_level_ > 1) {
+ tprintf("removed %d edges from node " REFFORMAT "\n",
+ next_node2_num_edges, next_node2);
+ }
+ next_node2_ptr->forward_edges.clear();
+ next_node2_ptr->backward_edges.clear();
+ num_edges_ -= next_node2_num_edges;
+ return true;
+}
+
+bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index,
+ UNICHAR_ID unichar_id,
+ NODE_REF node,
+ EDGE_VECTOR* backward_edges,
+ NODE_MARKER reduced_nodes) {
+ if (debug_level_ > 1)
+ tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
+ // Compare each of the edge pairs with the given unichar_id.
+ bool did_something = false;
+ for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
+ // Find the first edge that can be eliminated.
+ UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
+ while (i < backward_edges->size()) {
+ if (!DeadEdge((*backward_edges)[i])) {
+ curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
+ if (curr_unichar_id != unichar_id) return did_something;
+ if (can_be_eliminated((*backward_edges)[i])) break;
+ }
+ ++i;
+ }
+ if (i == backward_edges->size()) break;
+ const EDGE_RECORD &edge_rec = (*backward_edges)[i];
+ // Compare it to the rest of the edges with the given unichar_id.
+ for (int j = i + 1; j < backward_edges->size(); ++j) {
+ const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
+ if (DeadEdge(next_edge_rec)) continue;
+ UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
+ if (next_id != unichar_id) break;
+ if (end_of_word_from_edge_rec(next_edge_rec) ==
+ end_of_word_from_edge_rec(edge_rec) &&
+ can_be_eliminated(next_edge_rec) &&
+ eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
+ reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
+ did_something = true;
+ KillEdge(&(*backward_edges)[j]);
+ }
+ }
+ }
+ return did_something;
+}
+
+void Trie::sort_edges(EDGE_VECTOR *edges) {
+ int num_edges = edges->size();
+ if (num_edges <= 1) return;
+ GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD> > sort_vec;
+ sort_vec.reserve(num_edges);
+ for (int i = 0; i < num_edges; ++i) {
+ sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
+ unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
+ }
+ sort_vec.sort();
+ for (int i = 0; i < num_edges; ++i)
+ (*edges)[i] = sort_vec[i].data();
+}
+
+void Trie::reduce_node_input(NODE_REF node,
+ NODE_MARKER reduced_nodes) {
+ EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
+ sort_edges(&backward_edges);
+ if (debug_level_ > 1) {
+ tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
+ print_node(node, MAX_NODE_EDGES_DISPLAY);
+ }
+
+ EDGE_INDEX edge_index = 0;
+ while (edge_index < backward_edges.size()) {
+ if (DeadEdge(backward_edges[edge_index])) continue;
+ UNICHAR_ID unichar_id =
+ unichar_id_from_edge_rec(backward_edges[edge_index]);
+ while (reduce_lettered_edges(edge_index, unichar_id, node,
+ &backward_edges, reduced_nodes));
+ while (++edge_index < backward_edges.size()) {
+ UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
+ if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
+ }
+ }
+ reduced_nodes[node] = true; // mark as reduced
+
+ if (debug_level_ > 1) {
+ tprintf("Node " REFFORMAT " after reduction:\n", node);
+ print_node(node, MAX_NODE_EDGES_DISPLAY);
+ }
+
+ for (int i = 0; i < backward_edges.size(); ++i) {
+ if (DeadEdge(backward_edges[i])) continue;
+ NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
+ if (next_node != 0 && !reduced_nodes[next_node]) {
+ reduce_node_input(next_node, reduced_nodes);
+ }
+ }
+}
+
+void Trie::print_node(NODE_REF node, int max_num_edges) const {
+ if (node == NO_EDGE) return; // nothing to print
+ TRIE_NODE_RECORD *node_ptr = nodes_[node];
+ int num_fwd = node_ptr->forward_edges.size();
+ int num_bkw = node_ptr->backward_edges.size();
+ EDGE_VECTOR *vec;
+ for (int dir = 0; dir < 2; ++dir) {
+ if (dir == 0) {
+ vec = &(node_ptr->forward_edges);
+ tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
+ } else {
+ vec = &(node_ptr->backward_edges);
+ tprintf("\t");
+ }
+ int i;
+ for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
+ i < max_num_edges; ++i) {
+ if (DeadEdge((*vec)[i])) continue;
+ print_edge_rec((*vec)[i]);
+ tprintf(" ");
+ }
+ if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
+ tprintf("\n");
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/trie.h b/tesseract/src/dict/trie.h
new file mode 100644
index 00000000..f516b3c5
--- /dev/null
+++ b/tesseract/src/dict/trie.h
@@ -0,0 +1,432 @@
+/******************************************************************************
+ *
+ * File: trie.h
+ * Description: Functions to build a trie data structure.
+ * Author: Mark Seaman, SW Productivity
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+#ifndef TRIE_H
+#define TRIE_H
+
+#include "dawg.h"
+
+#include "genericvector.h"
+
+namespace tesseract {
+
+class UNICHARSET;
+
+// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
+// max int32, we will need to change GenericVector to use int64 for size
+// and address indices. This does not seem to be needed immediately,
+// since currently the largest number of edges limit used by tesseract
+// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
+// There are also int casts below to satisfy the WIN32 compiler that would
+// need to be changed.
+// It might be cleanest to change the types of most of the Trie/Dawg related
+// typedefs to int and restrict the casts to extracting these values from
+// the 64 bit EDGE_RECORD.
+using EDGE_INDEX = int64_t ; // index of an edge in a given node
+using NODE_MARKER = bool *;
+using EDGE_VECTOR = GenericVector<EDGE_RECORD> ;
+
+struct TRIE_NODE_RECORD {
+ EDGE_VECTOR forward_edges;
+ EDGE_VECTOR backward_edges;
+};
+using TRIE_NODES = GenericVector<TRIE_NODE_RECORD *> ;
+
+/**
+ * Concrete class for Trie data structure that allows to store a list of
+ * words (extends Dawg base class) as well as dynamically add new words.
+ * This class stores a vector of pointers to TRIE_NODE_RECORDs, each of
+ * which has a vector of forward and backward edges.
+ */
+class TESS_API Trie : public Dawg {
+ public:
+ enum RTLReversePolicy {
+ RRP_DO_NO_REVERSE,
+ RRP_REVERSE_IF_HAS_RTL,
+ RRP_FORCE_REVERSE,
+ };
+
+ // Minimum number of concrete characters at the beginning of user patterns.
+ static const int kSaneNumConcreteChars = 0;
+ // Various unicode whitespace characters are used to denote unichar patterns,
+ // (character classifier would never produce these whitespace characters as a
+ // valid classification).
+ static const char kAlphaPatternUnicode[];
+ static const char kDigitPatternUnicode[];
+ static const char kAlphanumPatternUnicode[];
+ static const char kPuncPatternUnicode[];
+ static const char kLowerPatternUnicode[];
+ static const char kUpperPatternUnicode[];
+
+ static const char *get_reverse_policy_name(
+ RTLReversePolicy reverse_policy);
+
+ // max_num_edges argument allows limiting the amount of memory this
+ // Trie can consume (if a new word insert would cause the Trie to
+ // contain more edges than max_num_edges, all the edges are cleared
+ // so that new inserts can proceed).
+ Trie(DawgType type, const STRING &lang, PermuterType perm,
+ int unicharset_size, int debug_level)
+ : Dawg(type, lang, perm, debug_level) {
+ init(unicharset_size);
+ num_edges_ = 0;
+ deref_node_index_mask_ = ~letter_mask_;
+ new_dawg_node(); // need to allocate node 0
+ initialized_patterns_ = false;
+ }
+ ~Trie() override { nodes_.delete_data_pointers(); }
+
+ // Reset the Trie to empty.
+ void clear();
+
+ /** Returns the edge that corresponds to the letter out of this node. */
+ EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id,
+ bool word_end) const override {
+ EDGE_RECORD *edge_ptr;
+ EDGE_INDEX edge_index;
+ if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
+ &edge_ptr, &edge_index)) return NO_EDGE;
+ return make_edge_ref(node_ref, edge_index);
+ }
+
+ /**
+ * Fills the given NodeChildVector with all the unichar ids (and the
+ * corresponding EDGE_REFs) for which there is an edge out of this node.
+ */
+ void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
+ bool word_end) const override {
+ const EDGE_VECTOR &forward_edges =
+ nodes_[static_cast<int>(node)]->forward_edges;
+ for (int i = 0; i < forward_edges.size(); ++i) {
+ if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
+ vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
+ make_edge_ref(node, i)));
+ }
+ }
+ }
+
+ /**
+ * Returns the next node visited by following the edge
+ * indicated by the given EDGE_REF.
+ */
+ NODE_REF next_node(EDGE_REF edge_ref) const override {
+ if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
+ return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
+ }
+
+ /**
+ * Returns true if the edge indicated by the given EDGE_REF
+ * marks the end of a word.
+ */
+ bool end_of_word(EDGE_REF edge_ref) const override {
+ if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
+ return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
+ }
+
+ /** Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. */
+ UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
+ if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
+ return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
+ }
+ // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
+ // the edge dead.
+ void KillEdge(EDGE_RECORD* edge_rec) const {
+ *edge_rec &= ~letter_mask_;
+ *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
+ }
+ bool DeadEdge(const EDGE_RECORD& edge_rec) const {
+ return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
+ }
+
+ // Prints the contents of the node indicated by the given NODE_REF.
+ // At most max_num_edges will be printed.
+ void print_node(NODE_REF node, int max_num_edges) const override;
+
+ // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
+ // Eliminates redundant edges and returns the pointer to the SquishedDawg.
+ // Note: the caller is responsible for deallocating memory associated
+ // with the returned SquishedDawg pointer.
+ SquishedDawg *trie_to_dawg();
+
+ // Reads a list of words from the given file and adds into the Trie.
+ // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
+ // policy and information in the unicharset.
+ // Returns false on error.
+ bool read_and_add_word_list(const char *filename,
+ const UNICHARSET &unicharset,
+ Trie::RTLReversePolicy reverse);
+
+ // Reads a list of words from the given file.
+ // Returns false on error.
+ bool read_word_list(const char *filename,
+ std::vector<STRING>* words);
+ // Adds a list of words previously read using read_word_list to the trie
+ // using the given unicharset and reverse_policy to convert to unichar-ids.
+ // Returns false on error.
+ bool add_word_list(const std::vector<STRING> &words,
+ const UNICHARSET &unicharset,
+ Trie::RTLReversePolicy reverse_policy);
+
+ // Inserts the list of patterns from the given file into the Trie.
+ // The pattern list file should contain one pattern per line in UTF-8 format.
+ //
+ // Each pattern can contain any non-whitespace characters, however only the
+ // patterns that contain characters from the unicharset of the corresponding
+ // language will be useful.
+ // The only meta character is '\'. To be used in a pattern as an ordinary
+ // string it should be escaped with '\' (e.g. string "C:\Documents" should
+ // be written in the patterns file as "C:\\Documents").
+ // This function supports a very limited regular expression syntax. One can
+ // express a character, a certain character class and a number of times the
+ // entity should be repeated in the pattern.
+ //
+ // To denote a character class use one of:
+ // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
+ // \d - unichar for which UNICHARSET::get_isdigit() is true
+ // \n - unichar for which UNICHARSET::get_isdigit() or
+ // UNICHARSET::get_isalpha() are true
+ // \p - unichar for which UNICHARSET::get_ispunct() is true
+ // \a - unichar for which UNICHARSET::get_islower() is true
+ // \A - unichar for which UNICHARSET::get_isupper() is true
+ //
+ // \* could be specified after each character or pattern to indicate that
+ // the character/pattern can be repeated any number of times before the next
+ // character/pattern occurs.
+ //
+ // Examples:
+ // 1-8\d\d-GOOG-411 will be expanded to strings:
+ // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
+ //
+ // http://www.\n\*.com will be expanded to strings like:
+ // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
+ //
+ // Note: In choosing which patterns to include please be aware of the fact
+ // providing very generic patterns will make tesseract run slower.
+ // For example \n\* at the beginning of the pattern will make Tesseract
+ // consider all the combinations of proposed character choices for each
+ // of the segmentations, which will be unacceptably slow.
+ // Because of potential problems with speed that could be difficult to
+ // identify, each user pattern has to have at least kSaneNumConcreteChars
+ // concrete characters from the unicharset at the beginning.
+ bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
+
+ // Initializes the values of *_pattern_ unichar ids.
+ // This function should be called before calling read_pattern_list().
+ void initialize_patterns(UNICHARSET *unicharset);
+
+ // Fills in the given unichar id vector with the unichar ids that represent
+ // the patterns of the character classes of the given unichar_id.
+ void unichar_id_to_patterns(UNICHAR_ID unichar_id,
+ const UNICHARSET &unicharset,
+ GenericVector<UNICHAR_ID> *vec) const override;
+
+ // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
+ // a self loop and the given unichar_id matches the unichar_id stored in the
+ // EDGE_RECORD, returns NO_EDGE otherwise.
+ EDGE_REF pattern_loop_edge(EDGE_REF edge_ref,
+ UNICHAR_ID unichar_id,
+ bool word_end) const override {
+ if (edge_ref == NO_EDGE) return NO_EDGE;
+ EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
+ return (marker_flag_from_edge_rec(*edge_rec) &&
+ unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
+ word_end == end_of_word_from_edge_rec(*edge_rec)) ?
+ edge_ref : NO_EDGE;
+ }
+
+ // Adds a word to the Trie (creates the necessary nodes and edges).
+ //
+ // If repetitions vector is not nullptr, each entry in the vector indicates
+ // whether the unichar id with the corresponding index in the word is allowed
+ // to repeat an unlimited number of times. For each entry that is true, MARKER
+ // flag of the corresponding edge created for this unichar id is set to true).
+ //
+ // Return true if add succeeded, false otherwise (e.g. when a word contained
+ // an invalid unichar id or the trie was getting too large and was cleared).
+ bool add_word_to_dawg(const WERD_CHOICE &word,
+ const GenericVector<bool> *repetitions);
+ bool add_word_to_dawg(const WERD_CHOICE &word) {
+ return add_word_to_dawg(word, nullptr);
+ }
+
+ protected:
+ // The structure of an EDGE_REF for Trie edges is as follows:
+ // [LETTER_START_BIT, flag_start_bit_):
+ // edge index in *_edges in a TRIE_NODE_RECORD
+ // [flag_start_bit_, 30th bit]: node index in nodes (TRIE_NODES vector)
+ //
+ // With this arrangement there are enough bits to represent edge indices
+ // (each node can have at most unicharset_size_ forward edges and
+ // the position of flag_start_bit is set to be log2(unicharset_size_)).
+ // It is also possible to accommodate a maximum number of nodes that is at
+ // least as large as that of the SquishedDawg representation (in SquishedDawg
+ // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
+ // the next node index).
+ //
+
+ // Returns the pointer to EDGE_RECORD after decoding the location
+ // of the edge from the information in the given EDGE_REF.
+ // This function assumes that EDGE_REF holds valid node/edge indices.
+ inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
+ int edge_index = static_cast<int>(
+ (edge_ref & letter_mask_) >> LETTER_START_BIT);
+ int node_index = static_cast<int>(
+ (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
+ TRIE_NODE_RECORD *node_rec = nodes_[node_index];
+ return &(node_rec->forward_edges[edge_index]);
+ }
+ /** Constructs EDGE_REF from the given node_index and edge_index. */
+ inline EDGE_REF make_edge_ref(NODE_REF node_index,
+ EDGE_INDEX edge_index) const {
+ return ((node_index << flag_start_bit_) |
+ (edge_index << LETTER_START_BIT));
+ }
+ /** Sets up this edge record to the requested values. */
+ inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats,
+ int direction, bool word_end, UNICHAR_ID unichar_id) {
+ EDGE_RECORD flags = 0;
+ if (repeats) flags |= MARKER_FLAG;
+ if (word_end) flags |= WERD_END_FLAG;
+ if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
+ *edge = ((nxt << next_node_start_bit_) |
+ (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
+ (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
+ }
+ /** Prints the given EDGE_RECORD. */
+ inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
+ tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
+ marker_flag_from_edge_rec(edge_rec) ? "R," : "",
+ (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
+ end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
+ unichar_id_from_edge_rec(edge_rec));
+ }
+ // Returns true if the next node recorded in the given EDGE_RECORD
+ // has exactly one forward edge.
+ inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
+ NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
+ return (node_ref != NO_EDGE &&
+ nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
+ }
+
+ // Prints the contents of the Trie.
+ // At most max_num_edges will be printed for each node.
+ void print_all(const char* msg, int max_num_edges) {
+ tprintf("\n__________________________\n%s\n", msg);
+ for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
+ tprintf("__________________________\n");
+ }
+
+ // Finds the edge with the given direction, word_end and unichar_id
+ // in the node indicated by node_ref. Fills in the pointer to the
+ // EDGE_RECORD and the index of the edge with the values
+ // corresponding to the edge found. Returns true if an edge was found.
+ bool edge_char_of(NODE_REF node_ref, NODE_REF next_node,
+ int direction, bool word_end, UNICHAR_ID unichar_id,
+ EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
+
+ // Adds a single edge linkage between node1 and node2 in the direction
+ // indicated by direction argument.
+ bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats,
+ int direction, bool word_end,
+ UNICHAR_ID unichar_id);
+
+ // Adds forward edge linkage from node1 to node2 and the corresponding
+ // backward edge linkage in the other direction.
+ bool add_new_edge(NODE_REF node1, NODE_REF node2,
+ bool repeats, bool word_end, UNICHAR_ID unichar_id) {
+ return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
+ word_end, unichar_id) &&
+ add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
+ word_end, unichar_id));
+ }
+
+ // Sets the word ending flags in an already existing edge pair.
+ // The edge record is updated in place via the given pointer.
+ void add_word_ending(EDGE_RECORD *edge,
+ NODE_REF the_next_node,
+ bool repeats,
+ UNICHAR_ID unichar_id);
+
+ // Allocates space for a new node in the Trie.
+ NODE_REF new_dawg_node();
+
+ // Removes a single edge linkage between node1 and node2 in the
+ // direction indicated by direction argument.
+ void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
+ bool word_end, UNICHAR_ID unichar_id);
+
+ // Removes forward edge linkage from node1 to node2 and the corresponding
+ // backward edge linkage in the other direction.
+ void remove_edge(NODE_REF node1, NODE_REF node2,
+ bool word_end, UNICHAR_ID unichar_id) {
+ remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
+ remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
+ }
+
+ // Compares edge1 and edge2 in the given node to see if they point to two
+ // next nodes that could be collapsed. If they do, performs the reduction
+ // and returns true.
+ bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
+ const EDGE_RECORD &edge2);
+
+ // Assuming that edge_index indicates the first edge in a group of edges
+ // in this node with a particular letter value, looks through these edges
+ // to see if any of them can be collapsed. If so does it. Returns to the
+ // caller when all edges with this letter have been reduced.
+ // Returns true if further reduction is possible with this same letter.
+ bool reduce_lettered_edges(EDGE_INDEX edge_index,
+ UNICHAR_ID unichar_id,
+ NODE_REF node,
+ EDGE_VECTOR* backward_edges,
+ NODE_MARKER reduced_nodes);
+
+ /**
+ * Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in
+ * increasing order of unichar ids. This function is normally called
+ * for all edges in a single node; the edges are copied into a vector of
+ * (unichar id, edge record) key/data pairs, which is then sorted.
+ */
+ void sort_edges(EDGE_VECTOR *edges);
+
+ /** Eliminates any redundant edges from this node in the Trie. */
+ void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
+
+ // Returns the pattern unichar id for the given character class code.
+ UNICHAR_ID character_class_to_pattern(char ch);
+
+ // Member variables
+ TRIE_NODES nodes_; // vector of nodes in the Trie
+ // Freelist of edges in the root backwards node that were previously zeroed.
+ GenericVector<EDGE_INDEX> root_back_freelist_;
+ uint64_t num_edges_; // sum of all edges (forward and backward)
+ uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction
+ uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index
+ // Variables for translating character class codes denoted in user patterns
+ // file to the unichar ids used to represent them in a Trie.
+ UNICHAR_ID alpha_pattern_;
+ UNICHAR_ID digit_pattern_;
+ UNICHAR_ID alphanum_pattern_;
+ UNICHAR_ID punc_pattern_;
+ UNICHAR_ID lower_pattern_;
+ UNICHAR_ID upper_pattern_;
+ bool initialized_patterns_;
+};
+
+} // namespace tesseract
+
+#endif