From 5ff1d6955496b3cf9a35042c9ac35db43bc336b1 Mon Sep 17 00:00:00 2001 From: Thomas Deutschmann Date: Tue, 30 Mar 2021 10:59:39 +0200 Subject: Import Ghostscript 9.54 Signed-off-by: Thomas Deutschmann --- tesseract/src/dict/context.cpp | 76 ++++ tesseract/src/dict/dawg.cpp | 417 ++++++++++++++++++ tesseract/src/dict/dawg.h | 554 ++++++++++++++++++++++++ tesseract/src/dict/dawg_cache.cpp | 96 +++++ tesseract/src/dict/dawg_cache.h | 53 +++ tesseract/src/dict/dict.cpp | 888 ++++++++++++++++++++++++++++++++++++++ tesseract/src/dict/dict.h | 651 ++++++++++++++++++++++++++++ tesseract/src/dict/hyphen.cpp | 61 +++ tesseract/src/dict/matchdefs.h | 50 +++ tesseract/src/dict/permdawg.cpp | 390 +++++++++++++++++ tesseract/src/dict/stopper.cpp | 515 ++++++++++++++++++++++ tesseract/src/dict/stopper.h | 50 +++ tesseract/src/dict/trie.cpp | 716 ++++++++++++++++++++++++++++++ tesseract/src/dict/trie.h | 432 +++++++++++++++++++ 14 files changed, 4949 insertions(+) create mode 100644 tesseract/src/dict/context.cpp create mode 100644 tesseract/src/dict/dawg.cpp create mode 100644 tesseract/src/dict/dawg.h create mode 100644 tesseract/src/dict/dawg_cache.cpp create mode 100644 tesseract/src/dict/dawg_cache.h create mode 100644 tesseract/src/dict/dict.cpp create mode 100644 tesseract/src/dict/dict.h create mode 100644 tesseract/src/dict/hyphen.cpp create mode 100644 tesseract/src/dict/matchdefs.h create mode 100644 tesseract/src/dict/permdawg.cpp create mode 100644 tesseract/src/dict/stopper.cpp create mode 100644 tesseract/src/dict/stopper.h create mode 100644 tesseract/src/dict/trie.cpp create mode 100644 tesseract/src/dict/trie.h (limited to 'tesseract/src/dict') diff --git a/tesseract/src/dict/context.cpp b/tesseract/src/dict/context.cpp new file mode 100644 index 00000000..93cff5ff --- /dev/null +++ b/tesseract/src/dict/context.cpp @@ -0,0 +1,76 @@ +/****************************************************************************** + * + * File: context.cpp (Formerly context.c) + * 
Description: Context checking functions + * Author: Mark Seaman, OCR Technology + * + * (c) Copyright 1990, Hewlett-Packard Company. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *****************************************************************************/ + +#include "dict.h" +#include "unicharset.h" + +namespace tesseract { + +static const int kMinAbsoluteGarbageWordLength = 10; +static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f; + +const int case_state_table[6][4] = { + {/* 0. Beginning of word */ + /* P U L D */ + /* -1. Error on case */ + 0, 1, 5, 4}, + {/* 1. After initial capital */ + 0, 3, 2, 4}, + {/* 2. After lower case */ + 0, -1, 2, -1}, + {/* 3. After upper case */ + 0, 3, -1, 4}, + {/* 4. After a digit */ + 0, -1, -1, 4}, + {/* 5. 
After initial lower case */ + 5, -1, 2, -1}, +}; + +int Dict::case_ok(const WERD_CHOICE &word) const { + int state = 0; + int x; + const UNICHARSET* unicharset = word.unicharset(); + for (x = 0; x < word.length(); ++x) { + UNICHAR_ID ch_id = word.unichar_id(x); + if (unicharset->get_isupper(ch_id)) + state = case_state_table[state][1]; + else if (unicharset->get_islower(ch_id)) + state = case_state_table[state][2]; + else if (unicharset->get_isdigit(ch_id)) + state = case_state_table[state][3]; + else + state = case_state_table[state][0]; + if (state == -1) return false; + } + return state != 5; // single lower is bad +} + +bool Dict::absolute_garbage(const WERD_CHOICE &word, + const UNICHARSET &unicharset) { + if (word.length() < kMinAbsoluteGarbageWordLength) return false; + int num_alphanum = 0; + for (int x = 0; x < word.length(); ++x) { + num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) || + unicharset.get_isdigit(word.unichar_id(x))); + } + return (static_cast(num_alphanum) / + static_cast(word.length()) < kMinAbsoluteGarbageAlphanumFrac); +} + +} // namespace tesseract diff --git a/tesseract/src/dict/dawg.cpp b/tesseract/src/dict/dawg.cpp new file mode 100644 index 00000000..aa339727 --- /dev/null +++ b/tesseract/src/dict/dawg.cpp @@ -0,0 +1,417 @@ +/******************************************************************************** + * + * File: dawg.cpp (Formerly dawg.c) + * Description: Use a Directed Acyclic Word Graph + * Author: Mark Seaman, OCR Technology + * + * (c) Copyright 1987, Hewlett-Packard Company. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *********************************************************************************/ +/*---------------------------------------------------------------------- + I n c l u d e s +----------------------------------------------------------------------*/ + +#include "dawg.h" + +#include "dict.h" +#include "helpers.h" +#include "strngs.h" +#include "tprintf.h" + +#include + +/*---------------------------------------------------------------------- + F u n c t i o n s f o r D a w g +----------------------------------------------------------------------*/ +namespace tesseract { + +// Destructor. +// It is defined here, so the compiler can create a single vtable +// instead of weak vtables in every compilation unit. +Dawg::~Dawg() = default; + +bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, + bool requires_complete) const { + if (word.length() == 0) return !requires_complete; + NODE_REF node = 0; + int end_index = word.length() - 1; + for (int i = 0; i < end_index; i++) { + EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); + if (edge == NO_EDGE) { + return false; + } + if ((node = next_node(edge)) == 0) { + // This only happens if all words following this edge terminate -- + // there are no larger words. See Trie::add_word_to_dawg() + return false; + } + } + // Now check the last character. 
+ return edge_char_of(node, word.unichar_id(end_index), requires_complete) != + NO_EDGE; +} + +bool Dawg::word_in_dawg(const WERD_CHOICE &word) const { + return prefix_in_dawg(word, true); +} + +int Dawg::check_for_words(const char *filename, + const UNICHARSET &unicharset, + bool enable_wildcard) const { + if (filename == nullptr) return 0; + + FILE *word_file; + char string [CHARS_PER_LINE]; + int misses = 0; + UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); + + word_file = fopen(filename, "r"); + if (word_file == nullptr) { + tprintf("Error: Could not open file %s\n", filename); + ASSERT_HOST(word_file); + } + + while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) { + chomp_string(string); // remove newline + WERD_CHOICE word(string, unicharset); + if (word.length() > 0 && + !word.contains_unichar_id(INVALID_UNICHAR_ID)) { + if (!match_words(&word, 0, 0, + enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { + tprintf("Missing word: %s\n", string); + ++misses; + } + } else { + tprintf("Failed to create a valid word from %s\n", string); + } + } + fclose (word_file); + // Make sure the user sees this with fprintf instead of tprintf. 
+ if (debug_level_) tprintf("Number of lost words=%d\n", misses); + return misses; +} + +void Dawg::iterate_words(const UNICHARSET& unicharset, + std::function cb) const { + WERD_CHOICE word(&unicharset); + iterate_words_rec(word, 0, cb); +} + +static void CallWithUTF8(std::function cb, + const WERD_CHOICE* wc) { + STRING s; + wc->string_and_lengths(&s, nullptr); + cb(s.c_str()); +} + +void Dawg::iterate_words(const UNICHARSET& unicharset, + std::function cb) const { + using namespace std::placeholders; // for _1 + std::function shim( + std::bind(CallWithUTF8, cb, _1)); + WERD_CHOICE word(&unicharset); + iterate_words_rec(word, 0, shim); +} + +void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far, + NODE_REF to_explore, + std::function cb) const { + NodeChildVector children; + this->unichar_ids_of(to_explore, &children, false); + for (int i = 0; i < children.size(); i++) { + WERD_CHOICE next_word(word_so_far); + next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0); + if (this->end_of_word(children[i].edge_ref)) { + cb(&next_word); + } + NODE_REF next = next_node(children[i].edge_ref); + if (next != 0) { + iterate_words_rec(next_word, next, cb); + } + } +} + +bool Dawg::match_words(WERD_CHOICE *word, int32_t index, + NODE_REF node, UNICHAR_ID wildcard) const { + EDGE_REF edge; + int32_t word_end; + + if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) { + bool any_matched = false; + NodeChildVector vec; + this->unichar_ids_of(node, &vec, false); + for (int i = 0; i < vec.size(); ++i) { + word->set_unichar_id(vec[i].unichar_id, index); + if (match_words(word, index, node, wildcard)) + any_matched = true; + } + word->set_unichar_id(wildcard, index); + return any_matched; + } else { + word_end = index == word->length() - 1; + edge = edge_char_of(node, word->unichar_id(index), word_end); + if (edge != NO_EDGE) { // normal edge in DAWG + node = next_node(edge); + if (word_end) { + if (debug_level_ > 1) word->print("match_words() 
found: "); + return true; + } else if (node != 0) { + return match_words(word, index+1, node, wildcard); + } + } + } + return false; +} + +void Dawg::init(int unicharset_size) { + ASSERT_HOST(unicharset_size > 0); + unicharset_size_ = unicharset_size; + // Set bit masks. We will use the value unicharset_size_ as a null char, so + // the actual number of unichars is unicharset_size_ + 1. + flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0)); + next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS; + letter_mask_ = ~(~0ull << flag_start_bit_); + next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS); + flags_mask_ = ~(letter_mask_ | next_node_mask_); +} + + +/*---------------------------------------------------------------------- + F u n c t i o n s f o r S q u i s h e d D a w g +----------------------------------------------------------------------*/ + +SquishedDawg::~SquishedDawg() { delete[] edges_; } + +EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, + UNICHAR_ID unichar_id, + bool word_end) const { + EDGE_REF edge = node; + if (node == 0) { // binary search + EDGE_REF start = 0; + EDGE_REF end = num_forward_edges_in_node0 - 1; + int compare; + while (start <= end) { + edge = (start + end) >> 1; // (start + end) / 2 + compare = given_greater_than_edge_rec(NO_EDGE, word_end, + unichar_id, edges_[edge]); + if (compare == 0) { // given == vec[k] + return edge; + } else if (compare == 1) { // given > vec[k] + start = edge + 1; + } else { // given < vec[k] + end = edge - 1; + } + } + } else { // linear search + if (edge != NO_EDGE && edge_occupied(edge)) { + do { + if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) && + (!word_end || end_of_word_from_edge_rec(edges_[edge]))) + return (edge); + } while (!last_edge(edge++)); + } + } + return (NO_EDGE); // not found +} + +int32_t SquishedDawg::num_forward_edges(NODE_REF node) const { + EDGE_REF edge = node; + int32_t num = 0; + + if (forward_edge (edge)) { + do { + num++; + } while 
(!last_edge(edge++)); + } + + return (num); +} + +void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const { + if (node == NO_EDGE) return; // nothing to print + + EDGE_REF edge = node; + const char *forward_string = "FORWARD"; + const char *backward_string = " "; + + const char *last_string = "LAST"; + const char *not_last_string = " "; + + const char *eow_string = "EOW"; + const char *not_eow_string = " "; + + const char *direction; + const char *is_last; + const char *eow; + + UNICHAR_ID unichar_id; + + if (edge_occupied(edge)) { + do { + direction = + forward_edge(edge) ? forward_string : backward_string; + is_last = last_edge(edge) ? last_string : not_last_string; + eow = end_of_word(edge) ? eow_string : not_eow_string; + + unichar_id = edge_letter(edge); + tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n", + edge, next_node(edge), unichar_id, + direction, is_last, eow); + + if (edge - node > max_num_edges) return; + } while (!last_edge(edge++)); + + if (edge < num_edges_ && + edge_occupied(edge) && backward_edge(edge)) { + do { + direction = + forward_edge(edge) ? forward_string : backward_string; + is_last = last_edge(edge) ? last_string : not_last_string; + eow = end_of_word(edge) ? eow_string : not_eow_string; + + unichar_id = edge_letter(edge); + tprintf(REFFORMAT " : next = " REFFORMAT + ", unichar_id = %d, %s %s %s\n", + edge, next_node(edge), unichar_id, + direction, is_last, eow); + + if (edge - node > MAX_NODE_EDGES_DISPLAY) return; + } while (!last_edge(edge++)); + } + } + else { + tprintf(REFFORMAT " : no edges in this node\n", node); + } + tprintf("\n"); +} + +void SquishedDawg::print_edge(EDGE_REF edge) const { + if (edge == NO_EDGE) { + tprintf("NO_EDGE\n"); + } else { + tprintf(REFFORMAT " : next = " REFFORMAT + ", unichar_id = '%d', %s %s %s\n", edge, + next_node(edge), edge_letter(edge), + (forward_edge(edge) ? "FORWARD" : " "), + (last_edge(edge) ? "LAST" : " "), + (end_of_word(edge) ? 
"EOW" : "")); + } +} + +bool SquishedDawg::read_squished_dawg(TFile *file) { + if (debug_level_) tprintf("Reading squished dawg\n"); + + // Read the magic number and check that it matches kDawgMagicNumber, as + // auto-endian fixing should make sure it is always correct. + int16_t magic; + if (!file->DeSerialize(&magic)) return false; + if (magic != kDawgMagicNumber) { + tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber); + return false; + } + + int32_t unicharset_size; + if (!file->DeSerialize(&unicharset_size)) return false; + if (!file->DeSerialize(&num_edges_)) return false; + ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty + Dawg::init(unicharset_size); + + edges_ = new EDGE_RECORD[num_edges_]; + if (!file->DeSerialize(&edges_[0], num_edges_)) return false; + if (debug_level_ > 2) { + tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", + type_, lang_.c_str(), perm_, unicharset_size_, num_edges_); + for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge); + } + return true; +} + +std::unique_ptr SquishedDawg::build_node_map( + int32_t *num_nodes) const { + EDGE_REF edge; + std::unique_ptr node_map(new EDGE_REF[num_edges_]); + int32_t node_counter; + int32_t num_edges; + + for (edge = 0; edge < num_edges_; edge++) // init all slots + node_map[edge] = -1; + + node_counter = num_forward_edges(0); + + *num_nodes = 0; + for (edge = 0; edge < num_edges_; edge++) { // search all slots + + if (forward_edge(edge)) { + (*num_nodes)++; // count nodes links + node_map[edge] = (edge ? 
node_counter : 0); + num_edges = num_forward_edges(edge); + if (edge != 0) node_counter += num_edges; + edge += num_edges; + if (edge >= num_edges_) break; + if (backward_edge(edge)) while (!last_edge(edge++)); + edge--; + } + } + return node_map; +} + +bool SquishedDawg::write_squished_dawg(TFile *file) { + EDGE_REF edge; + int32_t num_edges; + int32_t node_count = 0; + EDGE_REF old_index; + EDGE_RECORD temp_record; + + if (debug_level_) tprintf("write_squished_dawg\n"); + + std::unique_ptr node_map(build_node_map(&node_count)); + + // Write the magic number to help detecting a change in endianness. + int16_t magic = kDawgMagicNumber; + if (!file->Serialize(&magic)) return false; + if (!file->Serialize(&unicharset_size_)) return false; + + // Count the number of edges in this Dawg. + num_edges = 0; + for (edge=0; edge < num_edges_; edge++) + if (forward_edge(edge)) + num_edges++; + + // Write edge count to file. + if (!file->Serialize(&num_edges)) return false; + + if (debug_level_) { + tprintf("%d nodes in DAWG\n", node_count); + tprintf("%d edges in DAWG\n", num_edges); + } + + for (edge = 0; edge < num_edges_; edge++) { + if (forward_edge(edge)) { // write forward edges + do { + old_index = next_node_from_edge_rec(edges_[edge]); + set_next_node(edge, node_map[old_index]); + temp_record = edges_[edge]; + if (!file->Serialize(&temp_record)) return false; + set_next_node(edge, old_index); + } while (!last_edge(edge++)); + + if (edge >= num_edges_) break; + if (backward_edge(edge)) // skip back links + while (!last_edge(edge++)); + + edge--; + } + } + return true; +} + +} // namespace tesseract diff --git a/tesseract/src/dict/dawg.h b/tesseract/src/dict/dawg.h new file mode 100644 index 00000000..26d76d1e --- /dev/null +++ b/tesseract/src/dict/dawg.h @@ -0,0 +1,554 @@ +/****************************************************************************** + * + * File: dawg.h + * Description: Definition of a class that represents Directed Acyclic Word + * Graph (DAWG), 
functions to build and manipulate the DAWG. + * Author: Mark Seaman, SW Productivity + * + * (c) Copyright 1987, Hewlett-Packard Company. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *****************************************************************************/ + +#ifndef DICT_DAWG_H_ +#define DICT_DAWG_H_ + +/*---------------------------------------------------------------------- + I n c l u d e s +----------------------------------------------------------------------*/ + +#include // for PRId64 +#include // for std::function +#include +#include "elst.h" +#include "params.h" +#include "ratngs.h" + +#ifndef __GNUC__ +#ifdef _WIN32 +#define NO_EDGE (int64_t) 0xffffffffffffffffi64 +#endif /*_WIN32*/ +#else +#define NO_EDGE (int64_t) 0xffffffffffffffffll +#endif /*__GNUC__*/ + +namespace tesseract { + +class UNICHARSET; + +using EDGE_RECORD = uint64_t; +using EDGE_ARRAY = EDGE_RECORD *; +using EDGE_REF = int64_t; +using NODE_REF = int64_t; +using NODE_MAP = EDGE_REF *; + +struct NodeChild { + UNICHAR_ID unichar_id; + EDGE_REF edge_ref; + NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {} + NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} +}; + +using NodeChildVector = GenericVector; +using SuccessorList = GenericVector; +using SuccessorListsVector = GenericVector; + +enum DawgType { + DAWG_TYPE_PUNCTUATION, + DAWG_TYPE_WORD, + DAWG_TYPE_NUMBER, + DAWG_TYPE_PATTERN, + + DAWG_TYPE_COUNT // number of enum entries +}; + 
+/*---------------------------------------------------------------------- + C o n s t a n t s +----------------------------------------------------------------------*/ + +#define FORWARD_EDGE (int32_t) 0 +#define BACKWARD_EDGE (int32_t) 1 +#define MAX_NODE_EDGES_DISPLAY (int64_t) 100 +#define MARKER_FLAG (int64_t) 1 +#define DIRECTION_FLAG (int64_t) 2 +#define WERD_END_FLAG (int64_t) 4 +#define LETTER_START_BIT 0 +#define NUM_FLAG_BITS 3 +#define REFFORMAT "%" PRId64 + +static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = { + { false, true, true, false }, // for DAWG_TYPE_PUNCTUATION + { true, false, false, false }, // for DAWG_TYPE_WORD + { true, false, false, false }, // for DAWG_TYPE_NUMBER + { false, false, false, false }, // for DAWG_TYPE_PATTERN +}; + +static const char kWildcard[] = "*"; + + +/*---------------------------------------------------------------------- + C l a s s e s a n d S t r u c t s +----------------------------------------------------------------------*/ +// +/// Abstract class (an interface) that declares methods needed by the +/// various tesseract classes to operate on SquishedDawg and Trie objects. +/// +/// This class initializes all the edge masks (since their usage by +/// SquishedDawg and Trie is identical) and implements simple accessors +/// for each of the fields encoded in an EDGE_RECORD. +/// This class also implements word_in_dawg() and check_for_words() +/// (since they use only the public methods of SquishedDawg and Trie +/// classes that are inherited from the Dawg base class). +// +class TESS_API Dawg { + public: + /// Magic number to determine endianness when reading the Dawg from file. + static const int16_t kDawgMagicNumber = 42; + /// A special unichar id that indicates that any appropriate pattern + /// (e.g.dicitonary word, 0-9 digit, etc) can be inserted instead + /// Used for expressing patterns in punctuation and number Dawgs. 
+ static const UNICHAR_ID kPatternUnicharID = 0; + + inline DawgType type() const { return type_; } + inline const STRING &lang() const { return lang_; } + inline PermuterType permuter() const { return perm_; } + + virtual ~Dawg(); + + /// Returns true if the given word is in the Dawg. + bool word_in_dawg(const WERD_CHOICE &word) const; + + // Returns true if the given word prefix is not contraindicated by the dawg. + // If requires_complete is true, then the exact complete word must be present. + bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const; + + /// Checks the Dawg for the words that are listed in the requested file. + /// Returns the number of words in the given file missing from the Dawg. + int check_for_words(const char *filename, + const UNICHARSET &unicharset, + bool enable_wildcard) const; + + // For each word in the Dawg, call the given (permanent) callback with the + // text (UTF-8) version of the word. + void iterate_words(const UNICHARSET& unicharset, + std::function cb) const; + + // For each word in the Dawg, call the given (permanent) callback with the + // text (UTF-8) version of the word. + void iterate_words(const UNICHARSET& unicharset, + std::function cb) const; + + // Pure virtual function that should be implemented by the derived classes. + + /// Returns the edge that corresponds to the letter out of this node. + virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, + bool word_end) const = 0; + + /// Fills the given NodeChildVector with all the unichar ids (and the + /// corresponding EDGE_REFs) for which there is an edge out of this node. + virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, + bool word_end) const = 0; + + /// Returns the next node visited by following the edge + /// indicated by the given EDGE_REF. + virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0; + + /// Returns true if the edge indicated by the given EDGE_REF + /// marks the end of a word. 
+ virtual bool end_of_word(EDGE_REF edge_ref) const = 0; + + /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. + virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0; + + /// Prints the contents of the node indicated by the given NODE_REF. + /// At most max_num_edges will be printed. + virtual void print_node(NODE_REF node, int max_num_edges) const = 0; + + /// Fills vec with unichar ids that represent the character classes + /// of the given unichar_id. + virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, + const UNICHARSET &unicharset, + GenericVector *vec) const { + (void)unichar_id; + (void)unicharset; + (void)vec; + } + + /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has + /// a self loop and the given unichar_id matches the unichar_id stored in the + /// EDGE_RECORD, returns NO_EDGE otherwise. + virtual EDGE_REF pattern_loop_edge( + EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const { + (void)edge_ref; + (void)unichar_id; + (void)word_end; + return false; + } + + protected: + Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level) + : lang_(lang), + type_(type), + perm_(perm), + unicharset_size_(0), + debug_level_(debug_level) {} + + /// Returns the next node visited by following this edge. + inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { + return ((edge_rec & next_node_mask_) >> next_node_start_bit_); + } + /// Returns the marker flag of this edge. + inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const { + return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0; + } + /// Returns the direction flag of this edge. + inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const { + return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? + BACKWARD_EDGE : FORWARD_EDGE; + } + /// Returns true if this edge marks the end of a word. 
+ inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const { + return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0; + } + /// Returns UNICHAR_ID recorded in this edge. + inline UNICHAR_ID unichar_id_from_edge_rec( + const EDGE_RECORD &edge_rec) const { + return ((edge_rec & letter_mask_) >> LETTER_START_BIT); + } + /// Sets the next node link for this edge in the Dawg. + inline void set_next_node_in_edge_rec( + EDGE_RECORD *edge_rec, EDGE_REF value) { + *edge_rec &= (~next_node_mask_); + *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_); + } + /// Sets this edge record to be the last one in a sequence of edges. + inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) { + *edge_rec |= (MARKER_FLAG << flag_start_bit_); + } + /// Sequentially compares the given values of unichar ID, next node + /// and word end marker with the values in the given EDGE_RECORD. + /// Returns: 1 if at any step the given input value exceeds + /// that of edge_rec (and all the values already + /// checked are the same) + /// 0 if edge_rec_match() returns true + /// -1 otherwise + inline int given_greater_than_edge_rec(NODE_REF next_node, + bool word_end, + UNICHAR_ID unichar_id, + const EDGE_RECORD &edge_rec) const { + UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec); + NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec); + bool curr_word_end = end_of_word_from_edge_rec(edge_rec); + if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node, + curr_word_end, curr_unichar_id)) return 0; + if (unichar_id > curr_unichar_id) return 1; + if (unichar_id == curr_unichar_id) { + if (next_node > curr_next_node) return 1; + if (next_node == curr_next_node) { + if (word_end > curr_word_end) return 1; + } + } + return -1; + } + /// Returns true if all the values are equal (any value matches + /// next_node if next_node == NO_EDGE, any value matches word_end + /// if word_end is false). 
+ inline bool edge_rec_match(NODE_REF next_node, + bool word_end, + UNICHAR_ID unichar_id, + NODE_REF other_next_node, + bool other_word_end, + UNICHAR_ID other_unichar_id) const { + return ((unichar_id == other_unichar_id) && + (next_node == NO_EDGE || next_node == other_next_node) && + (!word_end || (word_end == other_word_end))); + } + + /// Sets unicharset_size_. + /// Initializes the values of various masks from unicharset_size_. + void init(int unicharset_size); + + /// Matches all of the words that are represented by this string. + /// If wildcard is set to something other than INVALID_UNICHAR_ID, + /// the *'s in this string are interpreted as wildcards. + /// WERD_CHOICE param is not passed by const so that wildcard searches + /// can modify it and work without having to copy WERD_CHOICEs. + bool match_words(WERD_CHOICE *word, int32_t index, + NODE_REF node, UNICHAR_ID wildcard) const; + + // Recursively iterate over all words in a dawg (see public iterate_words). + void iterate_words_rec(const WERD_CHOICE& word_so_far, + NODE_REF to_explore, + std::function cb) const; + + // Member Variables. + STRING lang_; + DawgType type_; + /// Permuter code that should be used if the word is found in this Dawg. + PermuterType perm_; + // Variables to construct various edge masks. Formerly: + // #define NEXT_EDGE_MASK (int64_t) 0xfffffff800000000i64 + // #define FLAGS_MASK (int64_t) 0x0000000700000000i64 + // #define LETTER_MASK (int64_t) 0x00000000ffffffffi64 + uint64_t next_node_mask_ = 0; + uint64_t flags_mask_ = 0; + uint64_t letter_mask_ = 0; + int unicharset_size_; + int flag_start_bit_ = 0; + int next_node_start_bit_ = 0; + // Level of debug statements to print to stdout. + int debug_level_; +}; + +// +// DawgPosition keeps track of where we are in the primary dawg we're searching +// as well as where we may be in the "punctuation dawg" which may provide +// surrounding context. 
+// +// Example: +// punctuation dawg -- space is the "pattern character" +// " " // no punctuation +// "' '" // leading and trailing apostrophes +// " '" // trailing apostrophe +// word dawg: +// "cat" +// "cab" +// "cat's" +// +// DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp) +// +// DawgPosition(-1, NO_EDGE, p, pe, false) +// We're in the punctuation dawg, no other dawg has been started. +// (1) If there's a pattern edge as a punc dawg child of us, +// for each punc-following dawg starting with ch, produce: +// Result: DawgPosition(k, w, p', false) +// (2) If there's a valid continuation in the punc dawg, produce: +// Result: DawgPosition(-k, NO_EDGE, p', false) +// +// DawgPosition(k, w, -1, NO_EDGE, false) +// We're in dawg k. Going back to punctuation dawg is not an option. +// Follow ch in dawg k. +// +// DawgPosition(k, w, p, pe, false) +// We're in dawg k. Continue in dawg k and/or go back to the punc dawg. +// If ending, check that the punctuation dawg is also ok to end here. +// +// DawgPosition(k, w, p, pe true) +// We're back in the punctuation dawg. Continuing there is the only option. +struct DawgPosition { + DawgPosition() = default; + DawgPosition(int dawg_idx, EDGE_REF dawgref, + int punc_idx, EDGE_REF puncref, + bool backtopunc) + : dawg_ref(dawgref), punc_ref(puncref), + dawg_index(dawg_idx), punc_index(punc_idx), + back_to_punc(backtopunc) { + } + bool operator==(const DawgPosition &other) { + return dawg_index == other.dawg_index && + dawg_ref == other.dawg_ref && + punc_index == other.punc_index && + punc_ref == other.punc_ref && + back_to_punc == other.back_to_punc; + } + + EDGE_REF dawg_ref = NO_EDGE; + EDGE_REF punc_ref = NO_EDGE; + int8_t dawg_index = -1; + int8_t punc_index = -1; + // Have we returned to the punc dawg at the end of the word? + bool back_to_punc = false; +}; + +class DawgPositionVector : public GenericVector { + public: + /// Adds an entry for the given dawg_index with the given node to the vec. 
+ /// Returns false if the same entry already exists in the vector, + /// true otherwise. + inline bool add_unique(const DawgPosition &new_pos, + bool debug, + const char *debug_msg) { + for (int i = 0; i < size(); ++i) { + if (data_[i] == new_pos) return false; + } + push_back(new_pos); + if (debug) { + tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n", + debug_msg, new_pos.dawg_index, new_pos.dawg_ref, + new_pos.punc_ref, new_pos.back_to_punc ? " returned" : ""); + } + return true; + } +}; + +// +/// Concrete class that can operate on a compacted (squished) Dawg (read, +/// search and write to file). This class is read-only in the sense that +/// new words can not be added to an instance of SquishedDawg. +/// The underlying representation of the nodes and edges in SquishedDawg +/// is stored as a contiguous EDGE_ARRAY (read from file or given as an +/// argument to the constructor). +// +class TESS_API SquishedDawg : public Dawg { + public: + SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, + int debug_level) + : Dawg(type, lang, perm, debug_level) {} + SquishedDawg(const char *filename, DawgType type, const STRING &lang, + PermuterType perm, int debug_level) + : Dawg(type, lang, perm, debug_level) { + TFile file; + ASSERT_HOST(file.Open(filename, nullptr)); + ASSERT_HOST(read_squished_dawg(&file)); + num_forward_edges_in_node0 = num_forward_edges(0); + } + SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, + const STRING &lang, PermuterType perm, int unicharset_size, + int debug_level) + : Dawg(type, lang, perm, debug_level), + edges_(edges), + num_edges_(num_edges) { + init(unicharset_size); + num_forward_edges_in_node0 = num_forward_edges(0); + if (debug_level > 3) print_all("SquishedDawg:"); + } + ~SquishedDawg() override; + + // Loads using the given TFile. Returns false on failure. 
+ bool Load(TFile *fp) { + if (!read_squished_dawg(fp)) return false; + num_forward_edges_in_node0 = num_forward_edges(0); + return true; + } + + int NumEdges() { return num_edges_; } + + /// Returns the edge that corresponds to the letter out of this node. + EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, + bool word_end) const override; + + /// Fills the given NodeChildVector with all the unichar ids (and the + /// corresponding EDGE_REFs) for which there is an edge out of this node. + void unichar_ids_of(NODE_REF node, NodeChildVector *vec, + bool word_end) const override { + EDGE_REF edge = node; + if (!edge_occupied(edge) || edge == NO_EDGE) return; + assert(forward_edge(edge)); // we don't expect any backward edges to + do { // be present when this function is called + if (!word_end || end_of_word_from_edge_rec(edges_[edge])) { + vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); + } + } while (!last_edge(edge++)); + } + + /// Returns the next node visited by following the edge + /// indicated by the given EDGE_REF. + NODE_REF next_node(EDGE_REF edge) const override { + return next_node_from_edge_rec((edges_[edge])); + } + + /// Returns true if the edge indicated by the given EDGE_REF + /// marks the end of a word. + bool end_of_word(EDGE_REF edge_ref) const override { + return end_of_word_from_edge_rec((edges_[edge_ref])); + } + + /// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. + UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override { + return unichar_id_from_edge_rec((edges_[edge_ref])); + } + + /// Prints the contents of the node indicated by the given NODE_REF. + /// At most max_num_edges will be printed. + void print_node(NODE_REF node, int max_num_edges) const override; + + /// Writes the squished/reduced Dawg to a file. + bool write_squished_dawg(TFile *file); + + /// Opens the file with the given filename and writes the + /// squished/reduced Dawg to the file. 
+ bool write_squished_dawg(const char *filename) { + TFile file; + file.OpenWrite(nullptr); + if (!this->write_squished_dawg(&file)) { + tprintf("Error serializing %s\n", filename); + return false; + } + if (!file.CloseWrite(filename, nullptr)) { + tprintf("Error writing file %s\n", filename); + return false; + } + return true; + } + + private: + /// Sets the next node link for this edge. + inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) { + set_next_node_in_edge_rec(&(edges_[edge_ref]), value); + } + /// Sets the edge to be empty. + inline void set_empty_edge(EDGE_REF edge_ref) { + (edges_[edge_ref] = next_node_mask_); + } + /// Goes through all the edges and clears each one out. + inline void clear_all_edges() { + for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge); + } + /// Clears the last flag of this edge. + inline void clear_marker_flag(EDGE_REF edge_ref) { + (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_)); + } + /// Returns true if this edge is in the forward direction. + inline bool forward_edge(EDGE_REF edge_ref) const { + return (edge_occupied(edge_ref) && + (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); + } + /// Returns true if this edge is in the backward direction. + inline bool backward_edge(EDGE_REF edge_ref) const { + return (edge_occupied(edge_ref) && + (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); + } + /// Returns true if the edge spot in this location is occupied. + inline bool edge_occupied(EDGE_REF edge_ref) const { + return (edges_[edge_ref] != next_node_mask_); + } + /// Returns true if this edge is the last edge in a sequence. + inline bool last_edge(EDGE_REF edge_ref) const { + return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0; + } + + /// Counts and returns the number of forward edges in this node. + int32_t num_forward_edges(NODE_REF node) const; + + /// Reads SquishedDawg from a file. 
+ bool read_squished_dawg(TFile *file); + + /// Prints the contents of an edge indicated by the given EDGE_REF. + void print_edge(EDGE_REF edge) const; + + /// Prints the contents of the SquishedDawg. + void print_all(const char* msg) { + tprintf("\n__________________________\n%s\n", msg); + for (int i = 0; i < num_edges_; ++i) print_edge(i); + tprintf("__________________________\n"); + } + /// Constructs a mapping from the memory node indices to disk node indices. + std::unique_ptr build_node_map(int32_t *num_nodes) const; + + // Member variables. + EDGE_ARRAY edges_ = nullptr; + int32_t num_edges_ = 0; + int num_forward_edges_in_node0 = 0; +}; + +} // namespace tesseract + +#endif // DICT_DAWG_H_ diff --git a/tesseract/src/dict/dawg_cache.cpp b/tesseract/src/dict/dawg_cache.cpp new file mode 100644 index 00000000..0ade27e6 --- /dev/null +++ b/tesseract/src/dict/dawg_cache.cpp @@ -0,0 +1,96 @@ +/////////////////////////////////////////////////////////////////////// +// File: dawg_cache.cpp +// Description: A class that knows about loading and caching dawgs. +// Author: David Eger +// +// (C) Copyright 2012, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#include "dawg_cache.h" + +#include "dawg.h" +#include "object_cache.h" +#include "strngs.h" +#include "tessdatamanager.h" + +namespace tesseract { + +struct DawgLoader { + DawgLoader(const STRING &lang, TessdataType tessdata_dawg_type, + int dawg_debug_level, TessdataManager *data_file) + : lang_(lang), + data_file_(data_file), + tessdata_dawg_type_(tessdata_dawg_type), + dawg_debug_level_(dawg_debug_level) {} + + Dawg *Load(); + + STRING lang_; + TessdataManager *data_file_; + TessdataType tessdata_dawg_type_; + int dawg_debug_level_; +}; + +Dawg *DawgCache::GetSquishedDawg(const STRING &lang, + TessdataType tessdata_dawg_type, + int debug_level, TessdataManager *data_file) { + std::string data_id = data_file->GetDataFileName(); + data_id += kTessdataFileSuffixes[tessdata_dawg_type]; + DawgLoader loader(lang, tessdata_dawg_type, debug_level, data_file); + return dawgs_.Get(data_id, std::bind(&DawgLoader::Load, &loader)); +} + +Dawg *DawgLoader::Load() { + TFile fp; + if (!data_file_->GetComponent(tessdata_dawg_type_, &fp)) return nullptr; + DawgType dawg_type; + PermuterType perm_type; + switch (tessdata_dawg_type_) { + case TESSDATA_PUNC_DAWG: + case TESSDATA_LSTM_PUNC_DAWG: + dawg_type = DAWG_TYPE_PUNCTUATION; + perm_type = PUNC_PERM; + break; + case TESSDATA_SYSTEM_DAWG: + case TESSDATA_LSTM_SYSTEM_DAWG: + dawg_type = DAWG_TYPE_WORD; + perm_type = SYSTEM_DAWG_PERM; + break; + case TESSDATA_NUMBER_DAWG: + case TESSDATA_LSTM_NUMBER_DAWG: + dawg_type = DAWG_TYPE_NUMBER; + perm_type = NUMBER_PERM; + break; + case TESSDATA_BIGRAM_DAWG: + dawg_type = DAWG_TYPE_WORD; // doesn't actually matter + perm_type = COMPOUND_PERM; // doesn't actually matter + break; + case TESSDATA_UNAMBIG_DAWG: + dawg_type = DAWG_TYPE_WORD; + perm_type = SYSTEM_DAWG_PERM; + break; + case TESSDATA_FREQ_DAWG: + dawg_type = DAWG_TYPE_WORD; + perm_type = FREQ_DAWG_PERM; + break; + default: + return nullptr; + } + 
auto *retval = + new SquishedDawg(dawg_type, lang_, perm_type, dawg_debug_level_); + if (retval->Load(&fp)) return retval; + delete retval; + return nullptr; +} + +} // namespace tesseract diff --git a/tesseract/src/dict/dawg_cache.h b/tesseract/src/dict/dawg_cache.h new file mode 100644 index 00000000..83233802 --- /dev/null +++ b/tesseract/src/dict/dawg_cache.h @@ -0,0 +1,53 @@ +/////////////////////////////////////////////////////////////////////// +// File: dawg_cache.h +// Description: A class that knows about loading and caching dawgs. +// Author: David Eger +// Created: Fri Jan 27 12:08:00 PST 2012 +// +// (C) Copyright 2012, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_DICT_DAWG_CACHE_H_ +#define TESSERACT_DICT_DAWG_CACHE_H_ + +#include "dawg.h" +#include "object_cache.h" +#include "strngs.h" +#include "tessdatamanager.h" + +namespace tesseract { + +class DawgCache { + public: + Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, + int debug_level, TessdataManager *data_file); + + // If we manage the given dawg, decrement its count, + // and possibly delete it if the count reaches zero. + // If dawg is unknown to us, return false. + bool FreeDawg(Dawg *dawg) { + return dawgs_.Free(dawg); + } + + // Free up any currently unused dawgs. 
+ void DeleteUnusedDawgs() { + dawgs_.DeleteUnusedObjects(); + } + + private: + ObjectCache dawgs_; +}; + +} // namespace tesseract + +#endif // TESSERACT_DICT_DAWG_CACHE_H_ diff --git a/tesseract/src/dict/dict.cpp b/tesseract/src/dict/dict.cpp new file mode 100644 index 00000000..eb69b569 --- /dev/null +++ b/tesseract/src/dict/dict.cpp @@ -0,0 +1,888 @@ +/////////////////////////////////////////////////////////////////////// +// File: dict.cpp +// Description: dict class. +// Author: Samuel Charron +// +// (C) Copyright 2006, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#include "dict.h" + +#include "tprintf.h" + +#include + +namespace tesseract { + +class Image; + +Dict::Dict(CCUtil* ccutil) + : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), + probability_in_context_(&tesseract::Dict::def_probability_in_context), + ccutil_(ccutil), + wildcard_unichar_id_(INVALID_UNICHAR_ID), + apostrophe_unichar_id_(INVALID_UNICHAR_ID), + question_unichar_id_(INVALID_UNICHAR_ID), + slash_unichar_id_(INVALID_UNICHAR_ID), + hyphen_unichar_id_(INVALID_UNICHAR_ID), + STRING_MEMBER(user_words_file, "", "A filename of user-provided words.", + getCCUtil()->params()), + STRING_INIT_MEMBER(user_words_suffix, "", + "A suffix of user-provided words located in tessdata.", + getCCUtil()->params()), + STRING_MEMBER(user_patterns_file, "", + "A filename of user-provided patterns.", + getCCUtil()->params()), + STRING_INIT_MEMBER(user_patterns_suffix, "", + "A suffix of user-provided patterns located in " + "tessdata.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_punc_dawg, true, + "Load dawg with punctuation" + " patterns.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_number_dawg, true, + "Load dawg with number" + " patterns.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_bigram_dawg, true, + "Load dawg with special word " + "bigrams.", + getCCUtil()->params()), + double_MEMBER(xheight_penalty_subscripts, 0.125, + "Score penalty (0.1 = 10%) added if there are subscripts " + "or superscripts in a word, but it is otherwise OK.", + getCCUtil()->params()), + double_MEMBER(xheight_penalty_inconsistent, 0.25, + "Score penalty (0.1 = 10%) added if an xheight is " + 
"inconsistent.", + getCCUtil()->params()), + double_MEMBER(segment_penalty_dict_frequent_word, 1.0, + "Score multiplier for word matches which have good case and" + " are frequent in the given language (lower is better).", + getCCUtil()->params()), + double_MEMBER(segment_penalty_dict_case_ok, 1.1, + "Score multiplier for word matches that have good case " + "(lower is better).", + getCCUtil()->params()), + double_MEMBER(segment_penalty_dict_case_bad, 1.3125, + "Default score multiplier for word matches, which may have " + "case issues (lower is better).", + getCCUtil()->params()), + double_MEMBER(segment_penalty_dict_nonword, 1.25, + "Score multiplier for glyph fragment segmentations which " + "do not match a dictionary word (lower is better).", + getCCUtil()->params()), + double_MEMBER(segment_penalty_garbage, 1.50, + "Score multiplier for poorly cased strings that are not in" + " the dictionary and generally look like garbage (lower is" + " better).", + getCCUtil()->params()), + STRING_MEMBER(output_ambig_words_file, "", + "Output file for ambiguities found in the dictionary", + getCCUtil()->params()), + INT_MEMBER(dawg_debug_level, 0, + "Set to 1 for general debug info" + ", to 2 for more details, to 3 to see all the debug messages", + getCCUtil()->params()), + INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", + getCCUtil()->params()), + BOOL_MEMBER(use_only_first_uft8_step, false, + "Use only the first UTF8 step of the given string" + " when computing log probabilities.", + getCCUtil()->params()), + double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", + getCCUtil()->params()), + double_MEMBER(stopper_nondict_certainty_base, -2.50, + "Certainty threshold for non-dict words", + getCCUtil()->params()), + double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, + "Reject certainty offset", getCCUtil()->params()), + INT_MEMBER(stopper_smallword_size, 2, + "Size of dict word to be treated as non-dict word", + 
getCCUtil()->params()), + double_MEMBER(stopper_certainty_per_char, -0.50, + "Certainty to add" + " for each dict char above small word size.", + getCCUtil()->params()), + double_MEMBER(stopper_allowable_character_badness, 3.0, + "Max certaintly variation allowed in a word (in sigma)", + getCCUtil()->params()), + INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", + getCCUtil()->params()), + BOOL_MEMBER(stopper_no_acceptable_choices, false, + "Make AcceptableChoice() always return false. Useful" + " when there is a need to explore all segmentations", + getCCUtil()->params()), + INT_MEMBER(tessedit_truncate_wordchoice_log, 10, + "Max words to keep in list", getCCUtil()->params()), + STRING_MEMBER(word_to_debug, "", + "Word for which stopper debug" + " information should be printed to stdout", + getCCUtil()->params()), + BOOL_MEMBER(segment_nonalphabetic_script, false, + "Don't use any alphabetic-specific tricks." + " Set to true in the traineddata config file for" + " scripts that are cursive or inherently fixed-pitch", + getCCUtil()->params()), + BOOL_MEMBER(save_doc_words, 0, "Save Document Words", + getCCUtil()->params()), + double_MEMBER(doc_dict_pending_threshold, 0.0, + "Worst certainty for using pending dictionary", + getCCUtil()->params()), + double_MEMBER(doc_dict_certainty_threshold, -2.25, + "Worst certainty for words that can be inserted into the" + " document dictionary", + getCCUtil()->params()), + INT_MEMBER(max_permuter_attempts, 10000, + "Maximum number of different" + " character choices to consider during permutation." 
+ " This limit is especially useful when user patterns" + " are specified, since overly generic patterns can result in" + " dawg search exploring an overly large number of options.", + getCCUtil()->params()) { + reject_offset_ = 0.0; + go_deeper_fxn_ = nullptr; + hyphen_word_ = nullptr; + last_word_on_line_ = false; + document_words_ = nullptr; + dawg_cache_ = nullptr; + dawg_cache_is_ours_ = false; + pending_words_ = nullptr; + bigram_dawg_ = nullptr; + freq_dawg_ = nullptr; + punc_dawg_ = nullptr; + unambig_dawg_ = nullptr; + wordseg_rating_adjust_factor_ = -1.0f; + output_ambig_words_file_ = nullptr; +} + +Dict::~Dict() { + End(); + delete hyphen_word_; + if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_); +} + +DawgCache* Dict::GlobalDawgCache() { + // This global cache (a singleton) will outlive every Tesseract instance + // (even those that someone else might declare as global statics). + static DawgCache cache; + return &cache; +} + +// Sets up ready for a Load or LoadLSTM. +void Dict::SetupForLoad(DawgCache* dawg_cache) { + if (dawgs_.size() != 0) this->End(); + + apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol); + question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol); + slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol); + hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); + + if (dawg_cache != nullptr) { + dawg_cache_ = dawg_cache; + dawg_cache_is_ours_ = false; + } else { + dawg_cache_ = new DawgCache(); + dawg_cache_is_ours_ = true; + } +} + +// Loads the dawgs needed by Tesseract. Call FinishLoad() after. +void Dict::Load(const STRING& lang, TessdataManager* data_file) { + // Load dawgs_. 
+ if (load_punc_dawg) { + punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, + dawg_debug_level, data_file); + if (punc_dawg_) dawgs_ += punc_dawg_; + } + if (load_system_dawg) { + Dawg* system_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file); + if (system_dawg) dawgs_ += system_dawg; + } + if (load_number_dawg) { + Dawg* number_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file); + if (number_dawg) dawgs_ += number_dawg; + } + if (load_bigram_dawg) { + bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, + dawg_debug_level, data_file); + // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the + // dawgs_!! + } + if (load_freq_dawg) { + freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, + dawg_debug_level, data_file); + if (freq_dawg_) dawgs_ += freq_dawg_; + } + if (load_unambig_dawg) { + unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, + dawg_debug_level, data_file); + if (unambig_dawg_) dawgs_ += unambig_dawg_; + } + + STRING name; + if (!user_words_suffix.empty() || !user_words_file.empty()) { + Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, + getUnicharset().size(), dawg_debug_level); + if (!user_words_file.empty()) { + name = user_words_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_words_suffix; + } + if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(), + Trie::RRP_REVERSE_IF_HAS_RTL)) { + tprintf("Error: failed to load %s\n", name.c_str()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } + + if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) { + Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, + getUnicharset().size(), dawg_debug_level); + trie_ptr->initialize_patterns(&(getUnicharset())); + if (!user_patterns_file.empty()) { + name = user_patterns_file; + } else 
{ + name = getCCUtil()->language_data_path_prefix; + name += user_patterns_suffix; + } + if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) { + tprintf("Error: failed to load %s\n", name.c_str()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } + + document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, + getUnicharset().size(), dawg_debug_level); + dawgs_ += document_words_; + + // This dawg is temporary and should not be searched by letter_is_ok. + pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, + getUnicharset().size(), dawg_debug_level); +} + +// Loads the dawgs needed by the LSTM model. Call FinishLoad() after. +void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) { + // Load dawgs_. + if (load_punc_dawg) { + punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, + dawg_debug_level, data_file); + if (punc_dawg_) dawgs_ += punc_dawg_; + } + if (load_system_dawg) { + Dawg* system_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file); + if (system_dawg) dawgs_ += system_dawg; + } + if (load_number_dawg) { + Dawg* number_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file); + if (number_dawg) dawgs_ += number_dawg; + } + + // stolen from Dict::Load (but needs params_ from Tesseract + // langdata/config/api): + STRING name; + if (!user_words_suffix.empty() || !user_words_file.empty()) { + Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, + getUnicharset().size(), dawg_debug_level); + if (!user_words_file.empty()) { + name = user_words_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_words_suffix; + } + if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(), + Trie::RRP_REVERSE_IF_HAS_RTL)) { + tprintf("Error: failed to load %s\n", name.c_str()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } + + if 
(!user_patterns_suffix.empty() || !user_patterns_file.empty()) { + Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, + getUnicharset().size(), dawg_debug_level); + trie_ptr->initialize_patterns(&(getUnicharset())); + if (!user_patterns_file.empty()) { + name = user_patterns_file; + } else { + name = getCCUtil()->language_data_path_prefix; + name += user_patterns_suffix; + } + if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) { + tprintf("Error: failed to load %s\n", name.c_str()); + delete trie_ptr; + } else { + dawgs_ += trie_ptr; + } + } +} + +// Completes the loading process after Load() and/or LoadLSTM(). +// Returns false if no dictionaries were loaded. +bool Dict::FinishLoad() { + if (dawgs_.empty()) return false; + // Construct a list of corresponding successors for each dawg. Each entry, i, + // in the successors_ vector is a vector of integers that represent the + // indices into the dawgs_ vector of the successors for dawg i. + successors_.reserve(dawgs_.size()); + for (int i = 0; i < dawgs_.size(); ++i) { + const Dawg* dawg = dawgs_[i]; + auto* lst = new SuccessorList(); + for (int j = 0; j < dawgs_.size(); ++j) { + const Dawg* other = dawgs_[j]; + if (dawg != nullptr && other != nullptr && + (dawg->lang() == other->lang()) && + kDawgSuccessors[dawg->type()][other->type()]) + *lst += j; + } + successors_ += lst; + } + return true; +} + +void Dict::End() { + if (dawgs_.size() == 0) return; // Not safe to call twice. 
+ for (int i = 0; i < dawgs_.size(); i++) { + if (!dawg_cache_->FreeDawg(dawgs_[i])) { + delete dawgs_[i]; + } + } + dawg_cache_->FreeDawg(bigram_dawg_); + if (dawg_cache_is_ours_) { + delete dawg_cache_; + dawg_cache_ = nullptr; + } + successors_.delete_data_pointers(); + dawgs_.clear(); + successors_.clear(); + document_words_ = nullptr; + delete pending_words_; + pending_words_ = nullptr; +} + +// Returns true if in light of the current state unichar_id is allowed +// according to at least one of the dawgs in the dawgs_ vector. +// See more extensive comments in dict.h where this function is declared. +int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset, + UNICHAR_ID unichar_id, bool word_end) const { + auto* dawg_args = static_cast(void_dawg_args); + + ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); + + if (dawg_debug_level >= 3) { + tprintf( + "def_letter_is_okay: current unichar=%s word_end=%d" + " num active dawgs=%d\n", + getUnicharset().debug_str(unichar_id).c_str(), word_end, + dawg_args->active_dawgs->size()); + } + + // Do not accept words that contain kPatternUnicharID. + // (otherwise pattern dawgs would not function correctly). + // Do not accept words containing INVALID_UNICHAR_IDs. + if (unichar_id == Dawg::kPatternUnicharID || + unichar_id == INVALID_UNICHAR_ID) { + dawg_args->permuter = NO_PERM; + return NO_PERM; + } + + // Initialization. + PermuterType curr_perm = NO_PERM; + dawg_args->updated_dawgs->clear(); + dawg_args->valid_end = false; + + // Go over the active_dawgs vector and insert DawgPosition records + // with the updated ref (an edge with the corresponding unichar id) into + // dawg_args->updated_pos. + for (int a = 0; a < dawg_args->active_dawgs->size(); ++a) { + const DawgPosition& pos = (*dawg_args->active_dawgs)[a]; + const Dawg* punc_dawg = + pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr; + const Dawg* dawg = pos.dawg_index >= 0 ? 
dawgs_[pos.dawg_index] : nullptr; + + if (!dawg && !punc_dawg) { + // shouldn't happen. + tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n"); + continue; + } + if (!dawg) { + // We're in the punctuation dawg. A core dawg has not been chosen. + NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref); + EDGE_REF punc_transition_edge = + punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end); + if (punc_transition_edge != NO_EDGE) { + // Find all successors, and see which can transition. + const SuccessorList& slist = *(successors_[pos.punc_index]); + for (int s = 0; s < slist.size(); ++s) { + int sdawg_index = slist[s]; + const Dawg* sdawg = dawgs_[sdawg_index]; + UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg); + EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end); + if (dawg_edge != NO_EDGE) { + if (dawg_debug_level >= 3) { + tprintf("Letter found in dawg %d\n", sdawg_index); + } + dawg_args->updated_dawgs->add_unique( + DawgPosition(sdawg_index, dawg_edge, pos.punc_index, + punc_transition_edge, false), + dawg_debug_level > 0, + "Append transition from punc dawg to current dawgs: "); + if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter(); + if (sdawg->end_of_word(dawg_edge) && + punc_dawg->end_of_word(punc_transition_edge)) + dawg_args->valid_end = true; + } + } + } + EDGE_REF punc_edge = + punc_dawg->edge_char_of(punc_node, unichar_id, word_end); + if (punc_edge != NO_EDGE) { + if (dawg_debug_level >= 3) { + tprintf("Letter found in punctuation dawg\n"); + } + dawg_args->updated_dawgs->add_unique( + DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), + dawg_debug_level > 0, "Extend punctuation dawg: "); + if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM; + if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true; + } + continue; + } + + if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) { + // We can end the main word here. 
+ // If we can continue on the punc ref, add that possibility. + NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref); + EDGE_REF punc_edge = + punc_node == NO_EDGE + ? NO_EDGE + : punc_dawg->edge_char_of(punc_node, unichar_id, word_end); + if (punc_edge != NO_EDGE) { + dawg_args->updated_dawgs->add_unique( + DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, + punc_edge, true), + dawg_debug_level > 0, "Return to punctuation dawg: "); + if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); + if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true; + } + } + + if (pos.back_to_punc) continue; + + // If we are dealing with the pattern dawg, look up all the + // possible edges, not only for the exact unichar_id, but also + // for all its character classes (alpha, digit, etc). + if (dawg->type() == DAWG_TYPE_PATTERN) { + ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, + &curr_perm); + // There can't be any successors to dawg that is of type + // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition. + continue; + } + + // Find the edge out of the node for the unichar_id. + NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); + EDGE_REF edge = + (node == NO_EDGE) + ? 
NO_EDGE + : dawg->edge_char_of( + node, char_for_dawg(unicharset, unichar_id, dawg), word_end); + + if (dawg_debug_level >= 3) { + tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", + pos.dawg_index, node, edge); + } + + if (edge != NO_EDGE) { // the unichar was found in the current dawg + if (dawg_debug_level >= 3) { + tprintf("Letter found in dawg %d\n", pos.dawg_index); + } + if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) { + if (dawg_debug_level >= 3) { + tprintf("Punctuation constraint not satisfied at end of word.\n"); + } + continue; + } + if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); + if (dawg->end_of_word(edge) && + (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) + dawg_args->valid_end = true; + dawg_args->updated_dawgs->add_unique( + DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, + false), + dawg_debug_level > 0, + "Append current dawg to updated active dawgs: "); + } + } // end for + // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM + // or if we found the current letter in a non-punctuation dawg. This + // allows preserving information on which dawg the "core" word came from. + // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM. + if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM || + (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) { + dawg_args->permuter = curr_perm; + } + if (dawg_debug_level >= 2) { + tprintf("Returning %d for permuter code for this character.\n", + dawg_args->permuter); + } + return dawg_args->permuter; +} + +void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos, + UNICHAR_ID unichar_id, bool word_end, + DawgArgs* dawg_args, + PermuterType* curr_perm) const { + NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); + // Try to find the edge corresponding to the exact unichar_id and to all the + // edges corresponding to the character class of unichar_id. 
+ GenericVector unichar_id_patterns; + unichar_id_patterns.push_back(unichar_id); + dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), + &unichar_id_patterns); + for (int i = 0; i < unichar_id_patterns.size(); ++i) { + // On the first iteration check all the outgoing edges. + // On the second iteration check all self-loops. + for (int k = 0; k < 2; ++k) { + EDGE_REF edge = + (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end) + : dawg->pattern_loop_edge(pos.dawg_ref, + unichar_id_patterns[i], word_end); + if (edge == NO_EDGE) continue; + if (dawg_debug_level >= 3) { + tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", + pos.dawg_index, node, edge); + tprintf("Letter found in pattern dawg %d\n", pos.dawg_index); + } + if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter(); + if (dawg->end_of_word(edge)) dawg_args->valid_end = true; + dawg_args->updated_dawgs->add_unique( + DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, + pos.back_to_punc), + dawg_debug_level > 0, + "Append current dawg to updated active dawgs: "); + } + } +} + +// Fill the given active_dawgs vector with dawgs that could contain the +// beginning of the word. If hyphenated() returns true, copy the entries +// from hyphen_active_dawgs_ instead. 
+void Dict::init_active_dawgs(DawgPositionVector* active_dawgs, + bool ambigs_mode) const { + int i; + if (hyphenated()) { + *active_dawgs = hyphen_active_dawgs_; + if (dawg_debug_level >= 3) { + for (i = 0; i < hyphen_active_dawgs_.size(); ++i) { + tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n", + hyphen_active_dawgs_[i].dawg_index, + hyphen_active_dawgs_[i].dawg_ref); + } + } + } else { + default_dawgs(active_dawgs, ambigs_mode); + } +} + +void Dict::default_dawgs(DawgPositionVector* dawg_pos_vec, + bool suppress_patterns) const { + bool punc_dawg_available = + (punc_dawg_ != nullptr) && + punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE; + + for (int i = 0; i < dawgs_.size(); i++) { + if (dawgs_[i] != nullptr && + !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) { + int dawg_ty = dawgs_[i]->type(); + bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty]; + if (dawg_ty == DAWG_TYPE_PUNCTUATION) { + *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false); + if (dawg_debug_level >= 3) { + tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, + NO_EDGE); + } + } else if (!punc_dawg_available || !subsumed_by_punc) { + *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false); + if (dawg_debug_level >= 3) { + tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); + } + } + } + } +} + +void Dict::add_document_word(const WERD_CHOICE& best_choice) { + // Do not add hyphenated word parts to the document dawg. + // hyphen_word_ will be non-nullptr after the set_hyphen_word() is + // called when the first part of the hyphenated word is + // discovered and while the second part of the word is recognized. + // hyphen_word_ is cleared in cc_recg() before the next word on + // the line is recognized. 
+ if (hyphen_word_) return; + + int stringlen = best_choice.length(); + + if (valid_word(best_choice) || stringlen < 2) return; + + // Discard words that contain >= kDocDictMaxRepChars repeating unichars. + if (best_choice.length() >= kDocDictMaxRepChars) { + int num_rep_chars = 1; + UNICHAR_ID uch_id = best_choice.unichar_id(0); + for (int i = 1; i < best_choice.length(); ++i) { + if (best_choice.unichar_id(i) != uch_id) { + num_rep_chars = 1; + uch_id = best_choice.unichar_id(i); + } else { + ++num_rep_chars; + if (num_rep_chars == kDocDictMaxRepChars) return; + } + } + } + + if (best_choice.certainty() < doc_dict_certainty_threshold || + stringlen == 2) { + if (best_choice.certainty() < doc_dict_pending_threshold) return; + + if (!pending_words_->word_in_dawg(best_choice)) { + if (stringlen > 2 || + (stringlen == 2 && + getUnicharset().get_isupper(best_choice.unichar_id(0)) && + getUnicharset().get_isupper(best_choice.unichar_id(1)))) { + pending_words_->add_word_to_dawg(best_choice); + } + return; + } + } + + if (save_doc_words) { + STRING filename(getCCUtil()->imagefile); + filename += ".doc"; + FILE* doc_word_file = fopen(filename.c_str(), "a"); + if (doc_word_file == nullptr) { + tprintf("Error: Could not open file %s\n", filename.c_str()); + ASSERT_HOST(doc_word_file); + } + fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str()); + fclose(doc_word_file); + } + document_words_->add_word_to_dawg(best_choice); +} + +void Dict::adjust_word(WERD_CHOICE* word, bool nonword, + XHeightConsistencyEnum xheight_consistency, + float additional_adjust, bool modify_rating, + bool debug) { + bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() && + word->GetTopScriptID() == getUnicharset().han_sid()); + bool case_is_ok = (is_han || case_ok(*word)); + bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word)); + + float adjust_factor = additional_adjust; + float new_rating = word->rating(); + new_rating += kRatingPad; + const char* 
xheight_triggered = ""; + if (word->length() > 1) { + // Calculate x-height and y-offset consistency penalties. + switch (xheight_consistency) { + case XH_INCONSISTENT: + adjust_factor += xheight_penalty_inconsistent; + xheight_triggered = ", xhtBAD"; + break; + case XH_SUBNORMAL: + adjust_factor += xheight_penalty_subscripts; + xheight_triggered = ", xhtSUB"; + break; + case XH_GOOD: + // leave the factor alone - all good! + break; + } + // TODO(eger): if nonword is true, but there is a "core" that is a dict + // word, negate nonword status. + } else { + if (debug) { + tprintf("Consistency could not be calculated.\n"); + } + } + if (debug) { + tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", + word->unichar_string().c_str(), word->rating(), xheight_triggered); + } + + if (nonword) { // non-dictionary word + if (case_is_ok && punc_is_ok) { + adjust_factor += segment_penalty_dict_nonword; + new_rating *= adjust_factor; + if (debug) tprintf(", W"); + } else { + adjust_factor += segment_penalty_garbage; + new_rating *= adjust_factor; + if (debug) { + if (!case_is_ok) tprintf(", C"); + if (!punc_is_ok) tprintf(", P"); + } + } + } else { // dictionary word + if (case_is_ok) { + if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) { + word->set_permuter(FREQ_DAWG_PERM); + adjust_factor += segment_penalty_dict_frequent_word; + new_rating *= adjust_factor; + if (debug) tprintf(", F"); + } else { + adjust_factor += segment_penalty_dict_case_ok; + new_rating *= adjust_factor; + if (debug) tprintf(", "); + } + } else { + adjust_factor += segment_penalty_dict_case_bad; + new_rating *= adjust_factor; + if (debug) tprintf(", C"); + } + } + new_rating -= kRatingPad; + if (modify_rating) word->set_rating(new_rating); + if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating); + word->set_adjust_factor(adjust_factor); +} + +int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const { + const WERD_CHOICE* word_ptr = &word; + WERD_CHOICE 
temp_word(word.unicharset()); + if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) { + copy_hyphen_info(&temp_word); + temp_word += word; + word_ptr = &temp_word; + } + if (word_ptr->length() == 0) return NO_PERM; + // Allocate vectors for holding current and updated + // active_dawgs and initialize them. + DawgPositionVector active_dawgs[2]; + init_active_dawgs(&(active_dawgs[0]), false); + DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); + int last_index = word_ptr->length() - 1; + // Call letter_is_okay for each letter in the word. + for (int i = hyphen_base_size(); i <= last_index; ++i) { + if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), + word_ptr->unichar_id(i), i == last_index))) + break; + // Swap active_dawgs, constraints with the corresponding updated vector. + if (dawg_args.updated_dawgs == &(active_dawgs[1])) { + dawg_args.updated_dawgs = &(active_dawgs[0]); + ++(dawg_args.active_dawgs); + } else { + ++(dawg_args.updated_dawgs); + dawg_args.active_dawgs = &(active_dawgs[0]); + } + } + return valid_word_permuter(dawg_args.permuter, numbers_ok) + ? dawg_args.permuter + : NO_PERM; +} + +bool Dict::valid_bigram(const WERD_CHOICE& word1, + const WERD_CHOICE& word2) const { + if (bigram_dawg_ == nullptr) return false; + + // Extract the core word from the middle of each word with any digits + // replaced with question marks. + int w1start, w1end, w2start, w2end; + word1.punct_stripped(&w1start, &w1end); + word2.punct_stripped(&w2start, &w2end); + + // We don't want to penalize a single guillemet, hyphen, etc. + // But our bigram list doesn't have any information about punctuation. 
+ if (w1start >= w1end) return word1.length() < 3; + if (w2start >= w2end) return word2.length() < 3; + + const UNICHARSET& uchset = getUnicharset(); + std::vector bigram_string; + bigram_string.reserve(w1end + w2end + 1); + for (int i = w1start; i < w1end; i++) { + const auto &normed_ids = + getUnicharset().normed_ids(word1.unichar_id(i)); + if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) + bigram_string.push_back(question_unichar_id_); + else + bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end()); + } + bigram_string.push_back(UNICHAR_SPACE); + for (int i = w2start; i < w2end; i++) { + const auto &normed_ids = + getUnicharset().normed_ids(word2.unichar_id(i)); + if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) + bigram_string.push_back(question_unichar_id_); + else + bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end()); + } + WERD_CHOICE normalized_word(&uchset, bigram_string.size()); + for (int i = 0; i < bigram_string.size(); ++i) { + normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f, + 0.0f); + } + return bigram_dawg_->word_in_dawg(normalized_word); +} + +bool Dict::valid_punctuation(const WERD_CHOICE& word) { + if (word.length() == 0) return NO_PERM; + int i; + WERD_CHOICE new_word(word.unicharset()); + int last_index = word.length() - 1; + int new_len = 0; + for (i = 0; i <= last_index; ++i) { + UNICHAR_ID unichar_id = (word.unichar_id(i)); + if (getUnicharset().get_ispunctuation(unichar_id)) { + new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0); + } else if (!getUnicharset().get_isalpha(unichar_id) && + !getUnicharset().get_isdigit(unichar_id)) { + return false; // neither punc, nor alpha, nor digit + } else if ((new_len = new_word.length()) == 0 || + new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) { + new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); + } + } + for (i = 0; i < dawgs_.size(); ++i) { + if (dawgs_[i] 
!= nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION && + dawgs_[i]->word_in_dawg(new_word)) + return true; + } + return false; +} + +/// Returns true if the language is space-delimited (not CJ, or T). +bool Dict::IsSpaceDelimitedLang() const { + const UNICHARSET& u_set = getUnicharset(); + if (u_set.han_sid() > 0) return false; + if (u_set.katakana_sid() > 0) return false; + if (u_set.thai_sid() > 0) return false; + return true; +} + +} // namespace tesseract diff --git a/tesseract/src/dict/dict.h b/tesseract/src/dict/dict.h new file mode 100644 index 00000000..e8ec2e37 --- /dev/null +++ b/tesseract/src/dict/dict.h @@ -0,0 +1,651 @@ +/////////////////////////////////////////////////////////////////////// +// File: dict.h +// Description: dict class. +// Author: Samuel Charron +// +// (C) Copyright 2006, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_DICT_DICT_H_ +#define TESSERACT_DICT_DICT_H_ + +#ifdef HAVE_CONFIG_H +#include "config_auto.h" // DISABLED_LEGACY_ENGINE +#endif + +#ifndef DISABLED_LEGACY_ENGINE +#include "ambigs.h" +#endif +#include "dawg.h" +#include "dawg_cache.h" +#include "ratngs.h" +#include "stopper.h" +#include "trie.h" +#include "unicharset.h" +#ifndef DISABLED_LEGACY_ENGINE +#include "params_training_featdef.h" +#endif // ndef DISABLED_LEGACY_ENGINE + +namespace tesseract { + +class MATRIX; +class WERD_RES; + +#define CHARS_PER_LINE 500 +#define MAX_WERD_LENGTH (int64_t) 128 +#define NO_RATING -1 + +/** Struct used to hold temporary information about fragments. */ +struct CHAR_FRAGMENT_INFO { + UNICHAR_ID unichar_id; + const CHAR_FRAGMENT *fragment; + int num_fragments; + float rating; + float certainty; +}; + +using DawgVector = GenericVector; + +// +// Constants +// +static const int kRatingPad = 4; +static const int kDictMaxWildcards = 2; // max wildcards for a word +// TODO(daria): If hyphens are different in different languages and can be +// inferred from training data we should load their values dynamically. +static const char kHyphenSymbol[] = "-"; +static const char kSlashSymbol[] = "/"; +static const char kQuestionSymbol[] = "?"; +static const char kApostropheSymbol[] = "'"; +static const float kSimCertaintyScale = -10.0; // similarity matcher scaling +static const float kSimCertaintyOffset = -10.0; // similarity matcher offset +static const float kSimilarityFloor = 100.0; // worst E*L product to stop on +static const int kDocDictMaxRepChars = 4; + +// Enum for describing whether the x-height for the word is consistent: +// 0 - everything is good. +// 1 - there are one or two secondary (but consistent) baselines +// [think subscript and superscript], or there is an oversized +// first character. +// 2 - the word is inconsistent. 
+enum XHeightConsistencyEnum {XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT}; + +struct DawgArgs { + DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p) + : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {} + + DawgPositionVector *active_dawgs; + DawgPositionVector *updated_dawgs; + PermuterType permuter; + // True if the current position is a valid word end. + bool valid_end; +}; + +class TESS_API Dict { + public: + Dict(CCUtil* image_ptr); + ~Dict(); + const CCUtil* getCCUtil() const { + return ccutil_; + } + CCUtil* getCCUtil() { + return ccutil_; + } + const UNICHARSET& getUnicharset() const { + return getCCUtil()->unicharset; + } + UNICHARSET& getUnicharset() { + return getCCUtil()->unicharset; + } +#ifndef DISABLED_LEGACY_ENGINE + const UnicharAmbigs &getUnicharAmbigs() const { + return getCCUtil()->unichar_ambigs; + } +#endif + // Returns true if unichar_id is a word compounding character like - or /. + inline bool compound_marker(UNICHAR_ID unichar_id) { + const UNICHARSET& unicharset = getUnicharset(); + ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); + const auto &normed_ids = + unicharset.normed_ids(unichar_id); + return normed_ids.size() == 1 && + (normed_ids[0] == hyphen_unichar_id_ || + normed_ids[0] == slash_unichar_id_); + } + // Returns true if unichar_id is an apostrophe-like character that may + // separate prefix/suffix words from a main body word. + inline bool is_apostrophe(UNICHAR_ID unichar_id) { + const UNICHARSET& unicharset = getUnicharset(); + ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); + const auto &normed_ids = + unicharset.normed_ids(unichar_id); + return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_; + } + + /* hyphen.cpp ************************************************************/ + + /// Returns true if we've recorded the beginning of a hyphenated word. 
+ inline bool hyphenated() const { return + !last_word_on_line_ && hyphen_word_; + } + /// Size of the base word (the part on the line before) of a hyphenated word. + inline int hyphen_base_size() const { + return this->hyphenated() ? hyphen_word_->length() : 0; + } + /// If this word is hyphenated copy the base word (the part on + /// the line before) of a hyphenated word into the given word. + /// This function assumes that word is not nullptr. + inline void copy_hyphen_info(WERD_CHOICE *word) const { + if (this->hyphenated()) { + *word = *hyphen_word_; + if (hyphen_debug_level) word->print("copy_hyphen_info: "); + } + } + /// Check whether the word has a hyphen at the end. + inline bool has_hyphen_end(const UNICHARSET* unicharset, + UNICHAR_ID unichar_id, bool first_pos) const { + if (!last_word_on_line_ || first_pos) + return false; + ASSERT_HOST(unicharset->contains_unichar_id(unichar_id)); + const auto &normed_ids = + unicharset->normed_ids(unichar_id); + return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_; + } + /// Same as above, but check the unichar at the end of the word. + inline bool has_hyphen_end(const WERD_CHOICE &word) const { + int word_index = word.length() - 1; + return has_hyphen_end(word.unicharset(), word.unichar_id(word_index), + word_index == 0); + } + /// Unless the previous word was the last one on the line, and the current + /// one is not (thus it is the first one on the line), erase hyphen_word_, + /// clear hyphen_active_dawgs_, update last_word_on_line_. + void reset_hyphen_vars(bool last_word_on_line); + /// Update hyphen_word_, and copy the given DawgPositionVectors into + /// hyphen_active_dawgs_ . + void set_hyphen_word(const WERD_CHOICE &word, + const DawgPositionVector &active_dawgs); + + /* permdawg.cpp ************************************************************/ + // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig(). + // When this function is refactored, permdawg.cpp can be removed. 
+ + /// Copies word into best_choice if its rating is smaller + /// than that of best_choice. + inline void update_best_choice(const WERD_CHOICE &word, + WERD_CHOICE *best_choice) { + if (word.rating() < best_choice->rating()) { + *best_choice = word; + } + } + /// Fill the given active_dawgs vector with dawgs that could contain the + /// beginning of the word. If hyphenated() returns true, copy the entries + /// from hyphen_active_dawgs_ instead. + void init_active_dawgs(DawgPositionVector *active_dawgs, + bool ambigs_mode) const; + // Fill the given vector with the default collection of any-length dawgs + void default_dawgs(DawgPositionVector *anylength_dawgs, + bool suppress_patterns) const; + + + /// Recursively explore all the possible character combinations in + /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the + /// dawgs in the dawgs_ vector in parallel and discard invalid words. + /// + /// Allocate and return a WERD_CHOICE with the best valid word found. + WERD_CHOICE *dawg_permute_and_select( + const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); + /// If the choice being composed so far could be a dictionary word + /// and we have not reached the end of the word keep exploring the + /// char_choices further. + void go_deeper_dawg_fxn( + const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, + int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, + bool word_ending, WERD_CHOICE *word, float certainties[], + float *limit, WERD_CHOICE *best_choice, int *attempts_left, + void *void_more_args); + + /// Pointer to go_deeper function. 
+ void (Dict::*go_deeper_fxn_)(const char *debug, + const BLOB_CHOICE_LIST_VECTOR &char_choices, + int char_choice_index, + const CHAR_FRAGMENT_INFO *prev_char_frag_info, + bool word_ending, WERD_CHOICE *word, + float certainties[], float *limit, + WERD_CHOICE *best_choice, int *attempts_left, + void *void_more_args); + // + // Helper functions for dawg_permute_and_select(). + // + void permute_choices( + const char *debug, + const BLOB_CHOICE_LIST_VECTOR &char_choices, + int char_choice_index, + const CHAR_FRAGMENT_INFO *prev_char_frag_info, + WERD_CHOICE *word, + float certainties[], + float *limit, + WERD_CHOICE *best_choice, + int *attempts_left, + void *more_args); + + void append_choices( + const char *debug, + const BLOB_CHOICE_LIST_VECTOR &char_choices, + const BLOB_CHOICE &blob_choice, + int char_choice_index, + const CHAR_FRAGMENT_INFO *prev_char_frag_info, + WERD_CHOICE *word, + float certainties[], + float *limit, + WERD_CHOICE *best_choice, + int *attempts_left, + void *more_args); + + bool fragment_state_okay(UNICHAR_ID curr_unichar_id, + float curr_rating, float curr_certainty, + const CHAR_FRAGMENT_INFO *prev_char_frag_info, + const char *debug, int word_ending, + CHAR_FRAGMENT_INFO *char_frag_info); + + /* stopper.cpp *************************************************************/ +#if !defined(DISABLED_LEGACY_ENGINE) + bool NoDangerousAmbig(WERD_CHOICE *BestChoice, + DANGERR *fixpt, + bool fix_replaceable, + MATRIX* ratings); +#endif // !defined(DISABLED_LEGACY_ENGINE) + // Replaces the corresponding wrong ngram in werd_choice with the correct + // one. The whole correct n-gram is inserted into the ratings matrix and + // the werd_choice: no more fragments!. Rating and certainty of new entries + // in matrix and werd_choice are the sum and mean of the wrong ngram + // respectively. + // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes + // mystring", with a new entry in the ratings matrix for ". 
+ void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, + UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, + MATRIX *ratings); + + /// Returns the length of the shortest alpha run in WordChoice. + int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const; + /// Returns true if the certainty of the BestChoice word is within a + /// reasonable range of the average certainties for the best choices for + /// each character in the segmentation. This test is used to catch words + /// in which one character is much worse than the other characters in the + /// word (i.e. false will be returned in that case). The algorithm computes + /// the mean and std deviation of the certainties in the word with the worst + /// certainty thrown out. + int UniformCertainties(const WERD_CHOICE& word); + /// Returns true if the given best_choice is good enough to stop. + bool AcceptableChoice(const WERD_CHOICE& best_choice, + XHeightConsistencyEnum xheight_consistency); + /// Returns false if the best choice for the current word is questionable + /// and should be tried again on the second pass or should be flagged to + /// the user. + bool AcceptableResult(WERD_RES *word) const; +#if !defined(DISABLED_LEGACY_ENGINE) + void EndDangerousAmbigs(); +#endif // !defined(DISABLED_LEGACY_ENGINE) + /// Prints the current choices for this word to stdout. + void DebugWordChoices(); + /// Sets up stopper variables in preparation for the first pass. + void SettupStopperPass1(); + /// Sets up stopper variables in preparation for the second pass. + void SettupStopperPass2(); + /* context.cpp *************************************************************/ + /// Check a string to see if it matches a set of lexical rules. + int case_ok(const WERD_CHOICE& word) const; + /// Returns true if the word looks like an absolute garbage + /// (e.g. image mistakenly recognized as text). 
+ bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
+
+ /* dict.cpp ****************************************************************/
+
+ /// Initialize Dict class - load dawgs from [lang].traineddata and
+ /// user-specified wordlist and pattern list.
+ static DawgCache *GlobalDawgCache();
+ // Sets up ready for a Load or LoadLSTM.
+ void SetupForLoad(DawgCache *dawg_cache);
+ // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
+ void Load(const STRING &lang, TessdataManager *data_file);
+ // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
+ void LoadLSTM(const STRING &lang, TessdataManager *data_file);
+ // Completes the loading process after Load() and/or LoadLSTM().
+ // Returns false if no dictionaries were loaded.
+ bool FinishLoad();
+ void End();
+
+ // Resets the document dictionary analogous to ResetAdaptiveClassifier.
+ void ResetDocumentDictionary() {
+ if (pending_words_ != nullptr)
+ pending_words_->clear();
+ if (document_words_ != nullptr)
+ document_words_->clear();
+ }
+
+ /**
+ * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
+ * of the current state the letter at word_index in the given word
+ * is allowed according to at least one of the dawgs in dawgs_,
+ * otherwise returns NO_PERM.
+ *
+ * The state is described by void_dawg_args, which are interpreted as
+ * DawgArgs and contain relevant active dawg positions.
+ * Each entry in the active_dawgs vector contains an index
+ * into the dawgs_ vector and an EDGE_REF that indicates the last edge
+ * followed in the dawg. It also may contain a position in the punctuation
+ * dawg which describes surrounding punctuation (see struct DawgPosition).
+ *
+ * Input:
+ * At word_index 0 dawg_args->active_dawgs should contain an entry for each
+ * dawg that may start at the beginning of a word, with punc_ref and edge_ref
+ * initialized to NO_EDGE.
Since the punctuation dawg includes the empty + * pattern " " (meaning anything without surrounding punctuation), having a + * single entry for the punctuation dawg will cover all dawgs reachable + * therefrom -- that includes all number and word dawgs. The only dawg + * non-reachable from the punctuation_dawg is the pattern dawg. + * If hyphen state needs to be applied, initial dawg_args->active_dawgs can + * be copied from the saved hyphen state (maintained by Dict). + * For word_index > 0 the corresponding state (active_dawgs and punc position) + * can be obtained from dawg_args->updated_dawgs passed to + * def_letter_is_okay for word_index-1. + * Note: the function assumes that active_dawgs, and updated_dawgs + * member variables of dawg_args are not nullptr. + * + * Output: + * The function fills in dawg_args->updated_dawgs vector with the + * entries for dawgs that contain the word up to the letter at word_index. + * + */ + + // + int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset, + UNICHAR_ID unichar_id, bool word_end) const; + + int (Dict::*letter_is_okay_)(void* void_dawg_args, + const UNICHARSET& unicharset, + UNICHAR_ID unichar_id, bool word_end) const; + /// Calls letter_is_okay_ member function. + int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset, + UNICHAR_ID unichar_id, bool word_end) const { + return (this->*letter_is_okay_)(void_dawg_args, + unicharset, unichar_id, word_end); + } + + + /// Probability in context function used by the ngram permuter. + double (Dict::*probability_in_context_)(const char* lang, + const char* context, + int context_bytes, + const char* character, + int character_bytes); + /// Calls probability_in_context_ member function. 
+ double ProbabilityInContext(const char* context, + int context_bytes, + const char* character, + int character_bytes) { + return (this->*probability_in_context_)( + getCCUtil()->lang.c_str(), + context, context_bytes, + character, character_bytes); + } + + /// Default (no-op) implementation of probability in context function. + double def_probability_in_context( + const char* lang, const char* context, int context_bytes, + const char* character, int character_bytes) { + (void)lang; + (void)context; + (void)context_bytes; + (void)character; + (void)character_bytes; + return 0.0; + } + + inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; } + inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; } + /// Return the number of dawgs in the dawgs_ vector. + inline int NumDawgs() const { return dawgs_.size(); } + /// Return i-th dawg pointer recorded in the dawgs_ vector. + inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } + /// Return the points to the punctuation dawg. + inline const Dawg *GetPuncDawg() const { return punc_dawg_; } + /// Return the points to the unambiguous words dawg. + inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; } + /// Returns the appropriate next node given the EDGE_REF. + static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) { + if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg + NODE_REF node = dawg->next_node(edge_ref); + if (node == 0) node = NO_EDGE; // end of word + return node; + } + + // Given a unichar from a string and a given dawg, return the unichar + // we should use to match in that dawg type. (for example, in the number + // dawg, all numbers are transformed to kPatternUnicharId). + UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch, + const Dawg *dawg) const { + if (!dawg) return ch; + switch (dawg->type()) { + case DAWG_TYPE_NUMBER: + return unicharset.get_isdigit(ch) ? 
 Dawg::kPatternUnicharID : ch;
+ default:
+ return ch;
+ }
+ }
+
+ /// For each of the character classes of the given unichar_id (and the
+ /// unichar_id itself) finds the corresponding outgoing node or self-loop
+ /// in the given dawg and (after checking that it is valid) records it in
+ /// dawg_args->updated_active_dawgs. Updates current_permuter if any valid
+ /// edges were found.
+ void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
+ UNICHAR_ID unichar_id, bool word_end,
+ DawgArgs *dawg_args,
+ PermuterType *current_permuter) const;
+
+ /// Read/Write/Access special purpose dawgs which contain words
+ /// only of a certain length (used for phrase search for
+ /// non-space-delimited languages).
+
+ /// Check all the DAWGs to see if this word is in any of them.
+ inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
+ return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
+ perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
+ perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
+ (numbers_ok && perm == NUMBER_PERM));
+ }
+ int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
+ int valid_word(const WERD_CHOICE &word) const {
+ return valid_word(word, false); // return NO_PERM for words with digits
+ }
+ int valid_word_or_number(const WERD_CHOICE &word) const {
+ return valid_word(word, true); // return NUMBER_PERM for valid numbers
+ }
+ /// This function is used by api/tesseract_cube_combiner.cpp
+ int valid_word(const char *string) const {
+ WERD_CHOICE word(string, getUnicharset());
+ return valid_word(word);
+ }
+ // Do the two WERD_CHOICEs form a meaningful bigram?
+ bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
+ /// Returns true if the word contains a valid punctuation pattern.
+ /// Note: Since the domains of punctuation symbols and symbols
+ /// used in numbers are not disjoint, a valid number might contain
+ /// an invalid punctuation pattern (e.g. .99). 
+ bool valid_punctuation(const WERD_CHOICE &word);
+ /// Returns true if a good answer is found for the unknown blob rating.
+ int good_choice(const WERD_CHOICE &choice);
+ /// Adds a word found on this document to the document specific dictionary.
+ void add_document_word(const WERD_CHOICE &best_choice);
+ /// Adjusts the rating of the given word.
+ void adjust_word(WERD_CHOICE *word,
+ bool nonword, XHeightConsistencyEnum xheight_consistency,
+ float additional_adjust,
+ bool modify_rating,
+ bool debug);
+ /// Set wordseg_rating_adjust_factor_ to the given value.
+ inline void SetWordsegRatingAdjustFactor(float f) {
+ wordseg_rating_adjust_factor_ = f;
+ }
+ /// Returns true if the language is space-delimited (not CJ, or T).
+ bool IsSpaceDelimitedLang() const;
+
+ private:
+ /** Private member variables. */
+ CCUtil* ccutil_;
+ /**
+ * Table that stores ambiguities computed during training
+ * (loaded when NoDangerousAmbigs() is called for the first time).
+ * Each entry i in the table stores a set of ambiguities whose
+ * wrong ngram starts with unichar id i.
+ */
+#ifndef DISABLED_LEGACY_ENGINE
+ UnicharAmbigs* dang_ambigs_table_ = nullptr;
+ /** Same as above, but for ambiguities with replace flag set. */
+ UnicharAmbigs* replace_ambigs_table_ = nullptr;
+#endif
+ /** Additional certainty padding allowed before a word is rejected. */
+ float reject_offset_;
+ // Cached UNICHAR_IDs:
+ UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
+ UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
+ UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
+ UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
+ UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
+ // Hyphen-related variables.
+ WERD_CHOICE *hyphen_word_;
+ DawgPositionVector hyphen_active_dawgs_;
+ bool last_word_on_line_;
+ // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
+ // matching. The first member of each list is taken as canonical. 
For + // example, the first list contains hyphens and dashes with the first symbol + // being the ASCII hyphen minus. + std::vector > equivalent_symbols_; + // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs. + DawgCache *dawg_cache_; + bool dawg_cache_is_ours_; // we should delete our own dawg_cache_ + // Dawgs. + DawgVector dawgs_; + SuccessorListsVector successors_; + Trie *pending_words_; + /// The following pointers are only cached for convenience. + /// The dawgs will be deleted when dawgs_ vector is destroyed. + // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if + // any of them are present on the best choices list for a word pair. + // the bigrams are stored as space-separated words where: + // (1) leading and trailing punctuation has been removed from each word and + // (2) any digits have been replaced with '?' marks. + Dawg *bigram_dawg_; + // TODO(daria): need to support multiple languages in the future, + // so maybe will need to maintain a list of dawgs of each kind. + Dawg *freq_dawg_; + Dawg *unambig_dawg_; + Dawg *punc_dawg_; + Trie *document_words_; + /// Current segmentation cost adjust factor for word rating. + /// See comments in incorporate_segcost. + float wordseg_rating_adjust_factor_; + // File for recording ambiguities discovered during dictionary search. + FILE *output_ambig_words_file_; + + public: + /// Variable members. + /// These have to be declared and initialized after image_ptr_, which contains + /// the pointer to the params vector - the member of its base CCUtil class. 
+ STRING_VAR_H(user_words_file, "", "A filename of user-provided words."); + STRING_VAR_H(user_words_suffix, "", + "A suffix of user-provided words located in tessdata."); + STRING_VAR_H(user_patterns_file, "", + "A filename of user-provided patterns."); + STRING_VAR_H(user_patterns_suffix, "", + "A suffix of user-provided patterns located in tessdata."); + BOOL_VAR_H(load_system_dawg, true, "Load system word dawg."); + BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg."); + BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg."); + BOOL_VAR_H(load_punc_dawg, true, + "Load dawg with punctuation patterns."); + BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns."); + BOOL_VAR_H(load_bigram_dawg, true, + "Load dawg with special word bigrams."); + double_VAR_H(xheight_penalty_subscripts, 0.125, + "Score penalty (0.1 = 10%) added if there are subscripts " + "or superscripts in a word, but it is otherwise OK."); + double_VAR_H(xheight_penalty_inconsistent, 0.25, + "Score penalty (0.1 = 10%) added if an xheight is " + "inconsistent."); + double_VAR_H(segment_penalty_dict_frequent_word, 1.0, + "Score multiplier for word matches which have good case and" + "are frequent in the given language (lower is better)."); + + double_VAR_H(segment_penalty_dict_case_ok, 1.1, + "Score multiplier for word matches that have good case " + "(lower is better)."); + + double_VAR_H(segment_penalty_dict_case_bad, 1.3125, + "Default score multiplier for word matches, which may have " + "case issues (lower is better)."); + + double_VAR_H(segment_penalty_dict_nonword, 1.25, + "Score multiplier for glyph fragment segmentations which " + "do not match a dictionary word (lower is better)."); + + double_VAR_H(segment_penalty_garbage, 1.50, + "Score multiplier for poorly cased strings that are not in" + " the dictionary and generally look like garbage (lower is" + " better)."); + STRING_VAR_H(output_ambig_words_file, "", + "Output file for ambiguities found in the 
dictionary"); + INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info" + ", to 2 for more details, to 3 to see all the debug messages"); + INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words."); + BOOL_VAR_H(use_only_first_uft8_step, false, + "Use only the first UTF8 step of the given string" + " when computing log probabilities."); + double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor"); + double_VAR_H(stopper_nondict_certainty_base, -2.50, + "Certainty threshold for non-dict words"); + double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0, + "Reject certainty offset"); + INT_VAR_H(stopper_smallword_size, 2, + "Size of dict word to be treated as non-dict word"); + double_VAR_H(stopper_certainty_per_char, -0.50, + "Certainty to add for each dict char above small word size."); + double_VAR_H(stopper_allowable_character_badness, 3.0, + "Max certaintly variation allowed in a word (in sigma)"); + INT_VAR_H(stopper_debug_level, 0, "Stopper debug level"); + BOOL_VAR_H(stopper_no_acceptable_choices, false, + "Make AcceptableChoice() always return false. Useful" + " when there is a need to explore all segmentations"); + INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); + STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information" + " should be printed to stdout"); + BOOL_VAR_H(segment_nonalphabetic_script, false, + "Don't use any alphabetic-specific tricks." + "Set to true in the traineddata config file for" + " scripts that are cursive or inherently fixed-pitch"); + BOOL_VAR_H(save_doc_words, 0, "Save Document Words"); + double_VAR_H(doc_dict_pending_threshold, 0.0, + "Worst certainty for using pending dictionary"); + double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty" + " for words that can be inserted into the document dictionary"); + INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different" + " character choices to consider during permutation." 
+ " This limit is especially useful when user patterns" + " are specified, since overly generic patterns can result in" + " dawg search exploring an overly large number of options."); +}; + +} // namespace tesseract + +#endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_ diff --git a/tesseract/src/dict/hyphen.cpp b/tesseract/src/dict/hyphen.cpp new file mode 100644 index 00000000..7a0a622c --- /dev/null +++ b/tesseract/src/dict/hyphen.cpp @@ -0,0 +1,61 @@ +/****************************************************************************** + * File: hyphen.cpp (Formerly hyphen.c) + * Description: Functions for maintaining information about hyphenated words. + * Author: Mark Seaman, OCR Technology + * Status: Reusable Software Component + * + * (c) Copyright 1987, Hewlett-Packard Company. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *****************************************************************************/ + +#include "dict.h" + +namespace tesseract { + +// Unless the previous word was the last one on the line, and the current +// one is not (thus it is the first one on the line), erase hyphen_word_, +// clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_. 
+void Dict::reset_hyphen_vars(bool last_word_on_line) { + if (!(last_word_on_line_ == true && last_word_on_line == false)) { + if (hyphen_word_ != nullptr) { + delete hyphen_word_; + hyphen_word_ = nullptr; + hyphen_active_dawgs_.clear(); + } + } + if (hyphen_debug_level) { + tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n", + last_word_on_line_, last_word_on_line); + } + last_word_on_line_ = last_word_on_line; +} + +// Update hyphen_word_, and copy the given DawgPositionVectors into +// hyphen_active_dawgs_. +void Dict::set_hyphen_word(const WERD_CHOICE &word, + const DawgPositionVector &active_dawgs) { + if (hyphen_word_ == nullptr) { + hyphen_word_ = new WERD_CHOICE(word.unicharset()); + hyphen_word_->make_bad(); + } + if (hyphen_word_->rating() > word.rating()) { + *hyphen_word_ = word; + // Remove the last unichar id as it is a hyphen, and remove + // any unichar_string/lengths that are present. + hyphen_word_->remove_last_unichar_id(); + hyphen_active_dawgs_ = active_dawgs; + } + if (hyphen_debug_level) { + hyphen_word_->print("set_hyphen_word: "); + } +} +} // namespace tesseract diff --git a/tesseract/src/dict/matchdefs.h b/tesseract/src/dict/matchdefs.h new file mode 100644 index 00000000..b95dbc2b --- /dev/null +++ b/tesseract/src/dict/matchdefs.h @@ -0,0 +1,50 @@ +/****************************************************************************** + ** Filename: matchdefs.h + ** Purpose: Generic interface definitions for feature matchers. + ** Author: Dan Johnson + ** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. 
+ ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + ******************************************************************************/ + +#ifndef MATCHDEFS_H +#define MATCHDEFS_H + +#include + +#include // INT16_MAX +#include // int16_t + +namespace tesseract { + +/* define the maximum number of classes defined for any matcher + and the maximum class id for any matcher. This must be changed + if more different classes need to be classified */ +#define MAX_NUM_CLASSES INT16_MAX + +/** a CLASS_ID is the ascii character to be associated with a class */ +using CLASS_ID = UNICHAR_ID; +#define NO_CLASS (0) + +/** a PROTO_ID is the index of a prototype within it's class. Valid proto + id's are 0 to N-1 where N is the number of prototypes that make up the + class. */ +using PROTO_ID = int16_t; +#define NO_PROTO (-1) + +/** FEATURE_ID is the index of a feature within a character description + The feature id ranges from 0 to N-1 where N is the number + of features in a character description. */ +using FEATURE_ID = uint8_t; + +} // namespace tesseract + +#endif diff --git a/tesseract/src/dict/permdawg.cpp b/tesseract/src/dict/permdawg.cpp new file mode 100644 index 00000000..33f6b697 --- /dev/null +++ b/tesseract/src/dict/permdawg.cpp @@ -0,0 +1,390 @@ +/****************************************************************************** + * + * File: permdawg.cpp (Formerly permdawg.c) + * Description: Scale word choices by a dictionary + * Author: Mark Seaman, OCR Technology + * + * (c) Copyright 1987, Hewlett-Packard Company. 
+ ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *****************************************************************************/ +/*---------------------------------------------------------------------- + I n c l u d e s +----------------------------------------------------------------------*/ + +#include "dawg.h" +#include "stopper.h" +#include "tprintf.h" +#include "params.h" + +#include +#include +#include "dict.h" + +/*---------------------------------------------------------------------- + F u n c t i o n s +----------------------------------------------------------------------*/ +namespace tesseract { + +/** + * @name go_deeper_dawg_fxn + * + * If the choice being composed so far could be a dictionary word + * keep exploring choices. + */ +void Dict::go_deeper_dawg_fxn( + const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, + int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, + bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, + WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) { + auto *more_args = static_cast(void_more_args); + word_ending = (char_choice_index == char_choices.size()-1); + int word_index = word->length() - 1; + if (best_choice->rating() < *limit) return; + // Look up char in DAWG + + // If the current unichar is an ngram first try calling + // letter_is_okay() for each unigram it contains separately. 
+ UNICHAR_ID orig_uch_id = word->unichar_id(word_index); + bool checked_unigrams = false; + if (getUnicharset().get_isngram(orig_uch_id)) { + if (dawg_debug_level) { + tprintf("checking unigrams in an ngram %s\n", + getUnicharset().debug_str(orig_uch_id).c_str()); + } + int num_unigrams = 0; + word->remove_last_unichar_id(); + std::vector encoding; + const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); + // Since the string came out of the unicharset, failure is impossible. + ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, + nullptr)); + bool unigrams_ok = true; + // Construct DawgArgs that reflect the current state. + DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs); + DawgPositionVector unigram_updated_dawgs; + DawgArgs unigram_dawg_args(&unigram_active_dawgs, + &unigram_updated_dawgs, + more_args->permuter); + // Check unigrams in the ngram with letter_is_okay(). + for (int i = 0; unigrams_ok && i < encoding.size(); ++i) { + UNICHAR_ID uch_id = encoding[i]; + ASSERT_HOST(uch_id != INVALID_UNICHAR_ID); + ++num_unigrams; + word->append_unichar_id(uch_id, 1, 0.0, 0.0); + unigrams_ok = (this->*letter_is_okay_)( + &unigram_dawg_args, *word->unicharset(), + word->unichar_id(word_index+num_unigrams-1), + word_ending && i == encoding.size() - 1); + (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); + if (dawg_debug_level) { + tprintf("unigram %s is %s\n", + getUnicharset().debug_str(uch_id).c_str(), + unigrams_ok ? "OK" : "not OK"); + } + } + // Restore the word and copy the updated dawg state if needed. 
+    while (num_unigrams-- > 0) word->remove_last_unichar_id();  // Undo the per-unigram appends made above.
+    word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);  // Put the original ngram back.
+    if (unigrams_ok) {
+      checked_unigrams = true;
+      more_args->permuter = unigram_dawg_args.permuter;
+      *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);  // Propagate the dawg state reached via the unigrams.
+    }
+  }
+
+  // Check which dawgs from the dawgs_ vector contain the word
+  // up to and including the current unichar.
+  if (checked_unigrams || (this->*letter_is_okay_)(
+      more_args, *word->unicharset(), word->unichar_id(word_index),
+      word_ending)) {
+    // Add a new word choice
+    if (word_ending) {
+      if (dawg_debug_level) {
+        tprintf("found word = %s\n", word->debug_string().c_str());
+      }
+      if (strcmp(output_ambig_words_file.c_str(), "") != 0) {  // Optional ambiguity log requested via config.
+        if (output_ambig_words_file_ == nullptr) {  // Lazily open the log on the first accepted word.
+          output_ambig_words_file_ =
+              fopen(output_ambig_words_file.c_str(), "wb+");
+          if (output_ambig_words_file_ == nullptr) {
+            tprintf("Failed to open output_ambig_words_file %s\n",
+                    output_ambig_words_file.c_str());
+            exit(1);
+          }
+          STRING word_str;
+          word->string_and_lengths(&word_str, nullptr);
+          word_str += " ";
+          fprintf(output_ambig_words_file_, "%s", word_str.c_str());  // NOTE(review): the same word is written again just below, so the first word appears twice when the file is first opened -- confirm intended.
+        }
+        STRING word_str;
+        word->string_and_lengths(&word_str, nullptr);
+        word_str += " ";
+        fprintf(output_ambig_words_file_, "%s", word_str.c_str());
+      }
+      WERD_CHOICE *adjusted_word = word;  // Alias of word, not a copy.
+      adjusted_word->set_permuter(more_args->permuter);
+      update_best_choice(*adjusted_word, best_choice);
+    } else {  // search the next letter
+      // Make updated_* point to the next entries in the DawgPositionVector
+      // arrays (that were originally created in dawg_permute_and_select)
+      ++(more_args->updated_dawgs);
+      // Make active_dawgs and constraints point to the updated ones.
+      ++(more_args->active_dawgs);
+      permute_choices(debug, char_choices, char_choice_index + 1,
+                      prev_char_frag_info, word, certainties, limit,
+                      best_choice, attempts_left, more_args);
+      // Restore previous state to explore another letter in this position.
+ --(more_args->updated_dawgs); + --(more_args->active_dawgs); + } + } else { + if (dawg_debug_level) { + tprintf("last unichar not OK at index %d in %s\n", + word_index, word->debug_string().c_str()); + } + } +} + + +/** + * dawg_permute_and_select + * + * Recursively explore all the possible character combinations in + * the given char_choices. Use go_deeper_dawg_fxn() to search all the + * dawgs in the dawgs_ vector in parallel and discard invalid words. + * + * Allocate and return a WERD_CHOICE with the best valid word found. + */ +WERD_CHOICE *Dict::dawg_permute_and_select( + const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) { + auto *best_choice = new WERD_CHOICE(&getUnicharset()); + best_choice->make_bad(); + best_choice->set_rating(rating_limit); + if (char_choices.size() == 0 || char_choices.size() > MAX_WERD_LENGTH) + return best_choice; + auto *active_dawgs = + new DawgPositionVector[char_choices.size() + 1]; + init_active_dawgs(&(active_dawgs[0]), true); + DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); + WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH); + + float certainties[MAX_WERD_LENGTH]; + this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn; + int attempts_left = max_permuter_attempts; + permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, + char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice, + &attempts_left, &dawg_args); + delete[] active_dawgs; + return best_choice; +} + +/** + * permute_choices + * + * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST + * with the given char_choice_index in char_choices. 
+ */
+void Dict::permute_choices(
+    const char *debug,
+    const BLOB_CHOICE_LIST_VECTOR &char_choices,
+    int char_choice_index,
+    const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+    WERD_CHOICE *word,
+    float certainties[],
+    float *limit,
+    WERD_CHOICE *best_choice,
+    int *attempts_left,
+    void *more_args) {
+  if (debug) {
+    tprintf("%s permute_choices: char_choice_index=%d"
+            " limit=%g rating=%g, certainty=%g word=%s\n",
+            debug, char_choice_index, *limit, word->rating(),
+            word->certainty(), word->debug_string().c_str());
+  }
+  if (char_choice_index < char_choices.size()) {  // Recursion base case: do nothing past the last position.
+    BLOB_CHOICE_IT blob_choice_it;
+    blob_choice_it.set_to_list(char_choices.get(char_choice_index));
+    for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
+         blob_choice_it.forward()) {
+      (*attempts_left)--;  // Budget shared across the whole recursive search (see max_permuter_attempts).
+      append_choices(debug, char_choices, *(blob_choice_it.data()),
+                     char_choice_index, prev_char_frag_info, word,
+                     certainties, limit, best_choice, attempts_left, more_args);
+      if (*attempts_left <= 0) {
+        if (debug) tprintf("permute_choices(): attempts_left is 0\n");
+        break;
+      }
+    }
+  }
+}
+
+/**
+ * append_choices
+ *
+ * Checks to see whether or not the next choice is worth appending to
+ * the word being generated. If so then keeps going deeper into the word.
+ *
+ * This function assumes that Dict::go_deeper_fxn_ is set.
+ */
+void Dict::append_choices(
+    const char *debug,
+    const BLOB_CHOICE_LIST_VECTOR &char_choices,
+    const BLOB_CHOICE &blob_choice,
+    int char_choice_index,
+    const CHAR_FRAGMENT_INFO *prev_char_frag_info,
+    WERD_CHOICE *word,
+    float certainties[],
+    float *limit,
+    WERD_CHOICE *best_choice,
+    int *attempts_left,
+    void *more_args) {
+  int word_ending = (char_choice_index == char_choices.size() - 1);  // Nonzero iff this is the last blob position.
+
+  // Deal with fragments.
+  CHAR_FRAGMENT_INFO char_frag_info;  // Filled in by fragment_state_okay() below.
+  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
+                           blob_choice.certainty(), prev_char_frag_info, debug,
+                           word_ending, &char_frag_info)) {
+    return;  // blob_choice must be an invalid fragment
+  }
+  // Search the next letter if this character is a fragment (no complete unichar yet).
+  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
+    permute_choices(debug, char_choices, char_choice_index + 1,
+                    &char_frag_info, word, certainties, limit,
+                    best_choice, attempts_left, more_args);
+    return;
+  }
+
+  // Add the next unichar.
+  float old_rating = word->rating();
+  float old_certainty = word->certainty();
+  uint8_t old_permuter = word->permuter();
+  certainties[word->length()] = char_frag_info.certainty;
+  word->append_unichar_id_space_allocated(
+      char_frag_info.unichar_id, char_frag_info.num_fragments,
+      char_frag_info.rating, char_frag_info.certainty);
+
+  // Explore the next unichar.
+  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
+                          &char_frag_info, word_ending, word, certainties,
+                          limit, best_choice, attempts_left, more_args);
+
+  // Remove the unichar we added to explore other choices in its place.
+  word->remove_last_unichar_id();
+  word->set_rating(old_rating);
+  word->set_certainty(old_certainty);
+  word->set_permuter(old_permuter);  // Undo any permuter change made deeper in the recursion.
+}
+
+/**
+ * @name fragment_state
+ *
+ * Given the current char choice and information about previously seen
+ * fragments, determines whether adjacent character fragments are
+ * present and whether they can be concatenated.
+ * + * The given prev_char_frag_info contains: + * - fragment: if not nullptr contains information about immediately + * preceding fragmented character choice + * - num_fragments: number of fragments that have been used so far + * to construct a character + * - certainty: certainty of the current choice or minimum + * certainty of all fragments concatenated so far + * - rating: rating of the current choice or sum of fragment + * ratings concatenated so far + * + * The output char_frag_info is filled in as follows: + * - character: is set to be nullptr if the choice is a non-matching + * or non-ending fragment piece; is set to unichar of the given choice + * if it represents a regular character or a matching ending fragment + * - fragment,num_fragments,certainty,rating are set as described above + * + * @returns false if a non-matching fragment is discovered, true otherwise. + */ +bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, + float curr_rating, float curr_certainty, + const CHAR_FRAGMENT_INFO *prev_char_frag_info, + const char *debug, int word_ending, + CHAR_FRAGMENT_INFO *char_frag_info) { + const CHAR_FRAGMENT *this_fragment = + getUnicharset().get_fragment(curr_unichar_id); + const CHAR_FRAGMENT *prev_fragment = + prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr; + + // Print debug info for fragments. 
+ if (debug && (prev_fragment || this_fragment)) { + tprintf("%s check fragments: choice=%s word_ending=%d\n", debug, + getUnicharset().debug_str(curr_unichar_id).c_str(), + word_ending); + if (prev_fragment) { + tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str()); + } + if (this_fragment) { + tprintf("this_fragment %s\n", this_fragment->to_string().c_str()); + } + } + + char_frag_info->unichar_id = curr_unichar_id; + char_frag_info->fragment = this_fragment; + char_frag_info->rating = curr_rating; + char_frag_info->certainty = curr_certainty; + char_frag_info->num_fragments = 1; + if (prev_fragment && !this_fragment) { + if (debug) tprintf("Skip choice with incomplete fragment\n"); + return false; + } + if (this_fragment) { + // We are dealing with a fragment. + char_frag_info->unichar_id = INVALID_UNICHAR_ID; + if (prev_fragment) { + if (!this_fragment->is_continuation_of(prev_fragment)) { + if (debug) tprintf("Non-matching fragment piece\n"); + return false; + } + if (this_fragment->is_ending()) { + char_frag_info->unichar_id = + getUnicharset().unichar_to_id(this_fragment->get_unichar()); + char_frag_info->fragment = nullptr; + if (debug) { + tprintf("Built character %s from fragments\n", + getUnicharset().debug_str( + char_frag_info->unichar_id).c_str()); + } + } else { + if (debug) tprintf("Record fragment continuation\n"); + char_frag_info->fragment = this_fragment; + } + // Update certainty and rating. 
+ char_frag_info->rating = + prev_char_frag_info->rating + curr_rating; + char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1; + char_frag_info->certainty = + std::min(curr_certainty, prev_char_frag_info->certainty); + } else { + if (this_fragment->is_beginning()) { + if (debug) tprintf("Record fragment beginning\n"); + } else { + if (debug) { + tprintf("Non-starting fragment piece with no prev_fragment\n"); + } + return false; + } + } + } + if (word_ending && char_frag_info->fragment) { + if (debug) tprintf("Word can not end with a fragment\n"); + return false; + } + return true; +} + +} // namespace tesseract diff --git a/tesseract/src/dict/stopper.cpp b/tesseract/src/dict/stopper.cpp new file mode 100644 index 00000000..36207d00 --- /dev/null +++ b/tesseract/src/dict/stopper.cpp @@ -0,0 +1,515 @@ +/****************************************************************************** + ** Filename: stopper.c + ** Purpose: Stopping criteria for word classifier. + ** Author: Dan Johnson + ** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+ ******************************************************************************/ + +#include +#include +#include +#include + +#include "stopper.h" +#ifndef DISABLED_LEGACY_ENGINE +#include "ambigs.h" +#endif +#include "ccutil.h" +#include "dict.h" +#include "helpers.h" +#include "matchdefs.h" +#include "pageres.h" +#include "params.h" +#include "ratngs.h" +#include + +/*---------------------------------------------------------------------------- + Private Code +----------------------------------------------------------------------------*/ + +namespace tesseract { + +bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice, + XHeightConsistencyEnum xheight_consistency) { + float CertaintyThreshold = stopper_nondict_certainty_base; + int WordSize; + + if (stopper_no_acceptable_choices) return false; + + if (best_choice.length() == 0) return false; + + bool no_dang_ambigs = !best_choice.dangerous_ambig_found(); + bool is_valid_word = valid_word_permuter(best_choice.permuter(), false); + bool is_case_ok = case_ok(best_choice); + + if (stopper_debug_level >= 1) { + const char *xht = "UNKNOWN"; + switch (xheight_consistency) { + case XH_GOOD: xht = "NORMAL"; break; + case XH_SUBNORMAL: xht = "SUBNORMAL"; break; + case XH_INCONSISTENT: xht = "INCONSISTENT"; break; + default: xht = "UNKNOWN"; + } + tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n", + best_choice.unichar_string().c_str(), + (is_valid_word ? 'y' : 'n'), + (is_case_ok ? 'y' : 'n'), + xht, + best_choice.min_x_height(), + best_choice.max_x_height()); + } + // Do not accept invalid words in PASS1. 
+ if (reject_offset_ <= 0.0f && !is_valid_word) return false; + if (is_valid_word && is_case_ok) { + WordSize = LengthOfShortestAlphaRun(best_choice); + WordSize -= stopper_smallword_size; + if (WordSize < 0) + WordSize = 0; + CertaintyThreshold += WordSize * stopper_certainty_per_char; + } + + if (stopper_debug_level >= 1) + tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n", + best_choice.rating(), best_choice.certainty(), CertaintyThreshold); + + if (no_dang_ambigs && + best_choice.certainty() > CertaintyThreshold && + xheight_consistency < XH_INCONSISTENT && + UniformCertainties(best_choice)) { + return true; + } else { + if (stopper_debug_level >= 1) { + tprintf("AcceptableChoice() returned false" + " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n", + no_dang_ambigs, best_choice.certainty(), + CertaintyThreshold, + UniformCertainties(best_choice)); + } + return false; + } +} + +bool Dict::AcceptableResult(WERD_RES *word) const { + if (word->best_choice == nullptr) return false; + float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; + int WordSize; + + if (stopper_debug_level >= 1) { + tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n", + word->best_choice->debug_string().c_str(), + (valid_word(*word->best_choice) ? 'y' : 'n'), + (case_ok(*word->best_choice) ? 'y' : 'n'), + word->best_choice->dangerous_ambig_found() ? 'n' : 'y', + word->best_choices.singleton() ? 
'n' : 'y'); + } + + if (word->best_choice->length() == 0 || !word->best_choices.singleton()) + return false; + if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) { + WordSize = LengthOfShortestAlphaRun(*word->best_choice); + WordSize -= stopper_smallword_size; + if (WordSize < 0) + WordSize = 0; + CertaintyThreshold += WordSize * stopper_certainty_per_char; + } + + if (stopper_debug_level >= 1) + tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", + word->best_choice->certainty(), CertaintyThreshold); + + if (word->best_choice->certainty() > CertaintyThreshold && + !stopper_no_acceptable_choices) { + if (stopper_debug_level >= 1) + tprintf("ACCEPTED\n"); + return true; + } else { + if (stopper_debug_level >= 1) + tprintf("REJECTED\n"); + return false; + } +} + +#if !defined(DISABLED_LEGACY_ENGINE) + +bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, + DANGERR *fixpt, + bool fix_replaceable, + MATRIX *ratings) { + if (stopper_debug_level > 2) { + tprintf("\nRunning NoDangerousAmbig() for %s\n", + best_choice->debug_string().c_str()); + } + + // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities + // for each unichar id in BestChoice. + BLOB_CHOICE_LIST_VECTOR ambig_blob_choices; + int i; + bool ambigs_found = false; + // For each position in best_choice: + // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i] + // -- initialize wrong_ngram with a single unichar_id at best_choice[i] + // -- look for ambiguities corresponding to wrong_ngram in the list while + // adding the following unichar_ids from best_choice to wrong_ngram + // + // Repeat the above procedure twice: first time look through + // ambigs to be replaced and replace all the ambiguities found; + // second time look through dangerous ambiguities and construct + // ambig_blob_choices with fake a blob choice for each ambiguity + // and pass them to dawg_permute_and_select() to search for + // ambiguous words in the dictionaries. 
+ // + // Note that during the execution of the for loop (on the first pass) + // if replacements are made the length of best_choice might change. + for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) { + bool replace = (fix_replaceable && pass == 0); + const UnicharAmbigsVector &table = replace ? + getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs(); + if (!replace) { + // Initialize ambig_blob_choices with lists containing a single + // unichar id for the corresponding position in best_choice. + // best_choice consisting from only the original letters will + // have a rating of 0.0. + for (i = 0; i < best_choice->length(); ++i) { + auto *lst = new BLOB_CHOICE_LIST(); + BLOB_CHOICE_IT lst_it(lst); + // TODO(rays/antonova) Put real xheights and y shifts here. + lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i), + 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); + ambig_blob_choices.push_back(lst); + } + } + UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; + int wrong_ngram_index; + int next_index; + int blob_index = 0; + for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i), + ++i) { + UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i); + if (stopper_debug_level > 2) { + tprintf("Looking for %s ngrams starting with %s:\n", + replace ? 
"replaceable" : "ambiguous", + getUnicharset().debug_str(curr_unichar_id).c_str()); + } + int num_wrong_blobs = best_choice->state(i); + wrong_ngram_index = 0; + wrong_ngram[wrong_ngram_index] = curr_unichar_id; + if (curr_unichar_id == INVALID_UNICHAR_ID || + curr_unichar_id >= table.size() || + table[curr_unichar_id] == nullptr) { + continue; // there is no ambig spec for this unichar id + } + AmbigSpec_IT spec_it(table[curr_unichar_id]); + for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) { + const AmbigSpec *ambig_spec = spec_it.data(); + wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID; + int compare = UnicharIdArrayUtils::compare(wrong_ngram, + ambig_spec->wrong_ngram); + if (stopper_debug_level > 2) { + tprintf("candidate ngram: "); + UnicharIdArrayUtils::print(wrong_ngram, getUnicharset()); + tprintf("current ngram from spec: "); + UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset()); + tprintf("comparison result: %d\n", compare); + } + if (compare == 0) { + // Record the place where we found an ambiguity. 
+ if (fixpt != nullptr) { + UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0]; + fixpt->push_back(DANGERR_INFO( + blob_index, blob_index + num_wrong_blobs, replace, + getUnicharset().get_isngram(ambig_spec->correct_ngram_id), + leftmost_id)); + if (stopper_debug_level > 1) { + tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, + blob_index + num_wrong_blobs, false, + getUnicharset().get_isngram( + ambig_spec->correct_ngram_id), + getUnicharset().id_to_unichar(leftmost_id)); + } + } + + if (replace) { + if (stopper_debug_level > 2) { + tprintf("replace ambiguity with %s : ", + getUnicharset().id_to_unichar( + ambig_spec->correct_ngram_id)); + UnicharIdArrayUtils::print( + ambig_spec->correct_fragments, getUnicharset()); + } + ReplaceAmbig(i, ambig_spec->wrong_ngram_size, + ambig_spec->correct_ngram_id, + best_choice, ratings); + } else if (i > 0 || ambig_spec->type != CASE_AMBIG) { + // We found dang ambig - update ambig_blob_choices. + if (stopper_debug_level > 2) { + tprintf("found ambiguity: "); + UnicharIdArrayUtils::print( + ambig_spec->correct_fragments, getUnicharset()); + } + ambigs_found = true; + for (int tmp_index = 0; tmp_index <= wrong_ngram_index; + ++tmp_index) { + // Add a blob choice for the corresponding fragment of the + // ambiguity. These fake blob choices are initialized with + // negative ratings (which are not possible for real blob + // choices), so that dawg_permute_and_select() considers any + // word not consisting of only the original letters a better + // choice and stops searching for alternatives once such a + // choice is found. 
+ BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]); + bc_it.add_to_end(new BLOB_CHOICE( + ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, + -1, 0, 1, 0, BCC_AMBIG)); + } + } + spec_it.forward(); + } else if (compare == -1) { + if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size && + ((next_index = wrong_ngram_index+1+i) < best_choice->length())) { + // Add the next unichar id to wrong_ngram and keep looking for + // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST. + wrong_ngram[++wrong_ngram_index] = + best_choice->unichar_id(next_index); + num_wrong_blobs += best_choice->state(next_index); + } else { + break; // no more matching ambigs in this AMBIG_SPEC_LIST + } + } else { + spec_it.forward(); + } + } // end searching AmbigSpec_LIST + } // end searching best_choice + } // end searching replace and dangerous ambigs + + // If any ambiguities were found permute the constructed ambig_blob_choices + // to see if an alternative dictionary word can be found. + if (ambigs_found) { + if (stopper_debug_level > 2) { + tprintf("\nResulting ambig_blob_choices:\n"); + for (i = 0; i < ambig_blob_choices.size(); ++i) { + print_ratings_list("", ambig_blob_choices.get(i), getUnicharset()); + tprintf("\n"); + } + } + WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0); + ambigs_found = (alt_word->rating() < 0.0); + if (ambigs_found) { + if (stopper_debug_level >= 1) { + tprintf ("Stopper: Possible ambiguous word = %s\n", + alt_word->debug_string().c_str()); + } + if (fixpt != nullptr) { + // Note: Currently character choices combined from fragments can only + // be generated by NoDangrousAmbigs(). This code should be updated if + // the capability to produce classifications combined from character + // fragments is added to other functions. 
+ int orig_i = 0; + for (i = 0; i < alt_word->length(); ++i) { + const UNICHARSET &uchset = getUnicharset(); + bool replacement_is_ngram = + uchset.get_isngram(alt_word->unichar_id(i)); + UNICHAR_ID leftmost_id = alt_word->unichar_id(i); + if (replacement_is_ngram) { + // we have to extract the leftmost unichar from the ngram. + const char *str = uchset.id_to_unichar(leftmost_id); + int step = uchset.step(str); + if (step) leftmost_id = uchset.unichar_to_id(str, step); + } + int end_i = orig_i + alt_word->state(i); + if (alt_word->state(i) > 1 || + (orig_i + 1 == end_i && replacement_is_ngram)) { + // Compute proper blob indices. + int blob_start = 0; + for (int j = 0; j < orig_i; ++j) + blob_start += best_choice->state(j); + int blob_end = blob_start; + for (int j = orig_i; j < end_i; ++j) + blob_end += best_choice->state(j); + fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true, + replacement_is_ngram, leftmost_id)); + if (stopper_debug_level > 1) { + tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, + true, replacement_is_ngram, + uchset.id_to_unichar(leftmost_id)); + } + } + orig_i += alt_word->state(i); + } + } + } + delete alt_word; + } + if (output_ambig_words_file_ != nullptr) { + fprintf(output_ambig_words_file_, "\n"); + } + + ambig_blob_choices.delete_data_pointers(); + return !ambigs_found; +} + +void Dict::EndDangerousAmbigs() {} + +#endif // !defined(DISABLED_LEGACY_ENGINE) + +void Dict::SettupStopperPass1() { + reject_offset_ = 0.0; +} + +void Dict::SettupStopperPass2() { + reject_offset_ = stopper_phase2_certainty_rejection_offset; +} + +void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, + UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, + MATRIX *ratings) { + int num_blobs_to_replace = 0; + int begin_blob_index = 0; + int i; + // Rating and certainty for the new BLOB_CHOICE are derived from the + // replaced choices. 
+ float new_rating = 0.0f; + float new_certainty = 0.0f; + BLOB_CHOICE* old_choice = nullptr; + for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { + if (i >= wrong_ngram_begin_index) { + int num_blobs = werd_choice->state(i); + int col = begin_blob_index + num_blobs_to_replace; + int row = col + num_blobs - 1; + BLOB_CHOICE_LIST* choices = ratings->get(col, row); + ASSERT_HOST(choices != nullptr); + old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices); + ASSERT_HOST(old_choice != nullptr); + new_rating += old_choice->rating(); + new_certainty += old_choice->certainty(); + num_blobs_to_replace += num_blobs; + } else { + begin_blob_index += werd_choice->state(i); + } + } + new_certainty /= wrong_ngram_size; + // If there is no entry in the ratings matrix, add it. + MATRIX_COORD coord(begin_blob_index, + begin_blob_index + num_blobs_to_replace - 1); + if (!coord.Valid(*ratings)) { + ratings->IncreaseBandSize(coord.row - coord.col + 1); + } + if (ratings->get(coord.col, coord.row) == nullptr) + ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST); + BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row); + BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices); + if (choice != nullptr) { + // Already there. Upgrade if new rating better. + if (new_rating < choice->rating()) + choice->set_rating(new_rating); + if (new_certainty < choice->certainty()) + choice->set_certainty(new_certainty); + // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState. + } else { + // Need a new choice with the correct_ngram_id. + choice = new BLOB_CHOICE(*old_choice); + choice->set_unichar_id(correct_ngram_id); + choice->set_rating(new_rating); + choice->set_certainty(new_certainty); + choice->set_classifier(BCC_AMBIG); + choice->set_matrix_cell(coord.col, coord.row); + BLOB_CHOICE_IT it (new_choices); + it.add_to_end(choice); + } + // Remove current unichar from werd_choice. 
 On the last iteration
+ // set the correct replacement unichar instead of removing a unichar.
+ for (int replaced_count = 0; replaced_count < wrong_ngram_size;
+ ++replaced_count) {
+ if (replaced_count + 1 == wrong_ngram_size) {
+ werd_choice->set_blob_choice(wrong_ngram_begin_index,
+ num_blobs_to_replace, choice);
+ } else {
+ werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
+ }
+ }
+ if (stopper_debug_level >= 1) {
+ werd_choice->print("ReplaceAmbig() ");
+ tprintf("Modified blob_choices: ");
+ print_ratings_list("\n", new_choices, getUnicharset());
+ }
+}
+
+int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
+ int shortest = INT32_MAX;
+ int curr_len = 0;
+ for (int w = 0; w < WordChoice.length(); ++w) {
+ if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
+ curr_len++;
+ } else if (curr_len > 0) {
+ if (curr_len < shortest) shortest = curr_len;
+ curr_len = 0;
+ }
+ }
+ if (curr_len > 0 && curr_len < shortest) {
+ shortest = curr_len;
+ } else if (shortest == INT32_MAX) {
+ shortest = 0;
+ }
+ return shortest;
+}
+
+int Dict::UniformCertainties(const WERD_CHOICE& word) {
+ float Certainty;
+ float WorstCertainty = FLT_MAX;
+ float CertaintyThreshold;
+ double TotalCertainty;
+ double TotalCertaintySquared;
+ double Variance;
+ float Mean, StdDev;
+ int word_length = word.length();
+
+ if (word_length < 3)
+ return true;
+
+ TotalCertainty = TotalCertaintySquared = 0.0;
+ for (int i = 0; i < word_length; ++i) {
+ Certainty = word.certainty(i);
+ TotalCertainty += Certainty;
+ TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
+ if (Certainty < WorstCertainty)
+ WorstCertainty = Certainty;
+ }
+
+ // Subtract off worst certainty from statistics.
+ word_length--;
+ TotalCertainty -= WorstCertainty;
+ TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
+
+ Mean = TotalCertainty / word_length;
+ Variance = ((word_length * TotalCertaintySquared -
+ TotalCertainty * TotalCertainty) /
+ (word_length * (word_length - 1)));
+ if (Variance < 0.0)
+ Variance = 0.0;
+ StdDev = sqrt(Variance);
+
+ CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
+ if (CertaintyThreshold > stopper_nondict_certainty_base)
+ CertaintyThreshold = stopper_nondict_certainty_base;
+
+ if (word.certainty() < CertaintyThreshold) {
+ if (stopper_debug_level >= 1)
+ tprintf("Stopper: Non-uniform certainty = %4.1f"
+ " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
+ word.certainty(), Mean, StdDev, CertaintyThreshold);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/dict/stopper.h b/tesseract/src/dict/stopper.h
new file mode 100644
index 00000000..d7df3988
--- /dev/null
+++ b/tesseract/src/dict/stopper.h
@@ -0,0 +1,50 @@
+/******************************************************************************
+ ** Filename: stopper.h
+ ** Purpose: Stopping criteria for word classifier.
+ ** Author: Dan Johnson
+ ** History: Wed May 1 09:42:57 1991, DSJ, Created.
+ **
+ ** (c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ ******************************************************************************/
+#ifndef STOPPER_H
+#define STOPPER_H
+
+#include "params.h"
+#include "ratngs.h"
+
+#include "genericvector.h"
+#include <cstdint>
+
+namespace tesseract {
+
+class WERD_CHOICE;
+
+using BLOB_WIDTH = uint8_t;
+
+struct DANGERR_INFO {
+ DANGERR_INFO() :
+ begin(-1), end(-1), dangerous(false), correct_is_ngram(false),
+ leftmost(INVALID_UNICHAR_ID) {}
+ DANGERR_INFO(int b, int e, bool d, bool n, UNICHAR_ID l) :
+ begin(b), end(e), dangerous(d), correct_is_ngram(n), leftmost(l) {}
+ int begin;
+ int end;
+ bool dangerous;
+ bool correct_is_ngram;
+ UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?
+};
+
+using DANGERR = GenericVector<DANGERR_INFO>;
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/dict/trie.cpp b/tesseract/src/dict/trie.cpp
new file mode 100644
index 00000000..e9edbbaf
--- /dev/null
+++ b/tesseract/src/dict/trie.cpp
@@ -0,0 +1,716 @@
+/******************************************************************************
+ *
+ * File: trie.cpp (Formerly trie.c)
+ * Description: Functions to build a trie data structure.
+ * Author: Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ * + *****************************************************************************/ +/*---------------------------------------------------------------------- + I n c l u d e s +----------------------------------------------------------------------*/ + +#include "trie.h" + +#include "dawg.h" +#include "dict.h" +#include "genericvector.h" +#include "helpers.h" +#include "kdpair.h" + +namespace tesseract { + +const char kDoNotReverse[] = "RRP_DO_NO_REVERSE"; +const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL"; +const char kForceReverse[] = "RRP_FORCE_REVERSE"; + +const char * const RTLReversePolicyNames[] = { + kDoNotReverse, + kReverseIfHasRTL, + kForceReverse +}; + +const char Trie::kAlphaPatternUnicode[] = "\u2000"; +const char Trie::kDigitPatternUnicode[] = "\u2001"; +const char Trie::kAlphanumPatternUnicode[] = "\u2002"; +const char Trie::kPuncPatternUnicode[] = "\u2003"; +const char Trie::kLowerPatternUnicode[] = "\u2004"; +const char Trie::kUpperPatternUnicode[] = "\u2005"; + +const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { + return RTLReversePolicyNames[reverse_policy]; +} + +// Reset the Trie to empty. +void Trie::clear() { + nodes_.delete_data_pointers(); + nodes_.clear(); + root_back_freelist_.clear(); + num_edges_ = 0; + new_dawg_node(); // Need to allocate node 0. +} + +bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, + int direction, bool word_end, UNICHAR_ID unichar_id, + EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const { + if (debug_level_ == 3) { + tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT + " direction %d word_end %d unichar_id %d, exploring node:\n", + node_ref, next_node, direction, word_end, unichar_id); + if (node_ref != NO_EDGE) { + print_node(node_ref, nodes_[node_ref]->forward_edges.size()); + } + } + if (node_ref == NO_EDGE) return false; + assert(node_ref < nodes_.size()); + EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? 
+ nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges; + int vec_size = vec.size(); + if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search + EDGE_INDEX start = 0; + EDGE_INDEX end = vec_size - 1; + EDGE_INDEX k; + int compare; + while (start <= end) { + k = (start + end) >> 1; // (start + end) / 2 + compare = given_greater_than_edge_rec(next_node, word_end, + unichar_id, vec[k]); + if (compare == 0) { // given == vec[k] + *edge_ptr = &(vec[k]); + *edge_index = k; + return true; + } else if (compare == 1) { // given > vec[k] + start = k + 1; + } else { // given < vec[k] + end = k - 1; + } + } + } else { // linear search + for (int i = 0; i < vec_size; ++i) { + EDGE_RECORD &edge_rec = vec[i]; + if (edge_rec_match(next_node, word_end, unichar_id, + next_node_from_edge_rec(edge_rec), + end_of_word_from_edge_rec(edge_rec), + unichar_id_from_edge_rec(edge_rec))) { + *edge_ptr = &(edge_rec); + *edge_index = i; + return true; + } + } + } + return false; // not found +} + +bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, + int direction, bool word_end, + UNICHAR_ID unichar_id) { + EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? 
+ &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
+ int search_index;
+ if (node1 == 0 && direction == FORWARD_EDGE) {
+ search_index = 0; // find the index to make the add sorted
+ while (search_index < vec->size() &&
+ given_greater_than_edge_rec(node2, word_end, unichar_id,
+ (*vec)[search_index]) == 1) {
+ search_index++;
+ }
+ } else {
+ search_index = vec->size(); // add is unsorted, so index does not matter
+ }
+ EDGE_RECORD edge_rec;
+ link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
+ if (node1 == 0 && direction == BACKWARD_EDGE &&
+ !root_back_freelist_.empty()) {
+ EDGE_INDEX edge_index = root_back_freelist_.pop_back();
+ (*vec)[edge_index] = edge_rec;
+ } else if (search_index < vec->size()) {
+ vec->insert(edge_rec, search_index);
+ } else {
+ vec->push_back(edge_rec);
+ }
+ if (debug_level_ > 1) {
+ tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
+ print_edge_rec(edge_rec);
+ tprintf("\n");
+ }
+ num_edges_++;
+ return true;
+}
+
+void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
+ NODE_REF the_next_node,
+ bool marker_flag,
+ UNICHAR_ID unichar_id) {
+ EDGE_RECORD *back_edge_ptr;
+ EDGE_INDEX back_edge_index;
+ ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
+ unichar_id, &back_edge_ptr, &back_edge_index));
+ if (marker_flag) {
+ *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
+ *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
+ }
+ // Mark both directions as end of word.
+ *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
+ *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
+}
+
+bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
+ const GenericVector<bool> *repetitions) {
+ if (word.length() <= 0) return false; // can't add empty words
+ if (repetitions != nullptr) ASSERT_HOST(repetitions->size() == word.length());
+ // Make sure the word does not contain invalid unichar ids.
+ for (int i = 0; i < word.length(); ++i) { + if (word.unichar_id(i) < 0 || + word.unichar_id(i) >= unicharset_size_) return false; + } + + EDGE_RECORD *edge_ptr; + NODE_REF last_node = 0; + NODE_REF the_next_node; + bool marker_flag = false; + EDGE_INDEX edge_index; + int i; + int32_t still_finding_chars = true; + int32_t word_end = false; + bool add_failed = false; + bool found; + + if (debug_level_ > 1) word.print("\nAdding word: "); + + UNICHAR_ID unichar_id; + for (i = 0; i < word.length() - 1; ++i) { + unichar_id = word.unichar_id(i); + marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false; + if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); + if (still_finding_chars) { + found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, + unichar_id, &edge_ptr, &edge_index); + if (found && debug_level_ > 1) { + tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", + edge_index, last_node); + } + if (!found) { + still_finding_chars = false; + } else if (next_node_from_edge_rec(*edge_ptr) == 0) { + // We hit the end of an existing word, but the new word is longer. + // In this case we have to disconnect the existing word from the + // backwards root node, mark the current position as end-of-word + // and add new nodes for the increased length. Disconnecting the + // existing word from the backwards root node requires a linear + // search, so it is much faster to add the longest words first, + // to avoid having to come here. + word_end = true; + still_finding_chars = false; + remove_edge(last_node, 0, word_end, unichar_id); + } else { + // We have to add a new branch here for the new word. 
+ if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr); + last_node = next_node_from_edge_rec(*edge_ptr); + } + } + if (!still_finding_chars) { + the_next_node = new_dawg_node(); + if (debug_level_ > 1) + tprintf("adding node " REFFORMAT "\n", the_next_node); + if (the_next_node == 0) { + add_failed = true; + break; + } + if (!add_new_edge(last_node, the_next_node, + marker_flag, word_end, unichar_id)) { + add_failed = true; + break; + } + word_end = false; + last_node = the_next_node; + } + } + the_next_node = 0; + unichar_id = word.unichar_id(i); + marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false; + if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); + if (still_finding_chars && + edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, + unichar_id, &edge_ptr, &edge_index)) { + // An extension of this word already exists in the trie, so we + // only have to add the ending flags in both directions. + add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), + marker_flag, unichar_id); + } else { + // Add a link to node 0. All leaves connect to node 0 so the back links can + // be used in reduction to a dawg. This root backward node has one edge + // entry for every word, (except prefixes of longer words) so it is huge. 
+ if (!add_failed &&
+ !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
+ add_failed = true;
+ }
+ if (add_failed) {
+ tprintf("Re-initializing document dictionary...\n");
+ clear();
+ return false;
+ } else {
+ return true;
+ }
+}
+
+NODE_REF Trie::new_dawg_node() {
+ auto *node = new TRIE_NODE_RECORD();
+ nodes_.push_back(node);
+ return nodes_.size() - 1;
+}
+
+bool Trie::read_and_add_word_list(const char *filename,
+ const UNICHARSET &unicharset,
+ Trie::RTLReversePolicy reverse_policy) {
+ std::vector<STRING> word_list;
+ if (!read_word_list(filename, &word_list)) return false;
+ std::sort(word_list.begin(), word_list.end(), [](auto &s1, auto &s2) {
+ return s1.size() > s2.size();
+ });
+ return add_word_list(word_list, unicharset, reverse_policy);
+}
+
+bool Trie::read_word_list(const char *filename,
+ std::vector<STRING>* words) {
+ FILE *word_file;
+ char line_str[CHARS_PER_LINE];
+ int word_count = 0;
+
+ word_file = fopen(filename, "rb");
+ if (word_file == nullptr) return false;
+
+ while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
+ chomp_string(line_str); // remove newline
+ STRING word_str(line_str);
+ ++word_count;
+ if (debug_level_ && word_count % 10000 == 0)
+ tprintf("Read %d words so far\n", word_count);
+ words->push_back(word_str);
+ }
+ if (debug_level_)
+ tprintf("Read %d words total.\n", word_count);
+ fclose(word_file);
+ return true;
+}
+
+bool Trie::add_word_list(const std::vector<STRING> &words,
+ const UNICHARSET &unicharset,
+ Trie::RTLReversePolicy reverse_policy) {
+ for (int i = 0; i < words.size(); ++i) {
+ WERD_CHOICE word(words[i].c_str(), unicharset);
+ if (word.length() == 0 || word.contains_unichar_id(INVALID_UNICHAR_ID))
+ continue;
+ if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
+ word.has_rtl_unichar_id()) ||
+ reverse_policy == RRP_FORCE_REVERSE) {
+ word.reverse_and_mirror_unichar_ids();
+ }
+ if (!word_in_dawg(word)) {
+ add_word_to_dawg(word);
+ if (!word_in_dawg(word)) {
+ tprintf("Error: word '%s'
 not in DAWG after adding it\n",
+ words[i].c_str());
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void Trie::initialize_patterns(UNICHARSET *unicharset) {
+ unicharset->unichar_insert(kAlphaPatternUnicode);
+ alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);
+ unicharset->unichar_insert(kDigitPatternUnicode);
+ digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);
+ unicharset->unichar_insert(kAlphanumPatternUnicode);
+ alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);
+ unicharset->unichar_insert(kPuncPatternUnicode);
+ punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);
+ unicharset->unichar_insert(kLowerPatternUnicode);
+ lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);
+ unicharset->unichar_insert(kUpperPatternUnicode);
+ upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
+ initialized_patterns_ = true;
+ unicharset_size_ = unicharset->size();
+}
+
+void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
+ const UNICHARSET &unicharset,
+ GenericVector<UNICHAR_ID> *vec) const {
+ bool is_alpha = unicharset.get_isalpha(unichar_id);
+ if (is_alpha) {
+ vec->push_back(alpha_pattern_);
+ vec->push_back(alphanum_pattern_);
+ if (unicharset.get_islower(unichar_id)) {
+ vec->push_back(lower_pattern_);
+ } else if (unicharset.get_isupper(unichar_id)) {
+ vec->push_back(upper_pattern_);
+ }
+ }
+ if (unicharset.get_isdigit(unichar_id)) {
+ vec->push_back(digit_pattern_);
+ if (!is_alpha) vec->push_back(alphanum_pattern_);
+ }
+ if (unicharset.get_ispunctuation(unichar_id)) {
+ vec->push_back(punc_pattern_);
+ }
+}
+
+UNICHAR_ID Trie::character_class_to_pattern(char ch) {
+ if (ch == 'c') {
+ return alpha_pattern_;
+ } else if (ch == 'd') {
+ return digit_pattern_;
+ } else if (ch == 'n') {
+ return alphanum_pattern_;
+ } else if (ch == 'p') {
+ return punc_pattern_;
+ } else if (ch == 'a') {
+ return lower_pattern_;
+ } else if (ch == 'A') {
+ return upper_pattern_;
+ } else {
+ return
 INVALID_UNICHAR_ID;
+ }
+}
+
+bool Trie::read_pattern_list(const char *filename,
+ const UNICHARSET &unicharset) {
+ if (!initialized_patterns_) {
+ tprintf("please call initialize_patterns() before read_pattern_list()\n");
+ return false;
+ }
+
+ FILE *pattern_file = fopen(filename, "rb");
+ if (pattern_file == nullptr) {
+ tprintf("Error opening pattern file %s\n", filename);
+ return false;
+ }
+
+ int pattern_count = 0;
+ char string[CHARS_PER_LINE];
+ while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
+ chomp_string(string); // remove newline
+ // Parse the pattern and construct a unichar id vector.
+ // Record the number of repetitions of each unichar in the parallel vector.
+ WERD_CHOICE word(&unicharset);
+ GenericVector<bool> repetitions_vec;
+ const char *str_ptr = string;
+ int step = unicharset.step(str_ptr);
+ bool failed = false;
+ while (step > 0) {
+ UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
+ if (step == 1 && *str_ptr == '\\') {
+ ++str_ptr;
+ if (*str_ptr == '\\') { // regular '\' unichar that was escaped
+ curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
+ } else {
+ if (word.length() < kSaneNumConcreteChars) {
+ tprintf("Please provide at least %d concrete characters at the"
+ " beginning of the pattern\n", kSaneNumConcreteChars);
+ failed = true;
+ break;
+ }
+ // Parse character class from expression.
+ curr_unichar_id = character_class_to_pattern(*str_ptr);
+ }
+ } else {
+ curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
+ }
+ if (curr_unichar_id == INVALID_UNICHAR_ID) {
+ failed = true;
+ break; // failed to parse this pattern
+ }
+ word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
+ repetitions_vec.push_back(false);
+ str_ptr += step;
+ step = unicharset.step(str_ptr);
+ // Check if there is a repetition pattern specified after this unichar.
+ if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') { + repetitions_vec[repetitions_vec.size()-1] = true; + str_ptr += 2; + step = unicharset.step(str_ptr); + } + } + if (failed) { + tprintf("Invalid user pattern %s\n", string); + continue; + } + // Insert the pattern into the trie. + if (debug_level_ > 2) { + tprintf("Inserting expanded user pattern %s\n", + word.debug_string().c_str()); + } + if (!this->word_in_dawg(word)) { + this->add_word_to_dawg(word, &repetitions_vec); + if (!this->word_in_dawg(word)) { + tprintf("Error: failed to insert pattern '%s'\n", string); + } + } + ++pattern_count; + } + if (debug_level_) { + tprintf("Read %d valid patterns from %s\n", pattern_count, filename); + } + fclose(pattern_file); + return true; +} + +void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, + bool word_end, UNICHAR_ID unichar_id) { + EDGE_RECORD *edge_ptr = nullptr; + EDGE_INDEX edge_index = 0; + ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, + unichar_id, &edge_ptr, &edge_index)); + if (debug_level_ > 1) { + tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1); + print_edge_rec(*edge_ptr); + tprintf("\n"); + } + if (direction == FORWARD_EDGE) { + nodes_[node1]->forward_edges.remove(edge_index); + } else if (node1 == 0) { + KillEdge(&nodes_[node1]->backward_edges[edge_index]); + root_back_freelist_.push_back(edge_index); + } else { + nodes_[node1]->backward_edges.remove(edge_index); + } + --num_edges_; +} + +// Some optimizations employed in add_word_to_dawg and trie_to_dawg: +// 1 Avoid insertion sorting or bubble sorting the tail root node +// (back links on node 0, a list of all the leaves.). The node is +// huge, and sorting it with n^2 time is terrible. +// 2 Avoid using GenericVector::remove on the tail root node. +// (a) During add of words to the trie, zero-out the unichars and +// keep a freelist of spaces to re-use. 
+// (b) During reduction, just zero-out the unichars of deleted back
+// links, skipping zero entries while searching.
+// 3 Avoid linear search of the tail root node. This has to be done when
+// a suffix is added to an existing word. Adding words by decreasing
+// length avoids this problem entirely. Words can still be added in
+// any order, but it is faster to add the longest first.
+SquishedDawg *Trie::trie_to_dawg() {
+ root_back_freelist_.clear(); // Will be invalidated by trie_to_dawg.
+ if (debug_level_ > 2) {
+ print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
+ }
+ auto reduced_nodes = new bool[nodes_.size()];
+ for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = false;
+ this->reduce_node_input(0, reduced_nodes);
+ delete[] reduced_nodes;
+
+ if (debug_level_ > 2) {
+ print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
+ }
+ // Build a translation map from node indices in nodes_ vector to
+ // their target indices in EDGE_ARRAY.
+ auto *node_ref_map = new NODE_REF[nodes_.size() + 1];
+ int i, j;
+ node_ref_map[0] = 0;
+ for (i = 0; i < nodes_.size(); ++i) {
+ node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
+ }
+ int num_forward_edges = node_ref_map[i];
+
+ // Convert nodes_ vector into EDGE_ARRAY translating the next node references
+ // in edges using node_ref_map. Empty nodes and backward edges are dropped.
+ auto edge_array = new EDGE_RECORD[num_forward_edges]; + EDGE_ARRAY edge_array_ptr = edge_array; + for (i = 0; i < nodes_.size(); ++i) { + TRIE_NODE_RECORD *node_ptr = nodes_[i]; + int end = node_ptr->forward_edges.size(); + for (j = 0; j < end; ++j) { + EDGE_RECORD &edge_rec = node_ptr->forward_edges[j]; + NODE_REF node_ref = next_node_from_edge_rec(edge_rec); + ASSERT_HOST(node_ref < nodes_.size()); + UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec); + link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE, + end_of_word_from_edge_rec(edge_rec), unichar_id); + if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr); + ++edge_array_ptr; + } + } + delete[] node_ref_map; + + return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, + perm_, unicharset_size_, debug_level_); +} + +bool Trie::eliminate_redundant_edges(NODE_REF node, + const EDGE_RECORD &edge1, + const EDGE_RECORD &edge2) { + if (debug_level_ > 1) { + tprintf("\nCollapsing node %" PRIi64 ":\n", node); + print_node(node, MAX_NODE_EDGES_DISPLAY); + tprintf("Candidate edges: "); + print_edge_rec(edge1); + tprintf(", "); + print_edge_rec(edge2); + tprintf("\n\n"); + } + NODE_REF next_node1 = next_node_from_edge_rec(edge1); + NODE_REF next_node2 = next_node_from_edge_rec(edge2); + TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2]; + // Translate all edges going to/from next_node2 to go to/from next_node1. + EDGE_RECORD *edge_ptr = nullptr; + EDGE_INDEX edge_index; + int i; + // The backward link in node to next_node2 will be zeroed out by the caller. 
+ // Copy all the backward links in next_node2 to node next_node1 + for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) { + const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i]; + NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge); + UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge); + int curr_word_end = end_of_word_from_edge_rec(bkw_edge); + bool marker_flag = marker_flag_from_edge_rec(bkw_edge); + add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, + curr_word_end, curr_unichar_id); + // Relocate the corresponding forward edge in curr_next_node + ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, + curr_word_end, curr_unichar_id, + &edge_ptr, &edge_index)); + set_next_node_in_edge_rec(edge_ptr, next_node1); + } + int next_node2_num_edges = (next_node2_ptr->forward_edges.size() + + next_node2_ptr->backward_edges.size()); + if (debug_level_ > 1) { + tprintf("removed %d edges from node " REFFORMAT "\n", + next_node2_num_edges, next_node2); + } + next_node2_ptr->forward_edges.clear(); + next_node2_ptr->backward_edges.clear(); + num_edges_ -= next_node2_num_edges; + return true; +} + +bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, + UNICHAR_ID unichar_id, + NODE_REF node, + EDGE_VECTOR* backward_edges, + NODE_MARKER reduced_nodes) { + if (debug_level_ > 1) + tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index); + // Compare each of the edge pairs with the given unichar_id. + bool did_something = false; + for (int i = edge_index; i < backward_edges->size() - 1; ++i) { + // Find the first edge that can be eliminated. 
+ UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
+ while (i < backward_edges->size()) {
+ if (!DeadEdge((*backward_edges)[i])) {
+ curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
+ if (curr_unichar_id != unichar_id) return did_something;
+ if (can_be_eliminated((*backward_edges)[i])) break;
+ }
+ ++i;
+ }
+ if (i == backward_edges->size()) break;
+ const EDGE_RECORD &edge_rec = (*backward_edges)[i];
+ // Compare it to the rest of the edges with the given unichar_id.
+ for (int j = i + 1; j < backward_edges->size(); ++j) {
+ const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
+ if (DeadEdge(next_edge_rec)) continue;
+ UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
+ if (next_id != unichar_id) break;
+ if (end_of_word_from_edge_rec(next_edge_rec) ==
+ end_of_word_from_edge_rec(edge_rec) &&
+ can_be_eliminated(next_edge_rec) &&
+ eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
+ reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
+ did_something = true;
+ KillEdge(&(*backward_edges)[j]);
+ }
+ }
+ }
+ return did_something;
+}
+
+void Trie::sort_edges(EDGE_VECTOR *edges) {
+ int num_edges = edges->size();
+ if (num_edges <= 1) return;
+ GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD> > sort_vec;
+ sort_vec.reserve(num_edges);
+ for (int i = 0; i < num_edges; ++i) {
+ sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
+ unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
+ }
+ sort_vec.sort();
+ for (int i = 0; i < num_edges; ++i)
+ (*edges)[i] = sort_vec[i].data();
+}
+
+void Trie::reduce_node_input(NODE_REF node,
+ NODE_MARKER reduced_nodes) {
+ EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
+ sort_edges(&backward_edges);
+ if (debug_level_ > 1) {
+ tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
+ print_node(node, MAX_NODE_EDGES_DISPLAY);
+ }
+
+ EDGE_INDEX edge_index = 0;
+ while (edge_index < backward_edges.size()) {
+ if (DeadEdge(backward_edges[edge_index])) continue;
+ UNICHAR_ID unichar_id =
+
unichar_id_from_edge_rec(backward_edges[edge_index]); + while (reduce_lettered_edges(edge_index, unichar_id, node, + &backward_edges, reduced_nodes)); + while (++edge_index < backward_edges.size()) { + UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]); + if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break; + } + } + reduced_nodes[node] = true; // mark as reduced + + if (debug_level_ > 1) { + tprintf("Node " REFFORMAT " after reduction:\n", node); + print_node(node, MAX_NODE_EDGES_DISPLAY); + } + + for (int i = 0; i < backward_edges.size(); ++i) { + if (DeadEdge(backward_edges[i])) continue; + NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]); + if (next_node != 0 && !reduced_nodes[next_node]) { + reduce_node_input(next_node, reduced_nodes); + } + } +} + +void Trie::print_node(NODE_REF node, int max_num_edges) const { + if (node == NO_EDGE) return; // nothing to print + TRIE_NODE_RECORD *node_ptr = nodes_[node]; + int num_fwd = node_ptr->forward_edges.size(); + int num_bkw = node_ptr->backward_edges.size(); + EDGE_VECTOR *vec; + for (int dir = 0; dir < 2; ++dir) { + if (dir == 0) { + vec = &(node_ptr->forward_edges); + tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw); + } else { + vec = &(node_ptr->backward_edges); + tprintf("\t"); + } + int i; + for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && + i < max_num_edges; ++i) { + if (DeadEdge((*vec)[i])) continue; + print_edge_rec((*vec)[i]); + tprintf(" "); + } + if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("..."); + tprintf("\n"); + } +} + +} // namespace tesseract diff --git a/tesseract/src/dict/trie.h b/tesseract/src/dict/trie.h new file mode 100644 index 00000000..f516b3c5 --- /dev/null +++ b/tesseract/src/dict/trie.h @@ -0,0 +1,432 @@ +/****************************************************************************** + * + * File: trie.h + * Description: Functions to build a trie data structure. 
+ * Author: Mark Seaman, SW Productivity + * + * (c) Copyright 1987, Hewlett-Packard Company. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *****************************************************************************/ +#ifndef TRIE_H +#define TRIE_H + +#include "dawg.h" + +#include "genericvector.h" + +namespace tesseract { + +class UNICHARSET; + +// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed +// max int32, we will need to change GenericVector to use int64 for size +// and address indices. This does not seem to be needed immediately, +// since currently the largest number of edges limit used by tesseract +// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32. +// There are also int casts below to satisfy the WIN32 compiler that would +// need to be changed. +// It might be cleanest to change the types of most of the Trie/Dawg related +// typedefs to int and restrict the casts to extracting these values from +// the 64 bit EDGE_RECORD. +using EDGE_INDEX = int64_t ; // index of an edge in a given node +using NODE_MARKER = bool *; +using EDGE_VECTOR = GenericVector ; + +struct TRIE_NODE_RECORD { + EDGE_VECTOR forward_edges; + EDGE_VECTOR backward_edges; +}; +using TRIE_NODES = GenericVector ; + +/** + * Concrete class for Trie data structure that allows to store a list of + * words (extends Dawg base class) as well as dynamically add new words. 
+ * This class stores a vector of pointers to TRIE_NODE_RECORDs, each of + * which has a vector of forward and backward edges. + */ +class TESS_API Trie : public Dawg { + public: + enum RTLReversePolicy { + RRP_DO_NO_REVERSE, + RRP_REVERSE_IF_HAS_RTL, + RRP_FORCE_REVERSE, + }; + + // Minimum number of concrete characters at the beginning of user patterns. + static const int kSaneNumConcreteChars = 0; + // Various unicode whitespace characters are used to denote unichar patterns, + // (character classifier would never produce these whitespace characters as a + // valid classification). + static const char kAlphaPatternUnicode[]; + static const char kDigitPatternUnicode[]; + static const char kAlphanumPatternUnicode[]; + static const char kPuncPatternUnicode[]; + static const char kLowerPatternUnicode[]; + static const char kUpperPatternUnicode[]; + + static const char *get_reverse_policy_name( + RTLReversePolicy reverse_policy); + + // max_num_edges argument allows limiting the amount of memory this + // Trie can consume (if a new word insert would cause the Trie to + // contain more edges than max_num_edges, all the edges are cleared + // so that new inserts can proceed). + Trie(DawgType type, const STRING &lang, PermuterType perm, + int unicharset_size, int debug_level) + : Dawg(type, lang, perm, debug_level) { + init(unicharset_size); + num_edges_ = 0; + deref_node_index_mask_ = ~letter_mask_; + new_dawg_node(); // need to allocate node 0 + initialized_patterns_ = false; + } + ~Trie() override { nodes_.delete_data_pointers(); } + + // Reset the Trie to empty. + void clear(); + + /** Returns the edge that corresponds to the letter out of this node. 
*/ + EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, + bool word_end) const override { + EDGE_RECORD *edge_ptr; + EDGE_INDEX edge_index; + if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, + &edge_ptr, &edge_index)) return NO_EDGE; + return make_edge_ref(node_ref, edge_index); + } + + /** + * Fills the given NodeChildVector with all the unichar ids (and the + * corresponding EDGE_REFs) for which there is an edge out of this node. + */ + void unichar_ids_of(NODE_REF node, NodeChildVector *vec, + bool word_end) const override { + const EDGE_VECTOR &forward_edges = + nodes_[static_cast(node)]->forward_edges; + for (int i = 0; i < forward_edges.size(); ++i) { + if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) { + vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]), + make_edge_ref(node, i))); + } + } + } + + /** + * Returns the next node visited by following the edge + * indicated by the given EDGE_REF. + */ + NODE_REF next_node(EDGE_REF edge_ref) const override { + if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE; + return next_node_from_edge_rec(*deref_edge_ref(edge_ref)); + } + + /** + * Returns true if the edge indicated by the given EDGE_REF + * marks the end of a word. + */ + bool end_of_word(EDGE_REF edge_ref) const override { + if (edge_ref == NO_EDGE || num_edges_ == 0) return false; + return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref)); + } + + /** Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. */ + UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override { + if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID; + return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref)); + } + // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking + // the edge dead. 
+ void KillEdge(EDGE_RECORD* edge_rec) const { + *edge_rec &= ~letter_mask_; + *edge_rec |= (unicharset_size_ << LETTER_START_BIT); + } + bool DeadEdge(const EDGE_RECORD& edge_rec) const { + return unichar_id_from_edge_rec(edge_rec) == unicharset_size_; + } + + // Prints the contents of the node indicated by the given NODE_REF. + // At most max_num_edges will be printed. + void print_node(NODE_REF node, int max_num_edges) const override; + + // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg. + // Eliminates redundant edges and returns the pointer to the SquishedDawg. + // Note: the caller is responsible for deallocating memory associated + // with the returned SquishedDawg pointer. + SquishedDawg *trie_to_dawg(); + + // Reads a list of words from the given file and adds into the Trie. + // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse + // policy and information in the unicharset. + // Returns false on error. + bool read_and_add_word_list(const char *filename, + const UNICHARSET &unicharset, + Trie::RTLReversePolicy reverse); + + // Reads a list of words from the given file. + // Returns false on error. + bool read_word_list(const char *filename, + std::vector* words); + // Adds a list of words previously read using read_word_list to the trie + // using the given unicharset and reverse_policy to convert to unichar-ids. + // Returns false on error. + bool add_word_list(const std::vector &words, + const UNICHARSET &unicharset, + Trie::RTLReversePolicy reverse_policy); + + // Inserts the list of patterns from the given file into the Trie. + // The pattern list file should contain one pattern per line in UTF-8 format. + // + // Each pattern can contain any non-whitespace characters, however only the + // patterns that contain characters from the unicharset of the corresponding + // language will be useful. + // The only meta character is '\'. 
To be used in a pattern as an ordinary + // string it should be escaped with '\' (e.g. string "C:\Documents" should + // be written in the patterns file as "C:\\Documents"). + // This function supports a very limited regular expression syntax. One can + // express a character, a certain character class and a number of times the + // entity should be repeated in the pattern. + // + // To denote a character class use one of: + // \c - unichar for which UNICHARSET::get_isalpha() is true (character) + // \d - unichar for which UNICHARSET::get_isdigit() is true + // \n - unichar for which UNICHARSET::get_isdigit() or + // UNICHARSET::isalpha() are true + // \p - unichar for which UNICHARSET::get_ispunct() is true + // \a - unichar for which UNICHARSET::get_islower() is true + // \A - unichar for which UNICHARSET::get_isupper() is true + // + // \* could be specified after each character or pattern to indicate that + // the character/pattern can be repeated any number of times before the next + // character/pattern occurs. + // + // Examples: + // 1-8\d\d-GOOG-411 will be expanded to strings: + // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411. + // + // http://www.\n\*.com will be expanded to strings like: + // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com + // + // Note: In choosing which patterns to include please be aware of the fact + // providing very generic patterns will make tesseract run slower. + // For example \n\* at the beginning of the pattern will make Tesseract + // consider all the combinations of proposed character choices for each + // of the segmentations, which will be unacceptably slow. + // Because of potential problems with speed that could be difficult to + // identify, each user pattern has to have at least kSaneNumConcreteChars + // concrete characters from the unicharset at the beginning. 
+ bool read_pattern_list(const char *filename, const UNICHARSET &unicharset); + + // Initializes the values of *_pattern_ unichar ids. + // This function should be called before calling read_pattern_list(). + void initialize_patterns(UNICHARSET *unicharset); + + // Fills in the given unichar id vector with the unichar ids that represent + // the patterns of the character classes of the given unichar_id. + void unichar_id_to_patterns(UNICHAR_ID unichar_id, + const UNICHARSET &unicharset, + GenericVector *vec) const override; + + // Returns the given EDGE_REF if the EDGE_RECORD that it points to has + // a self loop and the given unichar_id matches the unichar_id stored in the + // EDGE_RECORD, returns NO_EDGE otherwise. + EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, + UNICHAR_ID unichar_id, + bool word_end) const override { + if (edge_ref == NO_EDGE) return NO_EDGE; + EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref); + return (marker_flag_from_edge_rec(*edge_rec) && + unichar_id == unichar_id_from_edge_rec(*edge_rec) && + word_end == end_of_word_from_edge_rec(*edge_rec)) ? + edge_ref : NO_EDGE; + } + + // Adds a word to the Trie (creates the necessary nodes and edges). + // + // If repetitions vector is not nullptr, each entry in the vector indicates + // whether the unichar id with the corresponding index in the word is allowed + // to repeat an unlimited number of times. For each entry that is true, MARKER + // flag of the corresponding edge created for this unichar id is set to true). + // + // Return true if add succeeded, false otherwise (e.g. when a word contained + // an invalid unichar id or the trie was getting too large and was cleared). 
+ bool add_word_to_dawg(const WERD_CHOICE &word, + const GenericVector *repetitions); + bool add_word_to_dawg(const WERD_CHOICE &word) { + return add_word_to_dawg(word, nullptr); + } + + protected: + // The structure of an EDGE_REF for Trie edges is as follows: + // [LETTER_START_BIT, flag_start_bit_): + // edge index in *_edges in a TRIE_NODE_RECORD + // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector) + // + // With this arrangement there are enough bits to represent edge indices + // (each node can have at most unicharset_size_ forward edges and + // the position of flag_start_bit is set to be log2(unicharset_size_)). + // It is also possible to accommodate a maximum number of nodes that is at + // least as large as that of the SquishedDawg representation (in SquishedDawg + // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent + // the next node index). + // + + // Returns the pointer to EDGE_RECORD after decoding the location + // of the edge from the information in the given EDGE_REF. + // This function assumes that EDGE_REF holds valid node/edge indices. + inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const { + int edge_index = static_cast( + (edge_ref & letter_mask_) >> LETTER_START_BIT); + int node_index = static_cast( + (edge_ref & deref_node_index_mask_) >> flag_start_bit_); + TRIE_NODE_RECORD *node_rec = nodes_[node_index]; + return &(node_rec->forward_edges[edge_index]); + } + /** Constructs EDGE_REF from the given node_index and edge_index. */ + inline EDGE_REF make_edge_ref(NODE_REF node_index, + EDGE_INDEX edge_index) const { + return ((node_index << flag_start_bit_) | + (edge_index << LETTER_START_BIT)); + } + /** Sets up this edge record to the requested values. 
*/ + inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, + int direction, bool word_end, UNICHAR_ID unichar_id) { + EDGE_RECORD flags = 0; + if (repeats) flags |= MARKER_FLAG; + if (word_end) flags |= WERD_END_FLAG; + if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG; + *edge = ((nxt << next_node_start_bit_) | + (static_cast(flags) << flag_start_bit_) | + (static_cast(unichar_id) << LETTER_START_BIT)); + } + /** Prints the given EDGE_RECORD. */ + inline void print_edge_rec(const EDGE_RECORD &edge_rec) const { + tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec), + marker_flag_from_edge_rec(edge_rec) ? "R," : "", + (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B", + end_of_word_from_edge_rec(edge_rec) ? ",E" : "", + unichar_id_from_edge_rec(edge_rec)); + } + // Returns true if the next node in recorded the given EDGE_RECORD + // has exactly one forward edge. + inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) { + NODE_REF node_ref = next_node_from_edge_rec(edge_rec); + return (node_ref != NO_EDGE && + nodes_[static_cast(node_ref)]->forward_edges.size() == 1); + } + + // Prints the contents of the Trie. + // At most max_num_edges will be printed for each node. + void print_all(const char* msg, int max_num_edges) { + tprintf("\n__________________________\n%s\n", msg); + for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges); + tprintf("__________________________\n"); + } + + // Finds the edge with the given direction, word_end and unichar_id + // in the node indicated by node_ref. Fills in the pointer to the + // EDGE_RECORD and the index of the edge with the the values + // corresponding to the edge found. Returns true if an edge was found. 
+ bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, + int direction, bool word_end, UNICHAR_ID unichar_id, + EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const; + + // Adds an single edge linkage between node1 and node2 in the direction + // indicated by direction argument. + bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, + int direction, bool word_end, + UNICHAR_ID unichar_id); + + // Adds forward edge linkage from node1 to node2 and the corresponding + // backward edge linkage in the other direction. + bool add_new_edge(NODE_REF node1, NODE_REF node2, + bool repeats, bool word_end, UNICHAR_ID unichar_id) { + return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, + word_end, unichar_id) && + add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, + word_end, unichar_id)); + } + + // Sets the word ending flags in an already existing edge pair. + // Returns true on success. + void add_word_ending(EDGE_RECORD *edge, + NODE_REF the_next_node, + bool repeats, + UNICHAR_ID unichar_id); + + // Allocates space for a new node in the Trie. + NODE_REF new_dawg_node(); + + // Removes a single edge linkage to between node1 and node2 in the + // direction indicated by direction argument. + void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, + bool word_end, UNICHAR_ID unichar_id); + + // Removes forward edge linkage from node1 to node2 and the corresponding + // backward edge linkage in the other direction. + void remove_edge(NODE_REF node1, NODE_REF node2, + bool word_end, UNICHAR_ID unichar_id) { + remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id); + remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id); + } + + // Compares edge1 and edge2 in the given node to see if they point to two + // next nodes that could be collapsed. If they do, performs the reduction + // and returns true. 
+ bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, + const EDGE_RECORD &edge2); + + // Assuming that edge_index indicates the first edge in a group of edges + // in this node with a particular letter value, looks through these edges + // to see if any of them can be collapsed. If so does it. Returns to the + // caller when all edges with this letter have been reduced. + // Returns true if further reduction is possible with this same letter. + bool reduce_lettered_edges(EDGE_INDEX edge_index, + UNICHAR_ID unichar_id, + NODE_REF node, + EDGE_VECTOR* backward_edges, + NODE_MARKER reduced_nodes); + + /** + * Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in + * increasing order of unichar ids. This function is normally called + * for all edges in a single node, and since number of edges in each node + * is usually quite small, selection sort is used. + */ + void sort_edges(EDGE_VECTOR *edges); + + /** Eliminates any redundant edges from this node in the Trie. */ + void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes); + + // Returns the pattern unichar id for the given character class code. + UNICHAR_ID character_class_to_pattern(char ch); + + // Member variables + TRIE_NODES nodes_; // vector of nodes in the Trie + // Freelist of edges in the root backwards node that were previously zeroed. + GenericVector root_back_freelist_; + uint64_t num_edges_; // sum of all edges (forward and backward) + uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction + uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index + // Variables for translating character class codes denoted in user patterns + // file to the unichar ids used to represent them in a Trie. 
+ UNICHAR_ID alpha_pattern_; + UNICHAR_ID digit_pattern_; + UNICHAR_ID alphanum_pattern_; + UNICHAR_ID punc_pattern_; + UNICHAR_ID lower_pattern_; + UNICHAR_ID upper_pattern_; + bool initialized_patterns_; +}; + +} // namespace tesseract + +#endif -- cgit v1.2.3-65-gdbad