summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'tesseract/src/ccmain')
-rw-r--r--tesseract/src/ccmain/adaptions.cpp114
-rw-r--r--tesseract/src/ccmain/applybox.cpp807
-rw-r--r--tesseract/src/ccmain/control.cpp2110
-rw-r--r--tesseract/src/ccmain/control.h38
-rw-r--r--tesseract/src/ccmain/docqual.cpp981
-rw-r--r--tesseract/src/ccmain/docqual.h43
-rw-r--r--tesseract/src/ccmain/equationdetect.cpp1516
-rw-r--r--tesseract/src/ccmain/equationdetect.h273
-rw-r--r--tesseract/src/ccmain/fixspace.cpp885
-rw-r--r--tesseract/src/ccmain/fixspace.h36
-rw-r--r--tesseract/src/ccmain/fixxht.cpp216
-rw-r--r--tesseract/src/ccmain/linerec.cpp307
-rw-r--r--tesseract/src/ccmain/ltrresultiterator.cpp492
-rw-r--r--tesseract/src/ccmain/mutableiterator.cpp24
-rw-r--r--tesseract/src/ccmain/mutableiterator.h63
-rw-r--r--tesseract/src/ccmain/osdetect.cpp579
-rw-r--r--tesseract/src/ccmain/output.cpp418
-rw-r--r--tesseract/src/ccmain/output.h37
-rw-r--r--tesseract/src/ccmain/pageiterator.cpp635
-rw-r--r--tesseract/src/ccmain/pagesegmain.cpp420
-rw-r--r--tesseract/src/ccmain/pagewalk.cpp43
-rw-r--r--tesseract/src/ccmain/par_control.cpp73
-rw-r--r--tesseract/src/ccmain/paragraphs.cpp2590
-rw-r--r--tesseract/src/ccmain/paragraphs.h110
-rw-r--r--tesseract/src/ccmain/paragraphs_internal.h314
-rw-r--r--tesseract/src/ccmain/paramsd.cpp365
-rw-r--r--tesseract/src/ccmain/paramsd.h134
-rw-r--r--tesseract/src/ccmain/pgedit.cpp981
-rw-r--r--tesseract/src/ccmain/pgedit.h71
-rw-r--r--tesseract/src/ccmain/recogtraining.cpp238
-rw-r--r--tesseract/src/ccmain/reject.cpp792
-rw-r--r--tesseract/src/ccmain/reject.h39
-rw-r--r--tesseract/src/ccmain/resultiterator.cpp752
-rw-r--r--tesseract/src/ccmain/superscript.cpp610
-rw-r--r--tesseract/src/ccmain/tessbox.cpp75
-rw-r--r--tesseract/src/ccmain/tessedit.cpp474
-rw-r--r--tesseract/src/ccmain/tesseractclass.cpp707
-rw-r--r--tesseract/src/ccmain/tesseractclass.h1163
-rw-r--r--tesseract/src/ccmain/tessvars.cpp24
-rw-r--r--tesseract/src/ccmain/tessvars.h27
-rw-r--r--tesseract/src/ccmain/tfacepp.cpp322
-rw-r--r--tesseract/src/ccmain/thresholder.cpp334
-rw-r--r--tesseract/src/ccmain/werdit.cpp68
-rw-r--r--tesseract/src/ccmain/werdit.h34
44 files changed, 20334 insertions, 0 deletions
diff --git a/tesseract/src/ccmain/adaptions.cpp b/tesseract/src/ccmain/adaptions.cpp
new file mode 100644
index 00000000..e07bf58c
--- /dev/null
+++ b/tesseract/src/ccmain/adaptions.cpp
@@ -0,0 +1,114 @@
+/**********************************************************************
+ * File: adaptions.cpp (Formerly adaptions.c)
+ * Description: Functions used to adapt to blobs already confidently
+ * identified
+ * Author: Chris Newton
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cctype>
+#include <cstring>
+#include "tessvars.h"
+#include "reject.h"
+#include "control.h"
+#include "stopper.h"
+#include "tesseractclass.h"
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+namespace tesseract {
+// Decides whether the classifier should adapt to (learn from) this word.
+// |mode| is a bit set indexed by the MODES enum below: the first two bits
+// select the source of acceptance (chopper would adapt / tess accepted),
+// the remaining bits enable veto checks that can reject the word.
+// Returns true only if an enabled source bit is set on the word and no
+// enabled veto check fires.
+bool Tesseract::word_adaptable( //should we adapt?
+ WERD_RES* word,
+ uint16_t mode) {
+ if (tessedit_adaption_debug) {
+ tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
+ word->best_choice->unichar_string().c_str(),
+ word->best_choice->rating(), word->best_choice->certainty());
+ }
+
+ bool status = false;
+ // View the mode word as individual flag bits, indexed by MODES.
+ BITS16 flags(mode);
+
+ enum MODES
+ {
+ ADAPTABLE_WERD,
+ ACCEPTABLE_WERD,
+ CHECK_DAWGS,
+ CHECK_SPACES,
+ CHECK_ONE_ELL_CONFLICT,
+ CHECK_AMBIG_WERD
+ };
+
+ /*
+ 0: NO adaption
+ */
+ if (mode == 0) {
+ if (tessedit_adaption_debug) tprintf("adaption disabled\n");
+ return false;
+ }
+
+ if (flags[ADAPTABLE_WERD]) {
+ status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
+ if (tessedit_adaption_debug && !status) {
+ tprintf("tess_would_adapt bit is false\n");
+ }
+ }
+
+ if (flags[ACCEPTABLE_WERD]) {
+ status |= word->tess_accepted;
+ if (tessedit_adaption_debug && !status) {
+ tprintf("tess_accepted bit is false\n");
+ }
+ }
+
+ if (!status) { // If not set then
+ return false; // ignore other checks
+ }
+
+ // Veto: only adapt to words validated by a dictionary or number permuter.
+ if (flags[CHECK_DAWGS] &&
+ (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
+ (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
+ (word->best_choice->permuter () != USER_DAWG_PERM) &&
+ (word->best_choice->permuter () != NUMBER_PERM)) {
+ if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
+ return false;
+ }
+
+ // Veto: words with a one/ell/I confusion are unreliable training samples.
+ if (flags[CHECK_ONE_ELL_CONFLICT] && one_ell_conflict (word, false)) {
+ if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
+ return false;
+ }
+
+ // Veto: embedded spaces mean the segmentation is suspect.
+ if (flags[CHECK_SPACES] &&
+ (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
+ if (tessedit_adaption_debug) tprintf("word contains spaces\n");
+ return false;
+ }
+
+ // Veto: dangerous ambiguities could teach the classifier the wrong shape.
+ if (flags[CHECK_AMBIG_WERD] &&
+ word->best_choice->dangerous_ambig_found()) {
+ if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
+ return false;
+ }
+
+ if (tessedit_adaption_debug) {
+ tprintf("returning status %d\n", status);
+ }
+ return status;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/applybox.cpp b/tesseract/src/ccmain/applybox.cpp
new file mode 100644
index 00000000..a8d1bbcd
--- /dev/null
+++ b/tesseract/src/ccmain/applybox.cpp
@@ -0,0 +1,807 @@
+/**********************************************************************
+ * File: applybox.cpp (Formerly applybox.c)
+ * Description: Re segment rows according to box file data
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef DISABLED_LEGACY_ENGINE
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+#include "allheaders.h"
+#include "boxread.h"
+#endif // ndef DISABLED_LEGACY_ENGINE
+#include "pageres.h"
+#include <tesseract/unichar.h>
+#include "unicharset.h"
+#include "tesseractclass.h"
+#include "genericvector.h"
+
+#ifndef DISABLED_LEGACY_ENGINE
+/** Max number of blobs to classify together in FindSegmentation. */
+const int kMaxGroupSize = 4;
+/// Max fraction of median allowed as deviation in xheight before switching
+/// to median.
+const double kMaxXHeightDeviationFraction = 0.125;
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/**
+ * The box file is assumed to contain box definitions, one per line, of the
+ * following format for blob-level boxes:
+ * @verbatim
+ * <UTF8 str> <left> <bottom> <right> <top> <page id>
+ * @endverbatim
+ * and for word/line-level boxes:
+ * @verbatim
+ * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
+ * @endverbatim
+ * NOTES:
+ * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
+ *
+ * <page id> is 0-based, and the page number is used for multipage input (tiff).
+ *
+ * In the blob-level form, each line represents a recognizable unit, which may
+ * be several UTF-8 bytes, but there is a bounding box around each recognizable
+ * unit, and no classifier is needed to train in this mode (bootstrapping.)
+ *
+ * In the word/line-level form, the line begins with the literal "WordStr", and
+ * the bounding box bounds either a whole line or a whole word. The recognizable
+ * units in the word/line are listed after the # at the end of the line and
+ * are space delimited, ignoring any original spaces on the line.
+ * Eg.
+ * @verbatim
+ * word -> #w o r d
+ * multi word line -> #m u l t i w o r d l i n e
+ * @endverbatim
+ * The recognizable units must be space-delimited in order to allow multiple
+ * unicodes to be used for a single recognizable unit, eg Hindi.
+ *
+ * In this mode, the classifier must have been pre-trained with the desired
+ * character set, or it will not be able to find the character segmentations.
+ */
+
+namespace tesseract {
+
+#ifndef DISABLED_LEGACY_ENGINE
+// Clears the text label of every word in every row of every block, so
+// that box application starts from a clean slate and previously-labelled
+// words are not mistaken for already-processed ones.
+static void clear_any_old_text(BLOCK_LIST *block_list) {
+ BLOCK_IT block_it(block_list);
+ for (block_it.mark_cycle_pt();
+ !block_it.cycled_list(); block_it.forward()) {
+ ROW_IT row_it(block_it.data()->row_list());
+ for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+ WERD_IT word_it(row_it.data()->word_list());
+ for (word_it.mark_cycle_pt();
+ !word_it.cycled_list(); word_it.forward()) {
+ word_it.data()->set_text("");
+ }
+ }
+ }
+}
+
+// Applies the box file based on the image name filename, and resegments
+// the words in the block_list (page), with:
+// blob-mode: one blob per line in the box file, words as input.
+// word/line-mode: one blob per space-delimited unit after the #, and one word
+// per line in the box file. (See comment above for box file format.)
+// If find_segmentation is true, (word/line mode) then the classifier is used
+// to re-segment words/lines to match the space-delimited truth string for
+// each box. In this case, the input box may be for a word or even a whole
+// text line, and the output words will contain multiple blobs corresponding
+// to the space-delimited input string.
+// With find_segmentation false, no classifier is needed, but the chopper
+// can still be used to correctly segment touching characters with the help
+// of the input boxes.
+// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
+// from normal classification, ie. with a word, chopped_word, rebuild_word,
+// seam_array, denorm, box_word, and best_state, but NO best_choice or
+// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
+// Instead, the correct_text member of WERD_RES is set, and this may be later
+// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
+// is not required before calling ApplyBoxTraining.
+PAGE_RES* Tesseract::ApplyBoxes(const char* filename,
+ bool find_segmentation,
+ BLOCK_LIST *block_list) {
+ std::vector<TBOX> boxes;
+ std::vector<STRING> texts, full_texts;
+ // Read every box for the requested page; failure means no usable boxfile.
+ if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts,
+ nullptr)) {
+ return nullptr; // Can't do it.
+ }
+
+ const int box_count = boxes.size();
+ int box_failures = 0;
+
+ // In word mode, we use the boxes to make a word for each box, but
+ // in blob mode we use the existing words and maximally chop them first.
+ PAGE_RES* page_res = find_segmentation ?
+ nullptr : SetupApplyBoxes(boxes, block_list);
+ clear_any_old_text(block_list);
+
+ // Walk the boxes in file order; neighbours are passed so disputed blobs
+ // can be assigned to the best-overlapping box.
+ for (int i = 0; i < box_count; i++) {
+ bool foundit = false;
+ if (page_res != nullptr) {
+ // Blob mode: match each box to chopped blobs within existing words.
+ foundit = ResegmentCharBox(page_res,
+ (i == 0) ? nullptr : &boxes[i - 1],
+ boxes[i],
+ (i == box_count - 1) ? nullptr : &boxes[i + 1],
+ full_texts[i].c_str());
+ } else {
+ // Word/line mode: gather overlapping blobs into a new word per box.
+ foundit = ResegmentWordBox(block_list, boxes[i],
+ (i == box_count - 1) ? nullptr : &boxes[i + 1],
+ texts[i].c_str());
+ }
+ if (!foundit) {
+ box_failures++;
+ ReportFailedBox(i, boxes[i], texts[i].c_str(),
+ "FAILURE! Couldn't find a matching blob");
+ }
+ }
+
+ if (page_res == nullptr) {
+ // In word/line mode, we now maximally chop all the words and resegment
+ // them with the classifier.
+ page_res = SetupApplyBoxes(boxes, block_list);
+ ReSegmentByClassification(page_res);
+ }
+ if (applybox_debug > 0) {
+ tprintf("APPLY_BOXES:\n");
+ tprintf(" Boxes read from boxfile: %6d\n", box_count);
+ if (box_failures > 0)
+ tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
+ }
+ TidyUp(page_res);
+ return page_res;
+}
+
+// Helper computes median xheight in the image.
+// Returns the median of the per-row x-heights across all blocks.
+// NOTE(review): block_it.data() is read before the loop to size the STATS
+// histogram from the first block's bounding box — assumes block_list is
+// non-empty; confirm callers guarantee that.
+static double MedianXHeight(BLOCK_LIST *block_list) {
+ BLOCK_IT block_it(block_list);
+ STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
+ for (block_it.mark_cycle_pt();
+ !block_it.cycled_list(); block_it.forward()) {
+ ROW_IT row_it(block_it.data()->row_list());
+ for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+ xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
+ }
+ }
+ return xheights.median();
+}
+
+/// Any row xheight that is significantly different from the median is set
+/// to the median.
+void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
+ const double median_xheight = MedianXHeight(block_list);
+ // Rows deviating from the median by more than this are considered bogus.
+ const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
+ // Clamp each outlier row's xheight to the page median.
+ BLOCK_IT b_it(block_list);
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ BLOCK* block = b_it.data();
+ ROW_IT r_it(block->row_list());
+ for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
+ ROW* row = r_it.data();
+ const double diff = fabs(row->x_height() - median_xheight);
+ if (diff > max_deviation) {
+ if (applybox_debug) {
+ tprintf("row xheight=%g, but median xheight = %g\n",
+ row->x_height(), median_xheight);
+ }
+ row->set_x_height(static_cast<float>(median_xheight));
+ }
+ }
+ }
+}
+
+/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
+/// All fuzzy spaces are removed, and all the words are maximally chopped.
+PAGE_RES* Tesseract::SetupApplyBoxes(const std::vector<TBOX>& boxes,
+ BLOCK_LIST *block_list) {
+ // Normalize outlier row xheights first so chopping sees sane metrics.
+ PreenXHeights(block_list);
+ // Strip all fuzzy space markers to simplify the PAGE_RES.
+ BLOCK_IT b_it(block_list);
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ BLOCK* block = b_it.data();
+ ROW_IT r_it(block->row_list());
+ for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
+ ROW* row = r_it.data();
+ WERD_IT w_it(row->word_list());
+ for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+ WERD* word = w_it.data();
+ if (word->cblob_list()->empty()) {
+ // Empty words contribute nothing; drop them.
+ delete w_it.extract();
+ } else {
+ word->set_flag(W_FUZZY_SP, false);
+ word->set_flag(W_FUZZY_NON, false);
+ }
+ }
+ }
+ }
+ // Build the PAGE_RES and maximally chop every word in it.
+ auto* page_res = new PAGE_RES(false, block_list, nullptr);
+ PAGE_RES_IT pr_it(page_res);
+ WERD_RES* word_res;
+ while ((word_res = pr_it.word()) != nullptr) {
+ MaximallyChopWord(boxes, pr_it.block()->block,
+ pr_it.row()->row, word_res);
+ pr_it.forward();
+ }
+ return page_res;
+}
+
+/// Tests the chopper by exhaustively running chop_one_blob.
+/// The word_res will contain filled chopped_word, seam_array, denorm,
+/// box_word and best_state for the maximally chopped word.
+void Tesseract::MaximallyChopWord(const std::vector<TBOX>& boxes,
+ BLOCK* block, ROW* row,
+ WERD_RES* word_res) {
+ // If recognition setup fails, still populate rebuild data and bail out.
+ if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
+ tessedit_ocr_engine_mode, nullptr,
+ classify_bln_numeric_mode,
+ textord_use_cjk_fp_model,
+ poly_allow_detailed_fx,
+ row, block)) {
+ word_res->CloneChoppedToRebuild();
+ return;
+ }
+ if (chop_debug) {
+ tprintf("Maximally chopping word at:");
+ word_res->word->bounding_box().print();
+ }
+ GenericVector<BLOB_CHOICE*> blob_choices;
+ ASSERT_HOST(!word_res->chopped_word->blobs.empty());
+ // Seed one fake choice per existing blob with strictly distinct ratings.
+ auto rating = static_cast<float>(INT8_MAX);
+ for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
+ // The rating and certainty are not quite arbitrary. Since
+ // select_blob_to_chop uses the worst certainty to choose, they all have
+ // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
+ // in here, and then divide by e each time they are chopped, which
+ // should guarantee a set of unequal values for the whole tree of blobs
+ // produced, however much chopping is required. The chops are thus only
+ // limited by the ability of the chopper to find suitable chop points,
+ // and not by the value of the certainties.
+ auto* choice =
+ new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
+ blob_choices.push_back(choice);
+ rating -= 0.125f;
+ }
+ const double e = exp(1.0); // The base of natural logs.
+ int blob_number;
+ int right_chop_index = 0;
+ if (!assume_fixed_pitch_char_segment) {
+ // We only chop if the language is not fixed pitch like CJK.
+ SEAM* seam = nullptr;
+ // Keep chopping until the chopper can find no further chop point.
+ while ((seam = chop_one_blob(boxes, blob_choices, word_res,
+ &blob_number)) != nullptr) {
+ word_res->InsertSeam(blob_number, seam);
+ BLOB_CHOICE* left_choice = blob_choices[blob_number];
+ // Each chop divides the parent rating by e to keep ratings unequal.
+ rating = left_choice->rating() / e;
+ left_choice->set_rating(rating);
+ left_choice->set_certainty(-rating);
+ // combine confidence w/ serial #
+ auto* right_choice = new BLOB_CHOICE(++right_chop_index,
+ rating - 0.125f, -rating, -1,
+ 0.0f, 0.0f, 0.0f, BCC_FAKE);
+ blob_choices.insert(right_choice, blob_number + 1);
+ }
+ }
+ word_res->CloneChoppedToRebuild();
+ word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
+}
+
+/// Helper to compute the dispute resolution metric.
+/// Disputed blob resolution. The aim is to give the blob to the most
+/// appropriate boxfile box. Most of the time it is obvious, but if
+/// two boxfile boxes overlap significantly it is not. If a small boxfile
+/// box takes most of the blob, and a large boxfile box does too, then
+/// we want the small boxfile box to get it, but if the small box
+/// is much smaller than the blob, we don't want it to get it.
+/// Details of the disputed blob resolution:
+/// Given a box with area A, and a blob with area B, with overlap area C,
+/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
+/// miss metric gets the blob.
+// Computes the miss metric (A-C)(B-C)/(AB) for box areas A, B with
+// intersection area C. 0 means perfect containment both ways; values
+// grow toward 1 as the overlap shrinks relative to either box.
+static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
+ const int overlap_area = box1.intersection(box2).area();
+ const int a = box1.area();
+ const int b = box2.area();
+ ASSERT_HOST(a != 0 && b != 0);
+ return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
+}
+
+/// Gather consecutive blobs that match the given box into the best_state
+/// and corresponding correct_text.
+///
+/// Fights over which box owns which blobs are settled by pre-chopping and
+/// applying the blobs to box or next_box with the least non-overlap.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an appropriate blob for a box.
+///
+/// This means that occasionally, blobs may be incorrectly segmented if the
+/// chopper fails to find a suitable chop point.
+bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
+ const TBOX& box, const TBOX* next_box,
+ const char* correct_text) {
+ if (applybox_debug > 1) {
+ tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
+ }
+ PAGE_RES_IT page_res_it(page_res);
+ WERD_RES* word_res;
+ for (word_res = page_res_it.word(); word_res != nullptr;
+ word_res = page_res_it.forward()) {
+ // Only consider words whose bounding box substantially overlaps the box.
+ if (!word_res->box_word->bounding_box().major_overlap(box))
+ continue;
+ if (applybox_debug > 1) {
+ tprintf("Checking word box:");
+ word_res->box_word->bounding_box().print();
+ }
+ int word_len = word_res->box_word->length();
+ for (int i = 0; i < word_len; ++i) {
+ TBOX char_box = TBOX();
+ // Accumulate a run of consecutive unclaimed blobs that match the box.
+ int blob_count = 0;
+ for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
+ TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
+ if (!blob_box.major_overlap(box))
+ break;
+ if (word_res->correct_text[i + blob_count].length() > 0)
+ break; // Blob is claimed already.
+ if (next_box != nullptr) {
+ const double current_box_miss_metric = BoxMissMetric(blob_box, box);
+ const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
+ if (applybox_debug > 2) {
+ tprintf("Checking blob:");
+ blob_box.print();
+ tprintf("Current miss metric = %g, next = %g\n",
+ current_box_miss_metric, next_box_miss_metric);
+ }
+ if (current_box_miss_metric > next_box_miss_metric)
+ break; // Blob is a better match for next box.
+ }
+ char_box += blob_box;
+ }
+ if (blob_count > 0) {
+ if (applybox_debug > 1) {
+ tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
+ }
+ // Reject if the gathered blobs differ from the box AND the box
+ // overlaps a neighbour by more than 3 pixels: likely a bad box.
+ if (!char_box.almost_equal(box, 3) &&
+ ((next_box != nullptr && box.x_gap(*next_box) < -3)||
+ (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
+ return false;
+ }
+ // We refine just the box_word, best_state and correct_text here.
+ // The rebuild_word is made in TidyUp.
+ // blob_count blobs are put together to match the box. Merge the
+ // box_word boxes, save the blob_count in the state and the text.
+ word_res->box_word->MergeBoxes(i, i + blob_count);
+ word_res->best_state[i] = blob_count;
+ word_res->correct_text[i] = correct_text;
+ if (applybox_debug > 2) {
+ tprintf("%d Blobs match: blob box:", blob_count);
+ word_res->box_word->BlobBox(i).print();
+ tprintf("Matches box:");
+ box.print();
+ if (next_box != nullptr) {
+ tprintf("With next box:");
+ next_box->print();
+ }
+ }
+ // Eliminate best_state and correct_text entries for the consumed
+ // blobs.
+ for (int j = 1; j < blob_count; ++j) {
+ word_res->best_state.remove(i + 1);
+ word_res->correct_text.remove(i + 1);
+ }
+ // Assume that no box spans multiple source words, so we are done with
+ // this box.
+ if (applybox_debug > 1) {
+ tprintf("Best state = ");
+ for (int j = 0; j < word_res->best_state.size(); ++j) {
+ tprintf("%d ", word_res->best_state[j]);
+ }
+ tprintf("\n");
+ tprintf("Correct text = [[ ");
+ for (int j = 0; j < word_res->correct_text.size(); ++j) {
+ tprintf("%s ", word_res->correct_text[j].c_str());
+ }
+ tprintf("]]\n");
+ }
+ return true;
+ }
+ }
+ }
+ if (applybox_debug > 0) {
+ tprintf("FAIL!\n");
+ }
+ return false; // Failure.
+}
+
+/// Consume all source blobs that strongly overlap the given box,
+/// putting them into a new word, with the correct_text label.
+/// Fights over which box owns which blobs are settled by
+/// applying the blobs to box or next_box with the least non-overlap.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an overlapping blob for a box.
+bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
+ const TBOX& box, const TBOX* next_box,
+ const char* correct_text) {
+ if (applybox_debug > 1) {
+ tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
+ }
+ // The new word is created lazily on the first blob claimed by this box.
+ WERD* new_word = nullptr;
+ BLOCK_IT b_it(block_list);
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ BLOCK* block = b_it.data();
+ // Prune by overlap at each level: block, row, word, then blob.
+ if (!box.major_overlap(block->pdblk.bounding_box()))
+ continue;
+ ROW_IT r_it(block->row_list());
+ for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
+ ROW* row = r_it.data();
+ if (!box.major_overlap(row->bounding_box()))
+ continue;
+ WERD_IT w_it(row->word_list());
+ for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+ WERD* word = w_it.data();
+ if (applybox_debug > 2) {
+ tprintf("Checking word:");
+ word->bounding_box().print();
+ }
+ if (word->text() != nullptr && word->text()[0] != '\0')
+ continue; // Ignore words that are already done.
+ if (!box.major_overlap(word->bounding_box()))
+ continue;
+ C_BLOB_IT blob_it(word->cblob_list());
+ for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
+ blob_it.forward()) {
+ C_BLOB* blob = blob_it.data();
+ TBOX blob_box = blob->bounding_box();
+ if (!blob_box.major_overlap(box))
+ continue;
+ if (next_box != nullptr) {
+ // Settle the dispute: give the blob to whichever box it
+ // overlaps best, per BoxMissMetric.
+ const double current_box_miss_metric = BoxMissMetric(blob_box, box);
+ const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
+ if (applybox_debug > 2) {
+ tprintf("Checking blob:");
+ blob_box.print();
+ tprintf("Current miss metric = %g, next = %g\n",
+ current_box_miss_metric, next_box_miss_metric);
+ }
+ if (current_box_miss_metric > next_box_miss_metric)
+ continue; // Blob is a better match for next box.
+ }
+ if (applybox_debug > 2) {
+ tprintf("Blob match: blob:");
+ blob_box.print();
+ tprintf("Matches box:");
+ box.print();
+ if (next_box != nullptr) {
+ tprintf("With next box:");
+ next_box->print();
+ }
+ }
+ if (new_word == nullptr) {
+ // Make a new word with a single blob.
+ new_word = word->shallow_copy();
+ new_word->set_text(correct_text);
+ w_it.add_to_end(new_word);
+ }
+ // Move (not copy) the blob from the source word to the new word.
+ C_BLOB_IT new_blob_it(new_word->cblob_list());
+ new_blob_it.add_to_end(blob_it.extract());
+ }
+ }
+ }
+ }
+ if (new_word == nullptr && applybox_debug > 0) tprintf("FAIL!\n");
+ return new_word != nullptr;
+}
+
+/// Resegments the words by running the classifier in an attempt to find the
+/// correct segmentation that produces the required string.
+void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
+ PAGE_RES_IT pr_it(page_res);
+ WERD_RES* word_res;
+ for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
+ const WERD* word = word_res->word;
+ if (word->text() == nullptr || word->text()[0] == '\0')
+ continue; // Ignore words that have no text.
+ // Convert the correct text to a vector of UNICHAR_ID
+ GenericVector<UNICHAR_ID> target_text;
+ // Words whose truth text can't be mapped or segmented are deleted from
+ // the page rather than left half-labelled.
+ if (!ConvertStringToUnichars(word->text(), &target_text)) {
+ tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
+ word->text());
+ pr_it.DeleteCurrentWord();
+ continue;
+ }
+ if (!FindSegmentation(target_text, word_res)) {
+ tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
+ word->text());
+ pr_it.DeleteCurrentWord();
+ continue;
+ }
+ }
+}
+
+/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
+/// @return false if an invalid UNICHAR_ID is encountered.
+bool Tesseract::ConvertStringToUnichars(const char* utf8,
+ GenericVector<UNICHAR_ID>* class_ids) {
+ // Each space-delimited token is one recognizable unit (possibly several
+ // UTF-8 bytes); look up its class id in the unicharset.
+ for (int step = 0; *utf8 != '\0'; utf8 += step) {
+ const char* next_space = strchr(utf8, ' ');
+ if (next_space == nullptr)
+ next_space = utf8 + strlen(utf8);
+ step = next_space - utf8;
+ UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
+ if (class_id == INVALID_UNICHAR_ID) {
+ return false;
+ }
+ // Skip any run of spaces to the start of the next token.
+ while (utf8[step] == ' ')
+ ++step;
+ class_ids->push_back(class_id);
+ }
+ return true;
+}
+
+/// Resegments the word to achieve the target_text from the classifier.
+/// Returns false if the re-segmentation fails.
+/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
+/// applies a full search on the classifier results to find the best classified
+/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
+/// substitutions ARE used.
+bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
+ WERD_RES* word_res) {
+ // Classify all required combinations of blobs and save results in choices.
+ // choices[i][j-1] holds the classification of blobs [i, i+j) for
+ // j = 1..kMaxGroupSize.
+ const int word_length = word_res->box_word->length();
+ auto* choices =
+ new GenericVector<BLOB_CHOICE_LIST*>[word_length];
+ for (int i = 0; i < word_length; ++i) {
+ for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
+ BLOB_CHOICE_LIST* match_result = classify_piece(
+ word_res->seam_array, i, i + j - 1, "Applybox",
+ word_res->chopped_word, word_res->blamer_bundle);
+ if (applybox_debug > 2) {
+ tprintf("%d+%d:", i, j);
+ print_ratings_list("Segment:", match_result, unicharset);
+ }
+ choices[i].push_back(match_result);
+ }
+ }
+ // Search the segmentation graph for the target text. Must be an exact
+ // match. Using wildcards makes it difficult to find the correct
+ // segmentation even when it is there.
+ word_res->best_state.clear();
+ GenericVector<int> search_segmentation;
+ float best_rating = 0.0f;
+ SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
+ &search_segmentation, &best_rating, &word_res->best_state);
+ for (int i = 0; i < word_length; ++i)
+ choices[i].delete_data_pointers();
+ delete [] choices;
+ if (word_res->best_state.empty()) {
+ // Build the original segmentation and if it is the same length as the
+ // truth, assume it will do.
+ // A seam without splits marks an original blob boundary; seams with
+ // splits were introduced by maximal chopping and group into one unit.
+ int blob_count = 1;
+ for (int s = 0; s < word_res->seam_array.size(); ++s) {
+ SEAM* seam = word_res->seam_array[s];
+ if (!seam->HasAnySplits()) {
+ word_res->best_state.push_back(blob_count);
+ blob_count = 1;
+ } else {
+ ++blob_count;
+ }
+ }
+ word_res->best_state.push_back(blob_count);
+ if (word_res->best_state.size() != target_text.size()) {
+ word_res->best_state.clear(); // No good. Original segmentation bad size.
+ return false;
+ }
+ }
+ // Record the truth text per segmented unit for later training use.
+ word_res->correct_text.clear();
+ for (int i = 0; i < target_text.size(); ++i) {
+ word_res->correct_text.push_back(
+ STRING(unicharset.id_to_unichar(target_text[i])));
+ }
+ return true;
+}
+
+/// Recursive helper to find a match to the target_text (from text_index
+/// position) in the choices (from choices_pos position).
+/// @param choices is an array of GenericVectors, of length choices_length,
+/// with each element representing a starting position in the word, and the
+/// #GenericVector holding classification results for a sequence of consecutive
+/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
+/// @param choices_pos
+/// @param choices_length
+/// @param target_text
+/// @param text_index
+/// @param rating
+/// @param segmentation
+/// @param best_rating
+/// @param best_segmentation
+void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
+ int choices_pos, int choices_length,
+ const GenericVector<UNICHAR_ID>& target_text,
+ int text_index,
+ float rating, GenericVector<int>* segmentation,
+ float* best_rating,
+ GenericVector<int>* best_segmentation) {
+ const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
+ // Try every group length available at this position (1..kMaxGroupSize).
+ for (int length = 1; length <= choices[choices_pos].size(); ++length) {
+ // Rating of matching choice or worst choice if no match.
+ float choice_rating = 0.0f;
+ // Find the corresponding best BLOB_CHOICE.
+ BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
+ for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
+ choice_it.forward()) {
+ const BLOB_CHOICE* choice = choice_it.data();
+ choice_rating = choice->rating();
+ UNICHAR_ID class_id = choice->unichar_id();
+ if (class_id == target_text[text_index]) {
+ break;
+ }
+ // Search ambigs table.
+ if (class_id < table.size() && table[class_id] != nullptr) {
+ AmbigSpec_IT spec_it(table[class_id]);
+ for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
+ spec_it.forward()) {
+ const AmbigSpec *ambig_spec = spec_it.data();
+ // We'll only do 1-1.
+ if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
+ ambig_spec->correct_ngram_id == target_text[text_index])
+ break;
+ }
+ if (!spec_it.cycled_list())
+ break; // Found an ambig.
+ }
+ }
+ if (choice_it.cycled_list())
+ continue; // No match.
+ // Tentatively take this group and either record a complete match or
+ // recurse for the remaining blobs/characters.
+ segmentation->push_back(length);
+ if (choices_pos + length == choices_length &&
+ text_index + 1 == target_text.size()) {
+ // This is a complete match. If the rating is good record a new best.
+ if (applybox_debug > 2) {
+ tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
+ rating + choice_rating, *best_rating, segmentation->size(),
+ best_segmentation->size());
+ }
+ if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
+ *best_segmentation = *segmentation;
+ *best_rating = rating + choice_rating;
+ }
+ } else if (choices_pos + length < choices_length &&
+ text_index + 1 < target_text.size()) {
+ if (applybox_debug > 3) {
+ tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
+ target_text[text_index],
+ unicharset.id_to_unichar(target_text[text_index]),
+ choice_it.data()->unichar_id() == target_text[text_index]
+ ? "Match" : "Ambig",
+ choices_pos, length);
+ }
+ SearchForText(choices, choices_pos + length, choices_length, target_text,
+ text_index + 1, rating + choice_rating, segmentation,
+ best_rating, best_segmentation);
+ if (applybox_debug > 3) {
+ tprintf("End recursion for %d=%s\n", target_text[text_index],
+ unicharset.id_to_unichar(target_text[text_index]));
+ }
+ }
+ // Backtrack: undo the tentative group before trying the next length.
+ segmentation->truncate(segmentation->size() - 1);
+ }
+}
+
+/// - Counts up the labelled words and the blobs within.
+/// - Deletes all unused or emptied words, counting the unused ones.
+/// - Resets W_BOL and W_EOL flags correctly.
+/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
+void Tesseract::TidyUp(PAGE_RES* page_res) {
+  int ok_blob_count = 0;     // Labelled blobs inside kept words.
+  int bad_blob_count = 0;    // Unlabelled blobs left inside kept words.
+  int ok_word_count = 0;     // Words kept: have at least one labelled blob.
+  int unlabelled_words = 0;  // Words deleted: no labelled blobs at all.
+  PAGE_RES_IT pr_it(page_res);
+  WERD_RES* word_res;
+  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
+    int ok_in_word = 0;
+    int blob_count = word_res->correct_text.size();
+    auto* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
+    word_choice->set_permuter(TOP_CHOICE_PERM);
+    for (int c = 0; c < blob_count; ++c) {
+      if (word_res->correct_text[c].length() > 0) {
+        ++ok_in_word;
+      }
+      // Since we only need a fake word_res->best_choice, the actual
+      // unichar_ids do not matter. Which is fortunate, since TidyUp()
+      // can be called while training Tesseract, at the stage where
+      // unicharset is not meaningful yet.
+      word_choice->append_unichar_id_space_allocated(
+          INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
+    }
+    if (ok_in_word > 0) {
+      // Bug fix: ok_word_count was declared and reported below but never
+      // incremented, so the "in %d words" report always printed 0.
+      ++ok_word_count;
+      ok_blob_count += ok_in_word;
+      bad_blob_count += word_res->correct_text.size() - ok_in_word;
+      word_res->LogNewRawChoice(word_choice);
+      word_res->LogNewCookedChoice(1, false, word_choice);
+    } else {
+      ++unlabelled_words;
+      if (applybox_debug > 0) {
+        tprintf("APPLY_BOXES: Unlabelled word at :");
+        word_res->word->bounding_box().print();
+      }
+      pr_it.DeleteCurrentWord();
+      delete word_choice;
+    }
+  }
+  pr_it.restart_page();
+  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
+    // Denormalize back to a BoxWord.
+    word_res->RebuildBestState();
+    word_res->SetupBoxWord();
+    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
+    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
+  }
+  if (applybox_debug > 0) {
+    tprintf(" Found %d good blobs.\n", ok_blob_count);
+    if (bad_blob_count > 0) {
+      tprintf(" Leaving %d unlabelled blobs in %d words.\n",
+              bad_blob_count, ok_word_count);
+    }
+    if (unlabelled_words > 0)
+      tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
+  }
+}
+
+/// Reports a box that could not be applied: its (1-based) line number in the
+/// box file, its character string, its coordinates and the failure reason.
+void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
+                                const char *box_ch, const char *err_msg) {
+  const int line_number = boxfile_lineno + 1;  // Box file lines are 1-based.
+  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
+          line_number, box_ch, box.left(), box.bottom(), box.right(),
+          box.top(), err_msg);
+}
+
+/// Calls #LearnWord on every word of page_res to extract features for its
+/// labelled blobs, tagging them with the given font name. Features are
+/// stored in an internal buffer; a summary count is printed at the end.
+void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
+  PAGE_RES_IT pr_it(page_res);
+  int trained_words = 0;
+  WERD_RES* word_res = pr_it.word();
+  while (word_res != nullptr) {
+    LearnWord(fontname.c_str(), word_res);
+    ++trained_words;
+    word_res = pr_it.forward();
+  }
+  tprintf("Generated training data for %d words\n", trained_words);
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
+void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
+  PAGE_RES_IT pr_it(page_res);
+  WERD_RES* word_res = pr_it.word();
+  while (word_res != nullptr) {
+    const int num_blobs = word_res->correct_text.size();
+    auto* fake_choice = new WERD_CHOICE(word_res->uch_set, num_blobs);
+    for (int b = 0; b < num_blobs; ++b) {
+      // The part before the first space is the real ground truth, and the
+      // rest is the bounding box location and page number.
+      std::vector<STRING> tokens;
+      word_res->correct_text[b].split(' ', &tokens);
+      const UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
+      fake_choice->append_unichar_id_space_allocated(
+          char_id, word_res->best_state[b], 0.0f, 0.0f);
+    }
+    word_res->ClearWordChoices();
+    word_res->LogNewRawChoice(fake_choice);
+    word_res->LogNewCookedChoice(1, false, fake_choice);
+    word_res = pr_it.forward();
+  }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/control.cpp b/tesseract/src/ccmain/control.cpp
new file mode 100644
index 00000000..50b0fb05
--- /dev/null
+++ b/tesseract/src/ccmain/control.cpp
@@ -0,0 +1,2110 @@
+/******************************************************************
+ * File: control.cpp (Formerly control.c)
+ * Description: Module-independent matcher controller.
+ * Author: Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include <cmath>
+#include <cstdint> // for int16_t, int32_t
+#include <cstdio> // for fclose, fopen, FILE
+#include <ctime> // for clock
+#include <cctype>
+#include "control.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "docqual.h"
+#include "drawfx.h"
+#include "fixspace.h"
+#endif
+#include "lstmrecognizer.h"
+#include <tesseract/ocrclass.h>
+#include "output.h"
+#include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
+#ifndef DISABLED_LEGACY_ENGINE
+#include "reject.h"
+#endif
+#include "sorthelper.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#include "werdit.h"
+
+const char* const kBackUpConfigFile = "tempconfigdata.config";
+// Min believable x-height for any text when refitting as a fraction of
+// original x-height
+const double kMinRefitXHeightFraction = 0.5;
+
+
+/**
+ * Make a temporary word from the selected blobs, run the interactive
+ * recognizer on it, then remove the temporary word again.
+ *
+ * @param page_res page to take the blobs from
+ * @param selection_box only blobs within this box are used
+ */
+namespace tesseract {
+
+void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
+                                  TBOX &selection_box) {
+  PAGE_RES_IT* pseudo_it = make_pseudo_word(page_res, selection_box);
+  if (pseudo_it == nullptr) return;  // No blobs inside the selection.
+  recog_interactive(pseudo_it);
+  pseudo_it->DeleteCurrentWord();
+  delete pseudo_it;
+}
+
+/**
+ * Recognize a single word in interactive mode.
+ *
+ * Runs the legacy engine's pass 2 when no LSTM recognizer is loaded,
+ * otherwise pass 1 (LSTM has no pass 2). Optionally prints quality metrics.
+ *
+ * @param pr_it the page results iterator
+ * @return always true (kept for interface compatibility).
+ */
+bool Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
+  WordData word_data(*pr_it);
+  SetupWordPassN(2, &word_data);
+  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
+  if (lstm_recognizer_ == nullptr) {
+#ifndef DISABLED_LEGACY_ENGINE
+    classify_word_and_language(2, pr_it, &word_data);
+#endif // ndef DISABLED_LEGACY_ENGINE
+  } else {
+    classify_word_and_language(1, pr_it, &word_data);
+  }
+#ifndef DISABLED_LEGACY_ENGINE
+  // Optionally report per-word quality statistics (legacy builds only).
+  if (tessedit_debug_quality_metrics) {
+    int16_t char_qual;
+    int16_t good_char_qual;
+    WERD_RES* word_res = pr_it->word();
+    word_char_quality(word_res, &char_qual, &good_char_qual);
+    tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
+            "char_quality: %d; good_char_quality: %d\n",
+            word_res->reject_map.length(),
+            word_blob_quality(word_res),
+            word_outline_errs(word_res), char_qual, good_char_qual);
+  }
+#endif // ndef DISABLED_LEGACY_ENGINE
+  return true;
+}
+
+// Helper function to check for a target word and handle it appropriately.
+// Inspired by Jetsoft's requirement to process only single words on pass2
+// and beyond.
+// If word_config is not null:
+// If the word_box and target_word_box overlap, read the word_config file
+// else reset to previous config data.
+// return true.
+// else
+// If the word_box and target_word_box overlap or pass <= 1, return true.
+// Note that this function uses a fixed temporary file for storing the previous
+// configs, so it is neither thread-safe, nor process-safe, but the assumption
+// is that it will only be used for one debug window at a time.
+//
+// Since this function is used for debugging (and not to change OCR results)
+// set only debug params from the word config file.
+bool Tesseract::ProcessTargetWord(const TBOX& word_box,
+                                  const TBOX& target_word_box,
+                                  const char* word_config,
+                                  int pass) {
+  if (word_config != nullptr) {
+    if (word_box.major_overlap(target_word_box)) {
+      // Entering the target word: save the current params to the backup
+      // config file (once), then apply the debug params from word_config.
+      if (backup_config_file_ == nullptr) {
+        backup_config_file_ = kBackUpConfigFile;
+        FILE* config_fp = fopen(backup_config_file_, "wb");
+        if (config_fp == nullptr) {
+          tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
+        } else {
+          ParamUtils::PrintParams(config_fp, params());
+          fclose(config_fp);
+        }
+        ParamUtils::ReadParamsFile(word_config,
+                                   SET_PARAM_CONSTRAINT_DEBUG_ONLY,
+                                   params());
+      }
+    } else {
+      // Outside the target word: restore the previously saved params.
+      if (backup_config_file_ != nullptr) {
+        ParamUtils::ReadParamsFile(backup_config_file_,
+                                   SET_PARAM_CONSTRAINT_DEBUG_ONLY,
+                                   params());
+        backup_config_file_ = nullptr;
+      }
+    }
+  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
+    // No debug config: beyond pass 1, only the target word is processed.
+    return false;
+  }
+  return true;
+}
+
+/** If tesseract is to be run, sets the words up ready for it. */
+void Tesseract::SetupAllWordsPassN(int pass_n,
+                                   const TBOX* target_word_box,
+                                   const char* word_config,
+                                   PAGE_RES* page_res,
+                                   std::vector<WordData>* words) {
+  // Collect every word on the page, unless a target box restricts the set.
+  PAGE_RES_IT page_res_it(page_res);
+  page_res_it.restart_page();
+  while (page_res_it.word() != nullptr) {
+    const bool wanted =
+        target_word_box == nullptr ||
+        ProcessTargetWord(page_res_it.word()->word->bounding_box(),
+                          *target_word_box, word_config, 1);
+    if (wanted) words->push_back(WordData(page_res_it));
+    page_res_it.forward();
+  }
+  // Set each word up for recognition (polygonal approximation) and chain
+  // it to its predecessor.
+  for (int index = 0; index < words->size(); ++index) {
+    SetupWordPassN(pass_n, &(*words)[index]);
+    if (index > 0) (*words)[index].prev_word = &(*words)[index - 1];
+  }
+}
+
+// Sets up the single word ready for whichever engine is to be run.
+// Pass 1 performs a full SetupForRecognition; pass 2 only refreshes the
+// cap/x heights. One retry copy of the word is made per sub-language,
+// plus one for the master language.
+void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
+  if (pass_n == 1 || !word->word->done) {
+    if (pass_n == 1) {
+      word->word->SetupForRecognition(unicharset, this, BestPix(),
+                                      tessedit_ocr_engine_mode, nullptr,
+                                      classify_bln_numeric_mode,
+                                      textord_use_cjk_fp_model,
+                                      poly_allow_detailed_fx,
+                                      word->row, word->block);
+    } else if (pass_n == 2) {
+      // TODO(rays) Should we do this on pass1 too?
+      word->word->caps_height = 0.0;
+      // Fall back to the row's x-height if the word has none of its own.
+      if (word->word->x_height == 0.0f)
+        word->word->x_height = word->row->x_height();
+    }
+    word->lang_words.truncate(0);
+    for (int s = 0; s <= sub_langs_.size(); ++s) {
+      // The sub_langs_.size() entry is for the master language.
+      Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
+      auto* word_res = new WERD_RES;
+      word_res->InitForRetryRecognition(*word->word);
+      word->lang_words.push_back(word_res);
+      // LSTM doesn't get setup for pass2.
+      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
+        word_res->SetupForRecognition(
+              lang_t->unicharset, lang_t, BestPix(),
+              lang_t->tessedit_ocr_engine_mode, nullptr,
+              lang_t->classify_bln_numeric_mode,
+              lang_t->textord_use_cjk_fp_model,
+              lang_t->poly_allow_detailed_fx, word->row, word->block);
+      }
+    }
+  }
+}
+
+// Runs word recognition on all the words. If a monitor is supplied it is
+// kept alive, given progress updates, and consulted for timeout/cancel; on
+// cancellation the remaining words are faked out and false is returned.
+bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
+                                   PAGE_RES_IT* pr_it,
+                                   std::vector<WordData>* words) {
+  // TODO(rays) Before this loop can be parallelized (it would yield a massive
+  // speed-up) all remaining member globals need to be converted to local/heap
+  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
+  // added. The results will be significantly different with adaption on, and
+  // deterioration will need investigation.
+  pr_it->restart_page();
+  for (int w = 0; w < words->size(); ++w) {
+    WordData* word = &(*words)[w];
+    if (w > 0) word->prev_word = &(*words)[w - 1];
+    if (monitor != nullptr) {
+      monitor->ocr_alive = true;
+      // Pass 1 covers progress 0-70%, pass 2 covers 70-100%.
+      if (pass_n == 1) {
+        monitor->progress = 70 * w / words->size();
+      } else {
+        monitor->progress = 70 + 30 * w / words->size();
+      }
+      if (monitor->progress_callback2 != nullptr) {
+        TBOX box = pr_it->word()->word->bounding_box();
+        (*monitor->progress_callback2)(monitor, box.left(),
+                                       box.right(), box.top(), box.bottom());
+      }
+      if (monitor->deadline_exceeded() ||
+          (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this,
+                                                            words->size()))) {
+        // Timeout. Fake out the rest of the words.
+        for (; w < words->size(); ++w) {
+          (*words)[w].word->SetupFake(unicharset);
+        }
+        return false;
+      }
+    }
+    if (word->word->tess_failed) {
+      int s;
+      for (s = 0; s < word->lang_words.size() &&
+           word->lang_words[s]->tess_failed; ++s) {}
+      // If all are failed, skip it. Image words are skipped by this test.
+      // Bug fix: the condition was "s > word->lang_words.size()", which can
+      // never be true (the loop exits with s <= size()), so fully-failed
+      // words were never skipped as the comment above intends.
+      if (s == word->lang_words.size()) continue;
+    }
+    // Sync pr_it with the wth WordData.
+    while (pr_it->word() != nullptr && pr_it->word() != word->word)
+      pr_it->forward();
+    ASSERT_HOST(pr_it->word() != nullptr);
+    bool make_next_word_fuzzy = false;
+    #ifndef DISABLED_LEGACY_ENGINE
+    if (!AnyLSTMLang() &&
+        ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
+      // Needs to be setup again to see the new outlines in the chopped_word.
+      SetupWordPassN(pass_n, word);
+    }
+    #endif // ndef DISABLED_LEGACY_ENGINE
+
+    classify_word_and_language(pass_n, pr_it, word);
+    if (tessedit_dump_choices || debug_noise_removal) {
+      tprintf("Pass%d: %s [%s]\n", pass_n,
+              word->word->best_choice->unichar_string().c_str(),
+              word->word->best_choice->debug_string().c_str());
+    }
+    pr_it->forward();
+    if (make_next_word_fuzzy && pr_it->word() != nullptr) {
+      pr_it->MakeCurrentWordFuzzy();
+    }
+  }
+  return true;
+}
+
+/**
+ * recog_all_words()
+ *
+ * Walk the page_res, recognizing all the words.
+ * If monitor is not null, it is used as a progress monitor/timeout/cancel.
+ * If dopasses is 0, all recognition passes are run,
+ * 1 just pass 1, 2 passes2 and higher.
+ * If target_word_box is not null, special things are done to words that
+ * overlap the target_word_box:
+ * if word_config is not null, the word config file is read for just the
+ * target word(s), otherwise, on pass 2 and beyond ONLY the target words
+ * are processed (Jetsoft modification.)
+ * Returns false if we cancelled prematurely.
+ *
+ * @param page_res page structure
+ * @param monitor progress monitor
+ * @param word_config word_config file
+ * @param target_word_box specifies just to extract a rectangle
+ * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher
+ */
+
+bool Tesseract::recog_all_words(PAGE_RES* page_res,
+                                ETEXT_DESC* monitor,
+                                const TBOX* target_word_box,
+                                const char* word_config,
+                                int dopasses) {
+  PAGE_RES_IT page_res_it(page_res);
+
+  // Minimal rejection on pass 1 implies test adaption as well.
+  if (tessedit_minimal_rej_pass1) {
+    tessedit_test_adaption.set_value (true);
+    tessedit_minimal_rejection.set_value (true);
+  }
+
+  if (dopasses==0 || dopasses==1) {
+    page_res_it.restart_page();
+    // ****************** Pass 1 *******************
+
+  #ifndef DISABLED_LEGACY_ENGINE
+    // If the adaptive classifier is full switch to one we prepared earlier,
+    // ie on the previous page. If the current adaptive classifier is non-empty,
+    // prepare a backup starting at this page, in case it fills up. Do all this
+    // independently for each language.
+    if (AdaptiveClassifierIsFull()) {
+      SwitchAdaptiveClassifier();
+    } else if (!AdaptiveClassifierIsEmpty()) {
+      StartBackupAdaptiveClassifier();
+    }
+    // Now check the sub-langs as well.
+    for (int i = 0; i < sub_langs_.size(); ++i) {
+      if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
+        sub_langs_[i]->SwitchAdaptiveClassifier();
+      } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
+        sub_langs_[i]->StartBackupAdaptiveClassifier();
+      }
+    }
+
+  #endif // ndef DISABLED_LEGACY_ENGINE
+
+    // Set up all words ready for recognition, so that if parallelism is on
+    // all the input and output classes are ready to run the classifier.
+    std::vector<WordData> words;
+    SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
+  #ifndef DISABLED_LEGACY_ENGINE
+    if (tessedit_parallelize) {
+      PrerecAllWordsPar(words);
+    }
+  #endif // ndef DISABLED_LEGACY_ENGINE
+
+    // Reset the per-document statistics before the pass fills them in.
+    stats_.word_count = words.size();
+
+    stats_.dict_words = 0;
+    stats_.doc_blob_quality = 0;
+    stats_.doc_outline_errs = 0;
+    stats_.doc_char_quality = 0;
+    stats_.good_char_count = 0;
+    stats_.doc_good_char_quality = 0;
+
+    most_recently_used_ = this;
+    // Run pass 1 word recognition.
+    if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
+    // Pass 1 post-processing.
+    for (page_res_it.restart_page(); page_res_it.word() != nullptr;
+         page_res_it.forward()) {
+      if (page_res_it.word()->word->flag(W_REP_CHAR)) {
+        fix_rep_char(&page_res_it);
+        continue;
+      }
+
+      // Count dict words.
+      if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
+        ++(stats_.dict_words);
+
+      // Update misadaption log (we only need to do it on pass 1, since
+      // adaption only happens on this pass).
+      if (page_res_it.word()->blamer_bundle != nullptr &&
+          page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
+        page_res->misadaption_log.push_back(
+            page_res_it.word()->blamer_bundle->misadaption_debug());
+      }
+    }
+  }
+
+  // Stop here if only pass 1 was requested.
+  if (dopasses == 1) return true;
+
+  #ifndef DISABLED_LEGACY_ENGINE
+
+  // ****************** Pass 2 *******************
+  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
+      AnyTessLang()) {
+    page_res_it.restart_page();
+    std::vector<WordData> words;
+    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
+    if (tessedit_parallelize) {
+      PrerecAllWordsPar(words);
+    }
+    most_recently_used_ = this;
+    // Run pass 2 word recognition.
+    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
+  }
+
+  // The next passes are only required for Tess-only.
+  if (AnyTessLang() && !AnyLSTMLang()) {
+    // ****************** Pass 3 *******************
+    // Fix fuzzy spaces.
+
+    if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
+        && !tessedit_word_for_word && !right_to_left())
+      fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
+
+    // ****************** Pass 4 *******************
+    if (tessedit_enable_dict_correction) dictionary_correction_pass(page_res);
+    if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res);
+
+    // ****************** Pass 5,6 *******************
+    rejection_passes(page_res, monitor, target_word_box, word_config);
+
+    // ****************** Pass 8 *******************
+    font_recognition_pass(page_res);
+
+    // ****************** Pass 9 *******************
+    // Check the correctness of the final results.
+    blamer_pass(page_res);
+    script_pos_pass(page_res);
+  }
+
+  #endif // ndef DISABLED_LEGACY_ENGINE
+
+  // Write results pass.
+  // This is now redundant, but retained commented so show how to obtain
+  // bounding boxes and style information.
+
+  #ifndef DISABLED_LEGACY_ENGINE
+  // changed by jetsoft
+  // needed for dll to output memory structure
+  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
+    output_pass(page_res_it, target_word_box);
+  // end jetsoft
+  #endif //ndef DISABLED_LEGACY_ENGINE
+
+  const auto pageseg_mode = static_cast<PageSegMode>(
+      static_cast<int>(tessedit_pageseg_mode));
+  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
+
+  // Remove empty words, as these mess up the result iterators.
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
+       page_res_it.forward()) {
+    const WERD_RES* word = page_res_it.word();
+    const POLY_BLOCK* pb = page_res_it.block()->block != nullptr
+                               ? page_res_it.block()->block->pdblk.poly_block()
+                               : nullptr;
+    if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
+        (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
+      page_res_it.DeleteCurrentWord();
+    }
+  }
+
+  if (monitor != nullptr) {
+    monitor->progress = 100;
+  }
+  return true;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Bigram correction pass: walks adjacent pairs of words that share a
+// language model. If the pair of top choices is not a valid dictionary
+// bigram but some pair of alternative choices is, the best such pair (by
+// summed rating) replaces the top choices. Superscript spans are ignored
+// when matching against the bigram dictionary.
+void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
+  PAGE_RES_IT word_it(page_res);
+
+  WERD_RES *w_prev = nullptr;
+  WERD_RES *w = word_it.word();
+  while (true) {
+    w_prev = w;
+    while (word_it.forward() != nullptr &&
+           (!word_it.word() || word_it.word()->part_of_combo)) {
+      // advance word_it, skipping over parts of combos
+    }
+    if (!word_it.word()) break;
+    w = word_it.word();
+    if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
+      continue;
+    }
+    if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
+      if (tessedit_bigram_debug) {
+        tprintf("Skipping because one of the words is W_REP_CHAR\n");
+      }
+      continue;
+    }
+    // Two words sharing the same language model, excellent!
+    GenericVector<WERD_CHOICE *> overrides_word1;
+    GenericVector<WERD_CHOICE *> overrides_word2;
+
+    const STRING orig_w1_str = w_prev->best_choice->unichar_string();
+    const STRING orig_w2_str = w->best_choice->unichar_string();
+    // Strip superscripts from both top choices before the bigram lookup.
+    WERD_CHOICE prev_best(w->uch_set);
+    {
+      int w1start, w1end;
+      w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
+      prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
+    }
+    WERD_CHOICE this_best(w->uch_set);
+    {
+      int w2start, w2end;
+      w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
+      this_best = w->best_choice->shallow_copy(w2start, w2end);
+    }
+
+    if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
+      if (tessedit_bigram_debug) {
+        tprintf("Top choice \"%s %s\" verified by bigram model.\n",
+                orig_w1_str.c_str(), orig_w2_str.c_str());
+      }
+      continue;
+    }
+    if (tessedit_bigram_debug > 2) {
+      tprintf("Examining alt choices for \"%s %s\".\n",
+              orig_w1_str.c_str(), orig_w2_str.c_str());
+    }
+    if (tessedit_bigram_debug > 1) {
+      if (!w_prev->best_choices.singleton()) {
+        w_prev->PrintBestChoices();
+      }
+      if (!w->best_choices.singleton()) {
+        w->PrintBestChoices();
+      }
+    }
+    // Search all choice pairs for valid bigrams, keeping the lowest summed
+    // rating. The first match always wins via the size()==1 test below.
+    float best_rating = 0.0;
+    int best_idx = 0;
+    WERD_CHOICE_IT prev_it(&w_prev->best_choices);
+    for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
+      WERD_CHOICE *p1 = prev_it.data();
+      WERD_CHOICE strip1(w->uch_set);
+      {
+        int p1start, p1end;
+        p1->GetNonSuperscriptSpan(&p1start, &p1end);
+        strip1 = p1->shallow_copy(p1start, p1end);
+      }
+      WERD_CHOICE_IT w_it(&w->best_choices);
+      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+        WERD_CHOICE *p2 = w_it.data();
+        WERD_CHOICE strip2(w->uch_set);
+        {
+          int p2start, p2end;
+          p2->GetNonSuperscriptSpan(&p2start, &p2end);
+          strip2 = p2->shallow_copy(p2start, p2end);
+        }
+        if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
+          overrides_word1.push_back(p1);
+          overrides_word2.push_back(p2);
+          if (overrides_word1.size() == 1 ||
+              p1->rating() + p2->rating() < best_rating) {
+            best_rating = p1->rating() + p2->rating();
+            best_idx = overrides_word1.size() - 1;
+          }
+        }
+      }
+    }
+    if (!overrides_word1.empty()) {
+      // Excellent, we have some bigram matches.
+      if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
+                                            *overrides_word1[best_idx]) &&
+          EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
+                                            *overrides_word2[best_idx])) {
+        if (tessedit_bigram_debug > 1) {
+          tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
+                  "model.\n", orig_w1_str.c_str(), orig_w2_str.c_str());
+        }
+        continue;
+      }
+      const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
+      const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
+      if (new_w1_str != orig_w1_str) {
+        w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
+      }
+      if (new_w2_str != orig_w2_str) {
+        w->ReplaceBestChoice(overrides_word2[best_idx]);
+      }
+      if (tessedit_bigram_debug > 0) {
+        STRING choices_description;
+        int num_bigram_choices
+            = overrides_word1.size() * overrides_word2.size();
+        if (num_bigram_choices == 1) {
+          choices_description = "This was the unique bigram choice.";
+        } else {
+          if (tessedit_bigram_debug > 1) {
+            STRING bigrams_list;
+            const int kMaxChoicesToPrint = 20;
+            for (int i = 0; i < overrides_word1.size() &&
+                 i < kMaxChoicesToPrint; i++) {
+              if (i > 0) { bigrams_list += ", "; }
+              WERD_CHOICE *p1 = overrides_word1[i];
+              WERD_CHOICE *p2 = overrides_word2[i];
+              bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
+            }
+            choices_description = "There were many choices: {";
+            choices_description += bigrams_list;
+            choices_description += "}";
+          } else {
+            choices_description.add_str_int("There were ", num_bigram_choices);
+            choices_description += " compatible bigrams.";
+          }
+        }
+        tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
+                orig_w1_str.c_str(), orig_w2_str.c_str(),
+                new_w1_str.c_str(), new_w2_str.c_str(),
+                choices_description.c_str());
+      }
+    }
+  }
+}
+
+// Passes 5 and 6: gathers per-word reject/quality statistics (pass 5),
+// decides whether the document as a whole is of good quality, then applies
+// document/block-level quality-based rejection (pass 6).
+void Tesseract::rejection_passes(PAGE_RES* page_res,
+                                 ETEXT_DESC* monitor,
+                                 const TBOX* target_word_box,
+                                 const char* word_config) {
+  PAGE_RES_IT page_res_it(page_res);
+  // ****************** Pass 5 *******************
+  // Gather statistics on rejects.
+  int word_index = 0;
+  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
+    WERD_RES* word = page_res_it.word();
+    word_index++;
+    if (monitor != nullptr) {
+      monitor->ocr_alive = true;
+      // This pass covers the final 95-100% of the progress range.
+      monitor->progress = 95 + 5 * word_index / stats_.word_count;
+    }
+    if (word->rebuild_word == nullptr) {
+      // Word was not processed by tesseract.
+      page_res_it.forward();
+      continue;
+    }
+    check_debug_pt(word, 70);
+
+    // changed by jetsoft
+    // specific to its needs to extract one word when need
+    if (target_word_box &&
+        !ProcessTargetWord(word->word->bounding_box(),
+                           *target_word_box, word_config, 4)) {
+      page_res_it.forward();
+      continue;
+    }
+    // end jetsoft
+
+    page_res_it.rej_stat_word();
+    const int chars_in_word = word->reject_map.length();
+    const int rejects_in_word = word->reject_map.reject_count();
+
+    const int blob_quality = word_blob_quality(word);
+    stats_.doc_blob_quality += blob_quality;
+    const int outline_errs = word_outline_errs(word);
+    stats_.doc_outline_errs += outline_errs;
+    int16_t all_char_quality;
+    int16_t accepted_all_char_quality;
+    word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
+    stats_.doc_char_quality += all_char_quality;
+    const uint8_t permuter_type = word->best_choice->permuter();
+    // Only dictionary words contribute to the "good character" statistics.
+    if ((permuter_type == SYSTEM_DAWG_PERM) ||
+        (permuter_type == FREQ_DAWG_PERM) ||
+        (permuter_type == USER_DAWG_PERM)) {
+      stats_.good_char_count += chars_in_word - rejects_in_word;
+      stats_.doc_good_char_quality += accepted_all_char_quality;
+    }
+    check_debug_pt(word, 80);
+    if (tessedit_reject_bad_qual_wds &&
+        (blob_quality == 0) && (outline_errs >= chars_in_word))
+      word->reject_map.rej_word_bad_quality();
+    check_debug_pt(word, 90);
+    page_res_it.forward();
+  }
+
+  if (tessedit_debug_quality_metrics) {
+    tprintf
+      ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
+       " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
+       page_res->char_count, page_res->rej_count,
+       page_res->rej_count / static_cast<float>(page_res->char_count),
+       stats_.doc_blob_quality,
+       stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
+       stats_.doc_outline_errs,
+       stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
+       stats_.doc_char_quality,
+       stats_.doc_char_quality / static_cast<float>(page_res->char_count),
+       stats_.doc_good_char_quality,
+       (stats_.good_char_count > 0) ?
+       (stats_.doc_good_char_quality /
+        static_cast<float>(stats_.good_char_count)) : 0.0);
+  }
+  // The document counts as good quality only if all four per-character
+  // metrics clear their configured thresholds.
+  bool good_quality_doc =
+      ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
+       quality_rej_pc) &&
+      (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
+       quality_blob_pc) &&
+      (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
+       quality_outline_pc) &&
+      (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
+       quality_char_pc);
+
+  // ****************** Pass 6 *******************
+  // Do whole document or whole block rejection pass
+  if (!tessedit_test_adaption) {
+    quality_based_rejection(page_res_it, good_quality_doc);
+  }
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// When wordrec_run_blamer is set, scores every word against its blamer
+// ground truth, tallies the incorrect-result reasons over the page and
+// prints the tally plus any misadaption log entries.
+void Tesseract::blamer_pass(PAGE_RES* page_res) {
+  if (!wordrec_run_blamer) return;
+  PAGE_RES_IT it(page_res);
+  it.restart_page();
+  while (it.word() != nullptr) {
+    WERD_RES* word_res = it.word();
+    BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word_res);
+    page_res->blame_reasons[word_res->blamer_bundle->incorrect_result_reason()]++;
+    it.forward();
+  }
+  tprintf("Blame reasons:\n");
+  for (int reason = 0; reason < IRR_NUM_REASONS; ++reason) {
+    tprintf("%s %d\n",
+            BlamerBundle::IncorrectReasonName(
+                static_cast<IncorrectResultReason>(reason)),
+            page_res->blame_reasons[reason]);
+  }
+  if (page_res->misadaption_log.size() > 0) {
+    tprintf("Misadaption log:\n");
+    for (int entry = 0; entry < page_res->misadaption_log.size(); ++entry) {
+      tprintf("%s\n", page_res->misadaption_log[entry].c_str());
+    }
+  }
+}
+
+// Sets script positions and detects smallcaps on all output words.
+void Tesseract::script_pos_pass(PAGE_RES* page_res) {
+  PAGE_RES_IT page_res_it(page_res);
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
+       page_res_it.forward()) {
+    WERD_RES* word = page_res_it.word();
+    if (word->word->flag(W_REP_CHAR)) {
+      // NOTE(review): this forward() plus the loop's own forward() advances
+      // two words, so the word following a W_REP_CHAR word is skipped as
+      // well - confirm this is intended.
+      page_res_it.forward();
+      continue;
+    }
+    const float x_height = page_res_it.block()->block->x_height();
+    float word_x_height = word->x_height;
+    // Clamp implausible word x-heights to the midpoint of the choice's
+    // allowed range.
+    if (word_x_height < word->best_choice->min_x_height() ||
+        word_x_height > word->best_choice->max_x_height()) {
+      word_x_height = (word->best_choice->min_x_height() +
+                       word->best_choice->max_x_height()) / 2.0f;
+    }
+    // Test for small caps. Word capheight must be close to block xheight,
+    // and word must contain no lower case letters, and at least one upper case.
+    const double small_cap_xheight = x_height * kXHeightCapRatio;
+    const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
+    if (word->uch_set->script_has_xheight() &&
+        small_cap_xheight - small_cap_delta <= word_x_height &&
+        word_x_height <= small_cap_xheight + small_cap_delta) {
+      // Scan for upper/lower.
+      int num_upper = 0;
+      int num_lower = 0;
+      for (int i = 0; i < word->best_choice->length(); ++i) {
+        if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
+          ++num_upper;
+        else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
+          ++num_lower;
+      }
+      if (num_upper > 0 && num_lower == 0)
+        word->small_caps = true;
+    }
+    word->SetScriptPositions();
+  }
+}
+
+// Helper finds the gap between the index word and the next: *right gets the
+// right edge of words[index], *next_left the left edge of words[index + 1].
+// Sentinel values (-INT32_MAX / INT32_MAX) are left where no word exists.
+static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
+                    int* next_left) {
+  *right = -INT32_MAX;
+  *next_left = INT32_MAX;
+  if (index >= words.size()) return;  // Out of range: keep the sentinels.
+  *right = words[index]->word->bounding_box().right();
+  if (index + 1 < words.size()) {
+    *next_left = words[index + 1]->word->bounding_box().left();
+  }
+}
+
+// Factored helper computes the rating, certainty, badness and validity of
+// the permuter of the words in [first_index, end_index).
+// The outputs are accumulated into, not reset, by this function.
+static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
+                             int first_index, int end_index, float* rating,
+                             float* certainty, bool* bad,
+                             bool* valid_permuter) {
+  if (end_index <= first_index) {
+    // An empty span counts as bad with no valid permuter.
+    *bad = true;
+    *valid_permuter = false;
+  }
+  for (int i = first_index; i < end_index && i < words.size(); ++i) {
+    WERD_CHOICE* choice = words[i]->best_choice;
+    if (choice == nullptr) {
+      *bad = true;
+      continue;
+    }
+    *rating += choice->rating();
+    *certainty = std::min(*certainty, choice->certainty());
+    if (!Dict::valid_word_permuter(choice->permuter(), false)) {
+      *valid_permuter = false;
+    }
+  }
+}
+
// Helper chooses the best combination of words, transferring good ones from
// new_words to best_words. To win, a new word must have (better rating and
// certainty) or (better permuter status and rating within rating ratio and
// certainty within certainty margin) than current best.
// All the new_words are consumed (moved to best_words or deleted.)
// The return value is the number of new_words used minus the number of
// best_words that remain in the output.
static int SelectBestWords(double rating_ratio,
                           double certainty_margin,
                           bool debug,
                           PointerVector<WERD_RES>* new_words,
                           PointerVector<WERD_RES>* best_words) {
  // Process the smallest groups of words that have an overlapping word
  // boundary at the end.
  GenericVector<WERD_RES*> out_words;
  // Index into each word vector (best, new).
  int b = 0, n = 0;
  int num_best = 0, num_new = 0;
  while (b < best_words->size() || n < new_words->size()) {
    // Start of the current run in each.
    int start_b = b, start_n = n;
    // Grow the runs [start_b,b] and [start_n,n] until both segmentations
    // agree on a word boundary, i.e. the gap after best word b overlaps the
    // gap after new word n, so both runs cover the same x-range.
    while (b < best_words->size() || n < new_words->size()) {
      int b_right = -INT32_MAX;
      int next_b_left = INT32_MAX;
      WordGap(*best_words, b, &b_right, &next_b_left);
      int n_right = -INT32_MAX;
      int next_n_left = INT32_MAX;
      WordGap(*new_words, n, &n_right, &next_n_left);
      if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
        // The word breaks overlap. [start_b,b] and [start_n, n] match.
        break;
      }
      // Keep searching for the matching word break: advance whichever side
      // currently ends further left (or whichever still has words left).
      if ((b_right < n_right && b < best_words->size()) ||
          n == new_words->size())
        ++b;
      else
        ++n;
    }
    // Rating of the current run in each.
    float b_rating = 0.0f, n_rating = 0.0f;
    // Certainty of the current run in each.
    float b_certainty = 0.0f, n_certainty = 0.0f;
    // True if any word is missing its best choice.
    bool b_bad = false, n_bad = false;
    // True if all words have a valid permuter.
    bool b_valid_permuter = true, n_valid_permuter = true;
    const int end_b = b < best_words->size() ? b + 1 : b;
    const int end_n = n < new_words->size() ? n + 1 : n;
    EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
                     &b_bad, &b_valid_permuter);
    EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
                     &n_bad, &n_valid_permuter);
    bool new_better = false;
    // The new run wins if the old run is bad, or the new beats the old on
    // both rating and certainty, or the new has a valid dictionary permuter
    // where the old does not and is within the allowed rating ratio and
    // certainty margin of the old.
    if (!n_bad && (b_bad || (n_certainty > b_certainty &&
                             n_rating < b_rating) ||
                   (!b_valid_permuter && n_valid_permuter &&
                    n_rating < b_rating * rating_ratio &&
                    n_certainty > b_certainty - certainty_margin))) {
      // New is better. Transfer ownership of the run to out_words.
      for (int i = start_n; i < end_n; ++i) {
        out_words.push_back((*new_words)[i]);
        (*new_words)[i] = nullptr;
        ++num_new;
      }
      new_better = true;
    } else if (!b_bad) {
      // Current best is better. Keep the old run.
      for (int i = start_b; i < end_b; ++i) {
        out_words.push_back((*best_words)[i]);
        (*best_words)[i] = nullptr;
        ++num_best;
      }
    }
    if (debug) {
      tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
              " valid dict: %d v %d\n",
              end_n - start_n, new_better ? "better" : "worse",
              end_b - start_b, n_rating, b_rating,
              n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
    }
    // Move on to the next group.
    b = end_b;
    n = end_n;
  }
  // Transfer from out_words to best_words. Unclaimed words (still owned by
  // the PointerVectors) are deleted when new_words/best_words are cleared.
  best_words->clear();
  for (int i = 0; i < out_words.size(); ++i)
    best_words->push_back(out_words[i]);
  return num_new - num_best;
}
+
+// Helper to recognize the word using the given (language-specific) tesseract.
+// Returns positive if this recognizer found more new best words than the
+// number kept from best_words.
+int Tesseract::RetryWithLanguage(const WordData& word_data,
+ WordRecognizer recognizer, bool debug,
+ WERD_RES** in_word,
+ PointerVector<WERD_RES>* best_words) {
+ if (debug) {
+ tprintf("Trying word using lang %s, oem %d\n",
+ lang.c_str(), static_cast<int>(tessedit_ocr_engine_mode));
+ }
+ // Run the recognizer on the word.
+ PointerVector<WERD_RES> new_words;
+ (this->*recognizer)(word_data, in_word, &new_words);
+ if (new_words.empty()) {
+ // Transfer input word to new_words, as the classifier must have put
+ // the result back in the input.
+ new_words.push_back(*in_word);
+ *in_word = nullptr;
+ }
+ if (debug) {
+ for (int i = 0; i < new_words.size(); ++i)
+ new_words[i]->DebugTopChoice("Lang result");
+ }
+ // Initial version is a bit of a hack based on better certainty and rating
+ // or a dictionary vs non-dictionary word.
+ return SelectBestWords(classify_max_rating_ratio,
+ classify_max_certainty_margin,
+ debug, &new_words, best_words);
+}
+
+// Helper returns true if all the words are acceptable.
+static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
+ for (int w = 0; w < words.size(); ++w) {
+ if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
+ }
+ return true;
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Moves good-looking "noise"/diacritics from the reject list to the main
+// blob list on the current word. Returns true if anything was done, and
+// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+ bool* make_next_word_fuzzy) {
+ *make_next_word_fuzzy = false;
+ WERD* real_word = pr_it->word()->word;
+ if (real_word->rej_cblob_list()->empty() ||
+ real_word->cblob_list()->empty() ||
+ real_word->rej_cblob_list()->length() > noise_maxperword)
+ return false;
+ real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
+ // Get the noise outlines into a vector with matching bool map.
+ GenericVector<C_OUTLINE*> outlines;
+ real_word->GetNoiseOutlines(&outlines);
+ GenericVector<bool> word_wanted;
+ GenericVector<bool> overlapped_any_blob;
+ GenericVector<C_BLOB*> target_blobs;
+ AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
+ &word_wanted, &overlapped_any_blob,
+ &target_blobs);
+ // Filter the outlines that overlapped any blob and put them into the word
+ // now. This simplifies the remaining task and also makes it more accurate
+ // as it has more completed blobs to work on.
+ GenericVector<bool> wanted;
+ GenericVector<C_BLOB*> wanted_blobs;
+ GenericVector<C_OUTLINE*> wanted_outlines;
+ int num_overlapped = 0;
+ int num_overlapped_used = 0;
+ for (int i = 0; i < overlapped_any_blob.size(); ++i) {
+ if (overlapped_any_blob[i]) {
+ ++num_overlapped;
+ if (word_wanted[i]) ++num_overlapped_used;
+ wanted.push_back(word_wanted[i]);
+ wanted_blobs.push_back(target_blobs[i]);
+ wanted_outlines.push_back(outlines[i]);
+ outlines[i] = nullptr;
+ }
+ }
+ real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
+ AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
+ &target_blobs);
+ int non_overlapped = 0;
+ int non_overlapped_used = 0;
+ for (int i = 0; i < word_wanted.size(); ++i) {
+ if (word_wanted[i]) ++non_overlapped_used;
+ if (outlines[i] != nullptr) ++non_overlapped_used;
+ }
+ if (debug_noise_removal) {
+ tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
+ num_overlapped_used, num_overlapped, non_overlapped_used,
+ non_overlapped);
+ real_word->bounding_box().print();
+ }
+ // Now we have decided which outlines we want, put them into the real_word.
+ if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
+ make_next_word_fuzzy)) {
+ pr_it->MakeCurrentWordFuzzy();
+ }
+ // TODO(rays) Parts of combos have a deep copy of the real word, and need
+ // to have their noise outlines moved/assigned in the same way!!
+ return num_overlapped_used != 0 || non_overlapped_used != 0;
+}
+
+// Attempts to put noise/diacritic outlines into the blobs that they overlap.
+// Input: a set of noisy outlines that probably belong to the real_word.
+// Output: word_wanted indicates which outlines are to be assigned to a blob,
+// target_blobs indicates which to assign to, and overlapped_any_blob is
+// true for all outlines that overlapped a blob.
+void Tesseract::AssignDiacriticsToOverlappingBlobs(
+ const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+ PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+ GenericVector<bool>* overlapped_any_blob,
+ GenericVector<C_BLOB*>* target_blobs) {
+ std::vector<bool> blob_wanted;
+ word_wanted->resize(outlines.size(), false);
+ overlapped_any_blob->resize(outlines.size(), false);
+ target_blobs->resize(outlines.size(), nullptr);
+ // For each real blob, find the outlines that seriously overlap it.
+ // A single blob could be several merged characters, so there can be quite
+ // a few outlines overlapping, and the full engine needs to be used to chop
+ // and join to get a sensible result.
+ C_BLOB_IT blob_it(real_word->cblob_list());
+ for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+ C_BLOB* blob = blob_it.data();
+ const TBOX blob_box = blob->bounding_box();
+ blob_wanted.resize(outlines.size(), false);
+ int num_blob_outlines = 0;
+ for (int i = 0; i < outlines.size(); ++i) {
+ if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
+ !(*word_wanted)[i]) {
+ blob_wanted[i] = true;
+ (*overlapped_any_blob)[i] = true;
+ ++num_blob_outlines;
+ }
+ }
+ if (debug_noise_removal) {
+ tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
+ blob_box.print();
+ }
+ // If any outlines overlap the blob, and not too many, classify the blob
+ // (using the full engine, languages and all), and choose the maximal
+ // combination of outlines that doesn't hurt the end-result classification
+ // by too much. Mark them as wanted.
+ if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
+ if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
+ outlines, num_blob_outlines,
+ &blob_wanted)) {
+ for (int i = 0; i < blob_wanted.size(); ++i) {
+ if (blob_wanted[i]) {
+ // Claim the outline and record where it is going.
+ (*word_wanted)[i] = true;
+ (*target_blobs)[i] = blob;
+ }
+ }
+ }
+ }
+ }
+}
+
+// Attempts to assign non-overlapping outlines to their nearest blobs or
+// make new blobs out of them.
+void Tesseract::AssignDiacriticsToNewBlobs(
+ const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+ PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+ GenericVector<C_BLOB*>* target_blobs) {
+ std::vector<bool> blob_wanted;
+ word_wanted->resize(outlines.size(), false);
+ target_blobs->resize(outlines.size(), nullptr);
+ // Check for outlines that need to be turned into stand-alone blobs.
+ for (int i = 0; i < outlines.size(); ++i) {
+ if (outlines[i] == nullptr) continue;
+ // Get a set of adjacent outlines that don't overlap any existing blob.
+ blob_wanted.resize(outlines.size(), false);
+ int num_blob_outlines = 0;
+ TBOX total_ol_box(outlines[i]->bounding_box());
+ while (i < outlines.size() && outlines[i] != nullptr) {
+ blob_wanted[i] = true;
+ total_ol_box += outlines[i]->bounding_box();
+ ++i;
+ ++num_blob_outlines;
+ }
+ // Find the insertion point.
+ C_BLOB_IT blob_it(real_word->cblob_list());
+ while (!blob_it.at_last() &&
+ blob_it.data_relative(1)->bounding_box().left() <=
+ total_ol_box.left()) {
+ blob_it.forward();
+ }
+ // Choose which combination of them we actually want and where to put
+ // them.
+ if (debug_noise_removal)
+ tprintf("Num blobless outlines = %d\n", num_blob_outlines);
+ C_BLOB* left_blob = blob_it.data();
+ TBOX left_box = left_blob->bounding_box();
+ C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
+ if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
+ !right_blob->bounding_box().x_overlap(total_ol_box)) &&
+ SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
+ outlines, num_blob_outlines,
+ &blob_wanted)) {
+ if (debug_noise_removal) tprintf("Added to left blob\n");
+ for (int j = 0; j < blob_wanted.size(); ++j) {
+ if (blob_wanted[j]) {
+ (*word_wanted)[j] = true;
+ (*target_blobs)[j] = left_blob;
+ }
+ }
+ } else if (right_blob != nullptr &&
+ (!left_box.x_overlap(total_ol_box) ||
+ right_blob->bounding_box().x_overlap(total_ol_box)) &&
+ SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
+ right_blob, outlines,
+ num_blob_outlines, &blob_wanted)) {
+ if (debug_noise_removal) tprintf("Added to right blob\n");
+ for (int j = 0; j < blob_wanted.size(); ++j) {
+ if (blob_wanted[j]) {
+ (*word_wanted)[j] = true;
+ (*target_blobs)[j] = right_blob;
+ }
+ }
+ } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr,
+ outlines, num_blob_outlines,
+ &blob_wanted)) {
+ if (debug_noise_removal) tprintf("Fitted between blobs\n");
+ for (int j = 0; j < blob_wanted.size(); ++j) {
+ if (blob_wanted[j]) {
+ (*word_wanted)[j] = true;
+ (*target_blobs)[j] = nullptr;
+ }
+ }
+ }
+ }
+}
+
+// Starting with ok_outlines set to indicate which outlines overlap the blob,
+// chooses the optimal set (approximately) and returns true if any outlines
+// are desired, in which case ok_outlines indicates which ones.
+bool Tesseract::SelectGoodDiacriticOutlines(
+ int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
+ const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
+ std::vector<bool>* ok_outlines) {
+ STRING best_str;
+ float target_cert = certainty_threshold;
+ if (blob != nullptr) {
+ float target_c2;
+ target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
+ if (debug_noise_removal) {
+ tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(),
+ target_cert, target_c2);
+ blob->bounding_box().print();
+ }
+ target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
+ }
+ std::vector<bool> test_outlines = *ok_outlines;
+ // Start with all the outlines in.
+ STRING all_str;
+ std::vector<bool> best_outlines = *ok_outlines;
+ float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+ pr_it, blob, &all_str);
+ if (debug_noise_removal) {
+ TBOX ol_box;
+ for (int i = 0; i < test_outlines.size(); ++i) {
+ if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
+ }
+ tprintf("All Noise blob classified as %s=%g, delta=%g at:",
+ all_str.c_str(), best_cert, best_cert - target_cert);
+ ol_box.print();
+ }
+ // Iteratively zero out the bit that improves the certainty the most, until
+ // we get past the threshold, have zero bits, or fail to improve.
+ int best_index = 0; // To zero out.
+ while (num_outlines > 1 && best_index >= 0 &&
+ (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
+ // Find the best bit to zero out.
+ best_index = -1;
+ for (int i = 0; i < outlines.size(); ++i) {
+ if (test_outlines[i]) {
+ test_outlines[i] = false;
+ STRING str;
+ float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+ pr_it, blob, &str);
+ if (debug_noise_removal) {
+ TBOX ol_box;
+ for (int j = 0; j < outlines.size(); ++j) {
+ if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
+ tprintf("%c", test_outlines[j] ? 'T' : 'F');
+ }
+ tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(),
+ cert, cert - target_cert);
+ ol_box.print();
+ }
+ if (cert > best_cert) {
+ best_cert = cert;
+ best_index = i;
+ best_outlines = test_outlines;
+ }
+ test_outlines[i] = true;
+ }
+ }
+ if (best_index >= 0) {
+ test_outlines[best_index] = false;
+ --num_outlines;
+ }
+ }
+ if (best_cert >= target_cert) {
+ // Save the best combination.
+ *ok_outlines = best_outlines;
+ if (debug_noise_removal) {
+ tprintf("%s noise combination ", blob ? "Adding" : "New");
+ for (int i = 0; i < best_outlines.size(); ++i) {
+ tprintf("%c", best_outlines[i] ? 'T' : 'F');
+ }
+ tprintf(" yields certainty %g, beating target of %g\n", best_cert,
+ target_cert);
+ }
+ return true;
+ }
+
+ return false;
+}
+
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
// If blob is nullptr, a temporary blob is built from the flagged outlines
// and the negated c2 value (certainty^2/rating, see ClassifyBlobAsWord) is
// returned instead of the plain certainty.
float Tesseract::ClassifyBlobPlusOutlines(
    const std::vector<bool>& ok_outlines,
    const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
    C_BLOB* blob, STRING* best_str) {
  C_OUTLINE_IT ol_it;
  C_OUTLINE* first_to_keep = nullptr;
  C_BLOB* local_blob = nullptr;
  if (blob != nullptr) {
    // Add the required outlines to the blob.
    // Remember the original first outline so the added ones can be removed
    // again afterwards.
    ol_it.set_to_list(blob->out_list());
    first_to_keep = ol_it.data();
  }
  for (int i = 0; i < ok_outlines.size(); ++i) {
    if (ok_outlines[i]) {
      // This outline is to be added.
      if (blob == nullptr) {
        // No blob was supplied: build a temporary one around the first
        // flagged outline; subsequent outlines are added to it below.
        local_blob = new C_BLOB(outlines[i]);
        blob = local_blob;
        ol_it.set_to_list(blob->out_list());
      } else {
        ol_it.add_before_stay_put(outlines[i]);
      }
    }
  }
  float c2;
  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
  ol_it.move_to_first();
  if (first_to_keep == nullptr) {
    // We created blob. Empty its outlines and delete it.
    // (The outlines are owned by the caller, so only extract, don't delete.)
    for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
    delete local_blob;
    cert = -c2;
  } else {
    // Remove the outlines that we put in.
    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
      ol_it.extract();
    }
  }
  return cert;
}
+
// Classifies the given blob (part of word_data->word->word) as an individual
// word, using languages, chopper etc, returning only the certainty of the
// best raw choice, and undoing all the work done to fake out the word.
// *c2 is set to certainty^2/rating (0 if there is no usable raw choice),
// and *best_str to the raw choice's text.
float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it,
                                    C_BLOB* blob, STRING* best_str, float* c2) {
  WERD* real_word = pr_it->word()->word;
  // Build a temporary single-blob word from a deep copy of the blob, and
  // insert it into the page as a clone of the current word.
  WERD* word = real_word->ConstructFromSingleBlob(
      real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
  // Get a new iterator that points to the new word.
  PAGE_RES_IT it(pr_it->page_res);
  while (it.word() != word_res && it.word() != nullptr) it.forward();
  ASSERT_HOST(it.word() == word_res);
  WordData wd(it);
  // Force full initialization.
  SetupWordPassN(1, &wd);
  classify_word_and_language(pass_n, &it, &wd);
  if (debug_noise_removal) {
    if (wd.word->raw_choice != nullptr) {
      tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
              wd.row->x_height(), wd.word->raw_choice->min_x_height(),
              wd.word->raw_choice->max_x_height());
    } else {
      tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
              wd.row->x_height());
    }
  }
  float cert = 0.0f;
  if (wd.word->raw_choice != nullptr) {  // This probably shouldn't happen, but...
    cert = wd.word->raw_choice->certainty();
    float rat = wd.word->raw_choice->rating();
    // Guard against division by a non-positive rating.
    *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
    *best_str = wd.word->raw_choice->unichar_string();
  } else {
    *c2 = 0.0f;
    *best_str = "";
  }
  // Undo the fakery: remove the temporary word from the page and re-sync
  // the caller's iterator.
  it.DeleteCurrentWord();
  pr_it->ResetWordIterator();
  return cert;
}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
// Generic function for classifying a word. Can be used either for pass1 or
// pass2 according to the function passed to recognizer.
// word_data holds the word to be recognized, and its block and row, and
// pr_it points to the word as well, in case we are running LSTM and it wants
// to output multiple words.
// Recognizes in the current language, and if successful that is all.
// If recognition was not successful, tries all available languages until
// it gets a successful result or runs out of languages. Keeps the best result.
void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
                                           WordData* word_data) {
#ifdef DISABLED_LEGACY_ENGINE
  WordRecognizer recognizer = &Tesseract::classify_word_pass1;
#else
  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
                                          : &Tesseract::classify_word_pass2;
#endif  // def DISABLED_LEGACY_ENGINE

  // Best result so far.
  PointerVector<WERD_RES> best_words;
  // Points to the best result. May be word or in lang_words.
  const WERD_RES* word = word_data->word;
  clock_t start_t = clock();
  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
  if (debug) {
    tprintf("%s word with lang %s at:",
            word->done ? "Already done" : "Processing",
            most_recently_used_->lang.c_str());
    word->word->bounding_box().print();
  }
  if (word->done) {
    // If done on pass1, leave it as-is.
    if (!word->tess_failed)
      most_recently_used_ = word->tesseract;
    return;
  }
  // sub is the index into word_data->lang_words for the language to try
  // first: sub_langs_.size() means the main language (this), otherwise it
  // is the index of most_recently_used_ in sub_langs_.
  int sub = sub_langs_.size();
  if (most_recently_used_ != this) {
    // Get the index of the most_recently_used_.
    for (sub = 0; sub < sub_langs_.size() &&
         most_recently_used_ != sub_langs_[sub]; ++sub) {}
  }
  most_recently_used_->RetryWithLanguage(
      *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
  Tesseract* best_lang_tess = most_recently_used_;
  if (!WordsAcceptable(best_words)) {
    // Try all the other languages to see if they are any better.
    if (most_recently_used_ != this &&
        this->RetryWithLanguage(*word_data, recognizer, debug,
                                &word_data->lang_words[sub_langs_.size()],
                                &best_words) > 0) {
      best_lang_tess = this;
    }
    for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
         ++i) {
      if (most_recently_used_ != sub_langs_[i] &&
          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
                                           &word_data->lang_words[i],
                                           &best_words) > 0) {
        best_lang_tess = sub_langs_[i];
      }
    }
  }
  most_recently_used_ = best_lang_tess;
  if (!best_words.empty()) {
    if (best_words.size() == 1 && !best_words[0]->combination) {
      // Move the best single result to the main word.
      word_data->word->ConsumeWordResults(best_words[0]);
    } else {
      // Words came from LSTM, and must be moved to the PAGE_RES properly.
      word_data->word = best_words.back();
      pr_it->ReplaceCurrentWord(&best_words);
    }
    ASSERT_HOST(word_data->word->box_word != nullptr);
  } else {
    tprintf("no best words!!\n");
  }
  clock_t ocr_t = clock();
  if (tessedit_timing_debug) {
    tprintf("%s (ocr took %.2f sec)\n",
            word_data->word->best_choice->unichar_string().c_str(),
            static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
  }
}
+
/**
 * classify_word_pass1
 *
 * Baseline normalize the word and pass it to Tess.
 * In LSTM and combined modes, runs the LSTM recognizer first and only falls
 * back to the legacy engine (combined mode) when LSTM fails or the word is
 * odd-sized.
 */

void Tesseract::classify_word_pass1(const WordData& word_data,
                                    WERD_RES** in_word,
                                    PointerVector<WERD_RES>* out_words) {
  ROW* row = word_data.row;
  BLOCK* block = word_data.block;
  prev_word_best_choice_ = word_data.prev_word != nullptr
      ? word_data.prev_word->word->best_choice : nullptr;
#ifdef DISABLED_LEGACY_ENGINE
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif  // def DISABLED_LEGACY_ENGINE
    // Odd-sized words are skipped here in combined mode, leaving them for
    // the legacy fallback below.
    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      LSTMRecognizeWord(*block, row, *in_word, out_words);
      if (!out_words->empty())
        return;  // Successful lstm recognition.
    }
    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      // No fallback allowed, so use a fake.
      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
      return;
    }

  #ifndef DISABLED_LEGACY_ENGINE
    // Fall back to tesseract for failed words or odd words.
    (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
                                    OEM_TESSERACT_ONLY, nullptr,
                                    classify_bln_numeric_mode,
                                    textord_use_cjk_fp_model,
                                    poly_allow_detailed_fx, row, block);
  #endif  // ndef DISABLED_LEGACY_ENGINE
  }

#ifndef DISABLED_LEGACY_ENGINE
  WERD_RES* word = *in_word;
  // Run the legacy classifier.
  match_word_pass_n(1, word, row, block);
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    word->tess_would_adapt = AdaptableWord(word);
    bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);

    if (adapt_ok) {
      // Send word to adaptive classifier for training.
      word->BestChoiceToCorrectText();
      LearnWord(nullptr, word);
      // Mark misadaptions if running blamer.
      if (word->blamer_bundle != nullptr) {
        word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
                                                 wordrec_debug_blamer);
      }
    }

    if (tessedit_enable_doc_dict && !word->IsAmbiguous())
      tess_add_doc_word(word->best_choice);
  }
#endif  // ndef DISABLED_LEGACY_ENGINE
}
+
+// Helper to report the result of the xheight fix.
+void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
+ WERD_RES* word, WERD_RES* new_word) {
+ tprintf("New XHT Match:%s = %s ",
+ word->best_choice->unichar_string().c_str(),
+ word->best_choice->debug_string().c_str());
+ word->reject_map.print(debug_fp);
+ tprintf(" -> %s = %s ",
+ new_word->best_choice->unichar_string().c_str(),
+ new_word->best_choice->debug_string().c_str());
+ new_word->reject_map.print(debug_fp);
+ tprintf(" %s->%s %s %s\n",
+ word->guessed_x_ht ? "GUESS" : "CERT",
+ new_word->guessed_x_ht ? "GUESS" : "CERT",
+ new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
+ accept_new_word ? "ACCEPTED" : "");
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
// Run the x-height fix-up, based on min/max top/bottom information in
// unicharset.
// Returns true if the word was changed.
// See the comment in fixxht.cpp for a description of the overall process.
bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
  int original_misfits = CountMisfitTops(word);
  if (original_misfits == 0)
    return false;
  float baseline_shift = 0.0f;
  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
  if (baseline_shift != 0.0f) {
    // Try the shift on its own first.
    if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
                              word, block, row))
      return false;
    // The shift was accepted, so the word has already changed; re-count the
    // misfits against the shifted baseline.
    original_misfits = CountMisfitTops(word);
    if (original_misfits > 0) {
      float new_baseline_shift;
      // Now recompute the new x_height.
      new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
      if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
        // No test of return value here, as we are definitely making a change
        // to the word by shifting the baseline.
        TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
                             word, block, row);
      }
    }
    return true;
  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
    // No baseline shift: just try the new x-height.
    return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
                                word, block, row);
  } else {
    return false;
  }
}
+
// Runs recognition with the test baseline shift and x-height and returns true
// if there was an improvement in recognition result.
// On acceptance, the trial result is moved into word.
bool Tesseract::TestNewNormalization(int original_misfits,
                                     float baseline_shift, float new_x_ht,
                                     WERD_RES *word, BLOCK* block, ROW *row) {
  bool accept_new_x_ht = false;
  // Build a trial copy of the word with the candidate normalization.
  WERD_RES new_x_ht_word(word->word);
  if (word->blamer_bundle != nullptr) {
    new_x_ht_word.blamer_bundle = new BlamerBundle();
    new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
  }
  new_x_ht_word.x_height = new_x_ht;
  new_x_ht_word.baseline_shift = baseline_shift;
  new_x_ht_word.caps_height = 0.0;
  new_x_ht_word.SetupForRecognition(
      unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
      classify_bln_numeric_mode, textord_use_cjk_fp_model,
      poly_allow_detailed_fx, row, block);
  match_word_pass_n(2, &new_x_ht_word, row, block);
  if (!new_x_ht_word.tess_failed) {
    int new_misfits = CountMisfitTops(&new_x_ht_word);
    if (debug_x_ht_level >= 1) {
      tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
              original_misfits, word->x_height,
              new_misfits, new_x_ht);
      tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
              word->best_choice->rating(), word->best_choice->certainty(),
              new_x_ht_word.best_choice->rating(),
              new_x_ht_word.best_choice->certainty());
    }
    // The misfits must improve and either the rating or certainty.
    accept_new_x_ht = new_misfits < original_misfits &&
                      (new_x_ht_word.best_choice->certainty() >
                           word->best_choice->certainty() ||
                       new_x_ht_word.best_choice->rating() <
                           word->best_choice->rating());
    if (debug_x_ht_level >= 1) {
      ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
    }
  }
  if (accept_new_x_ht) {
    // Adopt the trial result into the original word.
    word->ConsumeWordResults(&new_x_ht_word);
    return true;
  }
  return false;
}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
/**
 * classify_word_pass2
 *
 * Control what to do with the word in pass 2.
 * Re-matches undone words, applies sub/superscript and trained x-height
 * fixes, and optionally displays the result. No-op in LSTM-only mode.
 */

void Tesseract::classify_word_pass2(const WordData& word_data,
                                    WERD_RES** in_word,
                                    PointerVector<WERD_RES>* out_words) {
  // Return if we do not want to run Tesseract.
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
    return;
  }
#ifndef DISABLED_LEGACY_ENGINE
  ROW* row = word_data.row;
  BLOCK* block = word_data.block;
  WERD_RES* word = *in_word;
  prev_word_best_choice_ = word_data.prev_word != nullptr
      ? word_data.prev_word->word->best_choice : nullptr;

  check_debug_pt(word, 30);
  if (!word->done) {
    word->caps_height = 0.0;
    // Fall back to the row's x-height if the word has none of its own.
    if (word->x_height == 0.0f)
      word->x_height = row->x_height();
    match_word_pass_n(2, word, row, block);
    check_debug_pt(word, 40);
  }

  SubAndSuperscriptFix(word);

  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
        block->classify_rotation().y() == 0.0f) {
      // Use the tops and bottoms since they are available.
      TrainedXheightFix(word, block, row);
    }
  }
#ifndef GRAPHICS_DISABLED
  if (tessedit_display_outwords) {
    if (fx_win == nullptr)
      create_fx_win();
    clear_fx_win();
    word->rebuild_word->plot(fx_win);
    TBOX wbox = word->rebuild_word->bounding_box();
    fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
                            wbox.right(), wbox.bottom());
    ScrollView::Update();
  }
#endif
  check_debug_pt(word, 50);
#endif  // ndef DISABLED_LEGACY_ENGINE
}
+
+#ifndef DISABLED_LEGACY_ENGINE
/**
 * match_word_pass_n
 *
 * Baseline normalize the word and pass it to Tess.
 * (Header previously said "match_word_pass2"; this function serves both
 * passes — pass_n is forwarded to tess_segment_pass_n and make_reject_map.)
 */
void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
                                  ROW *row, BLOCK* block) {
  if (word->tess_failed) return;
  tess_segment_pass_n(pass_n, word);

  if (!word->tess_failed) {
    if (!word->word->flag (W_REP_CHAR)) {
      word->fix_quotes();
      if (tessedit_fix_hyphens)
        word->fix_hyphens();
      /* Don't trust fix_quotes! - though I think I've fixed the bug */
      if (word->best_choice->length() != word->box_word->length()) {
        tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
                " #Blobs=%d\n",
                word->best_choice->debug_string().c_str(),
                word->best_choice->length(),
                word->box_word->length());

      }
      word->tess_accepted = tess_acceptable_word(word);

      // Also sets word->done flag
      make_reject_map(word, row, pass_n);
    }
  }
  set_word_fonts(word);

  ASSERT_HOST(word->raw_choice != nullptr);
}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// Helper to return the best rated BLOB_CHOICE in the whole word that matches
+// the given char_id, or nullptr if none can be found.
+static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
+ WERD_RES* word_res) {
+ // Find the corresponding best BLOB_CHOICE from any position in the word_res.
+ BLOB_CHOICE* best_choice = nullptr;
+ for (int i = 0; i < word_res->best_choice->length(); ++i) {
+ BLOB_CHOICE* choice = FindMatchingChoice(char_id,
+ word_res->GetBlobChoices(i));
+ if (choice != nullptr) {
+ if (best_choice == nullptr || choice->rating() < best_choice->rating())
+ best_choice = choice;
+ }
+ }
+ return best_choice;
+}
+
+// Helper to insert blob_choice in each location in the leader word if there is
+// no matching BLOB_CHOICE there already, and correct any incorrect results
+// in the best_choice.
+static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
+ WERD_RES* word_res) {
+ WERD_CHOICE* word = word_res->best_choice;
+ for (int i = 0; i < word_res->best_choice->length(); ++i) {
+ BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
+ word_res->GetBlobChoices(i));
+ if (choice == nullptr) {
+ BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
+ choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
+ }
+ }
+ // Correct any incorrect results in word.
+ for (int i = 0; i < word->length(); ++i) {
+ if (word->unichar_id(i) != blob_choice->unichar_id())
+ word->set_unichar_id(blob_choice->unichar_id(), i);
+ }
+}
+
+/**
+ * fix_rep_char()
+ * The word is a repeated char. (Leader.) Find the repeated char character.
+ * Create the appropriate single-word or multi-word sequence according to
+ * the size of spaces in between blobs, and correct the classifications
+ * where some of the characters disagree with the majority.
+ */
+void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
+ WERD_RES *word_res = page_res_it->word();
+ const WERD_CHOICE &word = *(word_res->best_choice);
+
+ // Find the frequency of each unique character in the word.
+ SortHelper<UNICHAR_ID> rep_ch(word.length());
+ for (int i = 0; i < word.length(); ++i) {
+ rep_ch.Add(word.unichar_id(i), 1);
+ }
+
+ // Find the most frequent result.
+ UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
+ int max_count = rep_ch.MaxCount(&maxch_id);
+ // Find the best exemplar of a classifier result for maxch_id.
+ BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
+ if (best_choice == nullptr) {
+ tprintf("Failed to find a choice for %s, occurring %d times\n",
+ word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
+ return;
+ }
+ word_res->done = true;
+
+ // Measure the mean space.
+ int gap_count = 0;
+ WERD* werd = word_res->word;
+ C_BLOB_IT blob_it(werd->cblob_list());
+ C_BLOB* prev_blob = blob_it.data();
+ for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
+ C_BLOB* blob = blob_it.data();
+ int gap = blob->bounding_box().left();
+ gap -= prev_blob->bounding_box().right();
+ ++gap_count;
+ prev_blob = blob;
+ }
+ // Just correct existing classification.
+ CorrectRepcharChoices(best_choice, word_res);
+ word_res->reject_map.initialise(word.length());
+}
+
/*************************************************************************
 * acceptable_word_string()
 * Classify the UTF-8 string s as one of the ACCEPTABLE_WERD_TYPE shapes
 * (upper case, lower case, initial cap, abbreviation), or AC_UNACCEPTABLE
 * if it matches none of them.
 * Throughout, i indexes unichars while offset indexes bytes of s;
 * lengths[i] is the byte length of the i'th unichar, so both advance in
 * lockstep via "offset += lengths[i++]".
 *************************************************************************/
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
    const UNICHARSET& char_set, const char *s, const char *lengths) {
  int i = 0;       // unichar index
  int offset = 0;  // byte offset of unichar i within s
  int leading_punct_count;
  int upper_count = 0;
  int hyphen_pos = -1;
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

  // Over-long words (> 20 unichars) are never acceptable.
  if (strlen (lengths) > 20)
    return word_type;

  /* Single Leading punctuation char*/

  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
    offset += lengths[i++];
  leading_punct_count = i;

  /* Initial cap */
  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
    offset += lengths[i++];
    upper_count++;
  }
  if (upper_count > 1) {
    word_type = AC_UPPER_CASE;
  } else {
    /* Lower case word, possibly with an initial cap */
    while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
      offset += lengths[i++];
    }
    // Require a minimum number of alphas after any leading punctuation.
    if (i - leading_punct_count < quality_min_initial_alphas_reqd)
      goto not_a_word;
    /*
      Allow a single hyphen in a lower case word
      - don't trust upper case - I've seen several cases of "H" -> "I-I"
    */
    if (lengths[i] == 1 && s[offset] == '-') {
      hyphen_pos = i;
      offset += lengths[i++];
      if (s[offset] != '\0') {
        while ((s[offset] != '\0') &&
               char_set.get_islower(s + offset, lengths[i])) {
          offset += lengths[i++];
        }
        // At least 2 lower case chars must follow the hyphen.
        if (i < hyphen_pos + 3)
          goto not_a_word;
      }
    } else {
      /* Allow "'s" in NON hyphenated lower case words */
      if (lengths[i] == 1 && (s[offset] == '\'') &&
          lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (upper_count > 0)
      word_type = AC_INITIAL_CAP;
    else
      word_type = AC_LOWER_CASE;
  }

  /* Up to two different, constrained trailing punctuation chars */
  if (lengths[i] == 1 && s[offset] != '\0' &&
      STRING(chs_trailing_punct1).contains(s[offset]))
    offset += lengths[i++];
  // Second trailing punct must differ from the first (no doubled punct).
  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
      s[offset - lengths[i - 1]] != s[offset] &&
      STRING(chs_trailing_punct2).contains (s[offset]))
    offset += lengths[i++];

  // Anything left over invalidates the classification so far.
  if (s[offset] != '\0')
    word_type = AC_UNACCEPTABLE;

  not_a_word:

  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    // Rescan from the start for A.B.C. / a.b.c. style abbreviations.
    i = 0;
    offset = 0;
    if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
      word_type = AC_UC_ABBREV;
      while (s[offset] != '\0' &&
             char_set.get_isupper(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
      word_type = AC_LC_ABBREV;
      while (s[offset] != '\0' &&
             char_set.get_islower(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (s[offset] != '\0')
      word_type = AC_UNACCEPTABLE;
  }

  return word_type;
}
+
+bool Tesseract::check_debug_pt(WERD_RES* word, int location) {
+ bool show_map_detail = false;
+ int16_t i;
+
+ if (!test_pt)
+ return false;
+
+ tessedit_rejection_debug.set_value (false);
+ debug_x_ht_level.set_value(0);
+
+ if (word->word->bounding_box().contains(FCOORD (test_pt_x, test_pt_y))) {
+ if (location < 0)
+ return true; // For breakpoint use
+ tessedit_rejection_debug.set_value(true);
+ debug_x_ht_level.set_value(2);
+ tprintf ("\n\nTESTWD::");
+ switch (location) {
+ case 0:
+ tprintf ("classify_word_pass1 start\n");
+ word->word->print();
+ break;
+ case 10:
+ tprintf ("make_reject_map: initial map");
+ break;
+ case 20:
+ tprintf ("make_reject_map: after NN");
+ break;
+ case 30:
+ tprintf ("classify_word_pass2 - START");
+ break;
+ case 40:
+ tprintf ("classify_word_pass2 - Pre Xht");
+ break;
+ case 50:
+ tprintf ("classify_word_pass2 - END");
+ show_map_detail = true;
+ break;
+ case 60:
+ tprintf ("fixspace");
+ break;
+ case 70:
+ tprintf ("MM pass START");
+ break;
+ case 80:
+ tprintf ("MM pass END");
+ break;
+ case 90:
+ tprintf ("After Poor quality rejection");
+ break;
+ case 100:
+ tprintf ("unrej_good_quality_words - START");
+ break;
+ case 110:
+ tprintf ("unrej_good_quality_words - END");
+ break;
+ case 120:
+ tprintf ("Write results pass");
+ show_map_detail = true;
+ break;
+ }
+ if (word->best_choice != nullptr) {
+ tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
+ word->reject_map.print(debug_fp);
+ tprintf("\n");
+ if (show_map_detail) {
+ tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
+ for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
+ tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
+ word->reject_map[i].full_print(debug_fp);
+ }
+ }
+ } else {
+ tprintf("null best choice\n");
+ }
+ tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
+ tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
+ return true;
+ } else {
+ return false;
+ }
+}
+
+/**
+ * find_modal_font
+ *
+ * Find the modal font and remove from the stats.
+ */
+static void find_modal_font( // good chars in word
+ STATS* fonts, // font stats
+ int16_t* font_out, // output font
+ int8_t* font_count // output count
+) {
+ int16_t font; //font index
+ int32_t count; //pile count
+
+ if (fonts->get_total () > 0) {
+ font = static_cast<int16_t>(fonts->mode ());
+ *font_out = font;
+ count = fonts->pile_count (font);
+ *font_count = count < INT8_MAX ? count : INT8_MAX;
+ fonts->add (font, -*font_count);
+ }
+ else {
+ *font_out = -1;
+ *font_count = 0;
+ }
+}
+
+/**
+ * set_word_fonts
+ *
+ * Get the fonts for the word.
+ */
+void Tesseract::set_word_fonts(WERD_RES *word) {
+ // Don't try to set the word fonts for an lstm word, as the configs
+ // will be meaningless.
+ if (word->chopped_word == nullptr) return;
+ ASSERT_HOST(word->best_choice != nullptr);
+
+#ifndef DISABLED_LEGACY_ENGINE
+ const int fontinfo_size = get_fontinfo_table().size();
+ if (fontinfo_size == 0) return;
+ GenericVector<int> font_total_score;
+ font_total_score.init_to_size(fontinfo_size, 0);
+
+ // Compute the font scores for the word
+ if (tessedit_debug_fonts) {
+ tprintf("Examining fonts in %s\n",
+ word->best_choice->debug_string().c_str());
+ }
+ for (int b = 0; b < word->best_choice->length(); ++b) {
+ const BLOB_CHOICE* choice = word->GetBlobChoice(b);
+ if (choice == nullptr) continue;
+ auto &fonts = choice->fonts();
+ for (int f = 0; f < fonts.size(); ++f) {
+ const int fontinfo_id = fonts[f].fontinfo_id;
+ if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
+ font_total_score[fontinfo_id] += fonts[f].score;
+ }
+ }
+ }
+ // Find the top and 2nd choice for the word.
+ int score1 = 0, score2 = 0;
+ int16_t font_id1 = -1, font_id2 = -1;
+ for (int f = 0; f < fontinfo_size; ++f) {
+ if (tessedit_debug_fonts && font_total_score[f] > 0) {
+ tprintf("Font %s, total score = %d\n",
+ fontinfo_table_.get(f).name, font_total_score[f]);
+ }
+ if (font_total_score[f] > score1) {
+ score2 = score1;
+ font_id2 = font_id1;
+ score1 = font_total_score[f];
+ font_id1 = f;
+ } else if (font_total_score[f] > score2) {
+ score2 = font_total_score[f];
+ font_id2 = f;
+ }
+ }
+ word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : nullptr;
+ word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : nullptr;
+ // Each score has a limit of UINT16_MAX, so divide by that to get the number
+ // of "votes" for that font, ie number of perfect scores.
+ word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
+ word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
+ if (score1 > 0) {
+ const FontInfo fi = fontinfo_table_.get(font_id1);
+ if (tessedit_debug_fonts) {
+ if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
+ tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
+ fi.name, word->fontinfo_id_count,
+ fontinfo_table_.get(font_id2).name,
+ word->fontinfo_id2_count);
+ } else {
+ tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
+ fi.name, word->fontinfo_id_count);
+ }
+ }
+ }
+#endif // ndef DISABLED_LEGACY_ENGINE
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
/**
 * font_recognition_pass
 *
 * Smooth the fonts for the document: find the modal font over all words
 * and assign it to words whose own font evidence is weak.
 */
void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
  PAGE_RES_IT page_res_it(page_res);
  WERD_RES *word; // current word
  STATS doc_fonts(0, font_table_size_); // font counters

  // Pass 1: gather font id statistics, weighting each word's top two fonts
  // by the vote counts computed in set_word_fonts().
  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
       page_res_it.forward()) {
    word = page_res_it.word();
    if (word->fontinfo != nullptr) {
      doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
    }
    if (word->fontinfo2 != nullptr) {
      doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
    }
  }
  int16_t doc_font; // modal font
  int8_t doc_font_count; // modal font count
  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
  if (doc_font_count == 0)
    return;
  // Pass 2: get the modal font pointer. The stats only hold universal ids,
  // so find any word already labelled with the modal id and borrow its
  // FontInfo pointer.
  const FontInfo* modal_font = nullptr;
  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
       page_res_it.forward()) {
    word = page_res_it.word();
    if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
      modal_font = word->fontinfo;
      break;
    }
    if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
      modal_font = word->fontinfo2;
      break;
    }
  }
  // The modal id was contributed by some word above, so a match must exist.
  ASSERT_HOST(modal_font != nullptr);

  // Pass 3: assign modal font to weak words - those whose own font votes do
  // not cover the whole word (or at least 3/4 of a word longer than 3).
  for (page_res_it.restart_page(); page_res_it.word() != nullptr;
       page_res_it.forward()) {
    word = page_res_it.word();
    const int length = word->best_choice->length();

    const int count = word->fontinfo_id_count;
    if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
      word->fontinfo = modal_font;
      // Counts only get 1 as it came from the doc.
      word->fontinfo_id_count = 1;
    }
  }
}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// If a word has multiple alternates check if the best choice is in the
+// dictionary. If not, replace it with an alternate that exists in the
+// dictionary.
+void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
+ PAGE_RES_IT word_it(page_res);
+ for (WERD_RES* word = word_it.word(); word != nullptr;
+ word = word_it.forward()) {
+ if (word->best_choices.singleton())
+ continue; // There are no alternates.
+
+ const WERD_CHOICE* best = word->best_choice;
+ if (word->tesseract->getDict().valid_word(*best) != 0)
+ continue; // The best choice is in the dictionary.
+
+ WERD_CHOICE_IT choice_it(&word->best_choices);
+ for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
+ choice_it.forward()) {
+ WERD_CHOICE* alternate = choice_it.data();
+ if (word->tesseract->getDict().valid_word(*alternate)) {
+ // The alternate choice is in the dictionary.
+ if (tessedit_bigram_debug) {
+ tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
+ best->unichar_string().c_str(),
+ alternate->unichar_string().c_str());
+ }
+ // Replace the 'best' choice with a better choice.
+ word->ReplaceBestChoice(alternate);
+ break;
+ }
+ }
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/control.h b/tesseract/src/ccmain/control.h
new file mode 100644
index 00000000..cd57ddba
--- /dev/null
+++ b/tesseract/src/ccmain/control.h
@@ -0,0 +1,38 @@
+/**********************************************************************
+ * File: control.h (Formerly control.h)
+ * Description: Module-independent matcher controller.
+ * Author: Ray Smith
+ * Created: Thu Apr 23 11:09:58 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+/**
+ * @file control.h
+ * Module-independent matcher controller.
+ */
+
+#ifndef CONTROL_H
+#define CONTROL_H
+
// Classification returned by Tesseract::acceptable_word_string(),
// describing the case/abbreviation shape a word string matched
// (AC_UNACCEPTABLE when none matched).
enum ACCEPTABLE_WERD_TYPE
{
  AC_UNACCEPTABLE, ///< Unacceptable word
  AC_LOWER_CASE,   ///< ALL lower case
  AC_UPPER_CASE,   ///< ALL upper case
  AC_INITIAL_CAP,  ///< ALL but initial lc
  AC_LC_ABBREV,    ///< a.b.c.
  AC_UC_ABBREV     ///< A.B.C.
};
+
+#endif
diff --git a/tesseract/src/ccmain/docqual.cpp b/tesseract/src/ccmain/docqual.cpp
new file mode 100644
index 00000000..f74f9ead
--- /dev/null
+++ b/tesseract/src/ccmain/docqual.cpp
@@ -0,0 +1,981 @@
+/******************************************************************
+ * File: docqual.cpp (Formerly docqual.c)
+ * Description: Document Quality Metrics
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cctype>
+#include "docqual.h"
+#include "reject.h"
+#include "tessvars.h"
+#include "tesseractclass.h"
+
+namespace tesseract{
+
// Counting callback for ProcessMatchedBlobs: bumps the matched-blob total
// once per matched blob; the blob index itself is not needed.
static void countMatchingBlobs(int16_t& total_matched, int /*index*/) {
  total_matched += 1;
}
+
+static void countAcceptedBlobs(WERD_RES* word, int16_t& match_count,
+ int16_t& accepted_match_count, int index) {
+ if (word->reject_map[index].accepted()) {
+ ++accepted_match_count;
+ }
+ ++match_count;
+}
+
+static void acceptIfGoodQuality(WERD_RES* word, int index) {
+ if (word->reject_map[index].accept_if_good_quality()) {
+ word->reject_map[index].setrej_quality_accept();
+ }
+}
+
+/*************************************************************************
+ * word_blob_quality()
+ * How many blobs in the box_word are identical to those of the inword?
+ * ASSUME blobs in both initial word and box_word are in ascending order of
+ * left hand blob edge.
+ *************************************************************************/
+int16_t Tesseract::word_blob_quality(WERD_RES* word) {
+ int16_t match_count = 0;
+ if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+ !word->rebuild_word->blobs.empty()) {
+ using namespace std::placeholders; // for _1
+ word->bln_boxes->ProcessMatchedBlobs(
+ *word->rebuild_word,
+ std::bind(countMatchingBlobs, match_count, _1));
+ }
+ return match_count;
+}
+
+int16_t Tesseract::word_outline_errs(WERD_RES *word) {
+ int16_t i = 0;
+ int16_t err_count = 0;
+
+ if (word->rebuild_word != nullptr) {
+ for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
+ TBLOB* blob = word->rebuild_word->blobs[b];
+ err_count += count_outline_errs(word->best_choice->unichar_string()[i],
+ blob->NumOutlines());
+ i++;
+ }
+ }
+ return err_count;
+}
+
+/*************************************************************************
+ * word_char_quality()
+ * Combination of blob quality and outline quality - how many good chars are
+ * there? - I.e chars which pass the blob AND outline tests.
+ *************************************************************************/
+void Tesseract::word_char_quality(WERD_RES* word, int16_t* match_count,
+ int16_t* accepted_match_count) {
+ *match_count = 0;
+ *accepted_match_count = 0;
+ if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+ !word->rebuild_word->blobs.empty()) {
+ using namespace std::placeholders; // for _1
+ word->bln_boxes->ProcessMatchedBlobs(
+ *word->rebuild_word,
+ std::bind(countAcceptedBlobs,
+ word, *match_count, *accepted_match_count, _1));
+ }
+}
+
+/*************************************************************************
+ * unrej_good_chs()
+ * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
+ *************************************************************************/
+void Tesseract::unrej_good_chs(WERD_RES* word) {
+ if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
+ word->rebuild_word->blobs.empty()) {
+ using namespace std::placeholders; // for _1
+ word->bln_boxes->ProcessMatchedBlobs(
+ *word->rebuild_word, std::bind(acceptIfGoodQuality, word, _1));
+ }
+}
+
+int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
+ int expected_outline_count;
+
+ if (STRING (outlines_odd).contains (c))
+ return 0; // Don't use this char
+ else if (STRING (outlines_2).contains (c))
+ expected_outline_count = 2;
+ else
+ expected_outline_count = 1;
+ return abs (outline_count - expected_outline_count);
+}
+
+void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
+ bool good_quality_doc) {
+ if ((tessedit_good_quality_unrej && good_quality_doc))
+ unrej_good_quality_words(page_res_it);
+ doc_and_block_rejection(page_res_it, good_quality_doc);
+ if (unlv_tilde_crunching) {
+ tilde_crunch(page_res_it);
+ tilde_delete(page_res_it);
+ }
+}
+
/*************************************************************************
 * unrej_good_quality_words()
 * Accept potential rejects in words which pass the following checks:
 *    - Contains a potential reject
 *    - Word looks like a sensible alpha word.
 *    - Word segmentation is the same as the original image
 *    - All characters have the expected number of outlines
 * NOTE - the rejection counts are recalculated after unrejection
 *      - CAN'T do it in a single pass without a bit of fiddling
 *      - keep it simple but inefficient
 *************************************************************************/
void Tesseract::unrej_good_quality_words( //unreject potential
    PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  ROW_RES *current_row;
  BLOCK_RES *current_block;
  int i;

  // Pass 1: walk the words, unrejecting quality-recoverable rejects.
  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    check_debug_pt(page_res_it.word(), 100);
    if (bland_unrej) {
      // Unconditional mode: accept every recoverable reject in the word.
      word = page_res_it.word();
      for (i = 0; i < word->reject_map.length(); i++) {
        if (word->reject_map[i].accept_if_good_quality())
          word->reject_map[i].setrej_quality_accept();
      }
      page_res_it.forward();
    }
    else if ((page_res_it.row()->char_count > 0) &&
             ((page_res_it.row()->rej_count /
               static_cast<float>(page_res_it.row()->char_count)) <=
              quality_rowrej_pc)) {
      // Row's reject rate is low enough: unreject words that have
      // recoverable rejects and look like sensible text.
      word = page_res_it.word();
      if (word->reject_map.quality_recoverable_rejects() &&
          (tessedit_unrej_any_wd ||
           acceptable_word_string(*word->uch_set,
                                  word->best_choice->unichar_string().c_str(),
                                  word->best_choice->unichar_lengths().c_str())
               != AC_UNACCEPTABLE)) {
        unrej_good_chs(word);
      }
      page_res_it.forward();
    }
    else {
      // Skip to end of dodgy row.
      current_row = page_res_it.row();
      while ((page_res_it.word() != nullptr) &&
             (page_res_it.row() == current_row))
        page_res_it.forward();
    }
    check_debug_pt(page_res_it.word(), 110);
  }
  // Pass 2: recompute page/block/row rejection statistics, which the
  // unrejection above has invalidated.
  page_res_it.restart_page();
  page_res_it.page_res->char_count = 0;
  page_res_it.page_res->rej_count = 0;
  current_block = nullptr;
  current_row = nullptr;
  while (page_res_it.word() != nullptr) {
    if (current_block != page_res_it.block()) {
      current_block = page_res_it.block();
      current_block->char_count = 0;
      current_block->rej_count = 0;
    }
    if (current_row != page_res_it.row()) {
      current_row = page_res_it.row();
      current_row->char_count = 0;
      current_row->rej_count = 0;
      current_row->whole_word_rej_count = 0;
    }
    page_res_it.rej_stat_word();
    page_res_it.forward();
  }
}
+
+
/*************************************************************************
 * doc_and_block_rejection()
 *
 * If the page has too many rejects - reject all of it.
 * If any block has too many rejects - reject all words in the block
 * If any row has too many rejects (but few whole-word rejects) - reject
 * all words in the row.
 *************************************************************************/

void Tesseract::doc_and_block_rejection( //reject big chunks
    PAGE_RES_IT &page_res_it,
    bool good_quality_doc) {
  int16_t block_no = 0;
  int16_t row_no = 0;
  BLOCK_RES *current_block;
  ROW_RES *current_row;

  bool rej_word;
  bool prev_word_rejected;
  int16_t char_quality = 0;
  int16_t accepted_char_quality;

  // Page-level test: reject the whole page if the overall reject rate is
  // too high. NOTE(review): divides by page_res->char_count - assumes a
  // non-empty page; confirm callers guarantee this.
  if (page_res_it.page_res->rej_count * 100.0 /
      page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
    reject_whole_page(page_res_it);
    if (tessedit_debug_doc_rejection) {
      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
              page_res_it.page_res->char_count,
              page_res_it.page_res->rej_count);
    }
  } else {
    if (tessedit_debug_doc_rejection) {
      tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
              page_res_it.page_res->char_count,
              page_res_it.page_res->rej_count);
    }

    /* Walk blocks testing for block rejection */

    page_res_it.restart_page();
    WERD_RES* word;
    while ((word = page_res_it.word()) != nullptr) {
      current_block = page_res_it.block();
      block_no = current_block->block->pdblk.index();
      if (current_block->char_count > 0 &&
          (current_block->rej_count * 100.0 / current_block->char_count) >
          tessedit_reject_block_percent) {
        // Block reject rate too high: reject its words, optionally sparing
        // "perfect" or good-looking ones.
        if (tessedit_debug_block_rejection) {
          tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
                  block_no, current_block->char_count,
                  current_block->rej_count);
        }
        prev_word_rejected = false;
        while ((word = page_res_it.word()) != nullptr &&
               (page_res_it.block() == current_block)) {
          if (tessedit_preserve_blk_rej_perfect_wds) {
            // Spare fully-accepted words of sufficient length; a good
            // looking word may also be spared if every char passes the
            // blob/outline quality test.
            rej_word = word->reject_map.reject_count() > 0 ||
                word->reject_map.length() < tessedit_preserve_min_wd_len;
            if (rej_word && tessedit_dont_blkrej_good_wds &&
                word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                acceptable_word_string(
                    *word->uch_set,
                    word->best_choice->unichar_string().c_str(),
                    word->best_choice->unichar_lengths().c_str()) !=
                AC_UNACCEPTABLE) {
              word_char_quality(word, &char_quality, &accepted_char_quality);
              rej_word = char_quality != word->reject_map.length();
            }
          } else {
            rej_word = true;
          }
          if (rej_word) {
            /*
              Reject spacing if both current and prev words are rejected.
              NOTE - this is NOT restricted to FUZZY spaces. - When tried this
              generated more space errors.
            */
            if (tessedit_use_reject_spaces &&
                prev_word_rejected &&
                page_res_it.prev_row() == page_res_it.row() &&
                word->word->space() == 1)
              word->reject_spaces = true;
            word->reject_map.rej_word_block_rej();
          }
          prev_word_rejected = rej_word;
          page_res_it.forward();
        }
      } else {
        if (tessedit_debug_block_rejection) {
          tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
                  block_no, page_res_it.block()->char_count,
                  page_res_it.block()->rej_count);
        }

        /* Walk rows in block testing for row rejection */
        row_no = 0;
        while (page_res_it.word() != nullptr &&
               page_res_it.block() == current_block) {
          current_row = page_res_it.row();
          row_no++;
          /* Reject whole row if:
            fraction of chars on row which are rejected exceed a limit AND
            fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
            limit
          */
          if (current_row->char_count > 0 &&
              (current_row->rej_count * 100.0 / current_row->char_count) >
              tessedit_reject_row_percent &&
              (current_row->whole_word_rej_count * 100.0 /
               current_row->rej_count) <
              tessedit_whole_wd_rej_row_percent) {
            if (tessedit_debug_block_rejection) {
              tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
                      row_no, current_row->char_count,
                      current_row->rej_count);
            }
            prev_word_rejected = false;
            while ((word = page_res_it.word()) != nullptr &&
                   page_res_it.row() == current_row) {
              /* Preserve words on good docs unless they are mostly rejected*/
              if (!tessedit_row_rej_good_docs && good_quality_doc) {
                rej_word = word->reject_map.reject_count() /
                    static_cast<float>(word->reject_map.length()) >
                    tessedit_good_doc_still_rowrej_wd;
              } else if (tessedit_preserve_row_rej_perfect_wds) {
                /* Preserve perfect words anyway */
                rej_word = word->reject_map.reject_count() > 0 ||
                    word->reject_map.length() < tessedit_preserve_min_wd_len;
                if (rej_word && tessedit_dont_rowrej_good_wds &&
                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                    acceptable_word_string(*word->uch_set,
                        word->best_choice->unichar_string().c_str(),
                        word->best_choice->unichar_lengths().c_str()) !=
                    AC_UNACCEPTABLE) {
                  word_char_quality(word, &char_quality,
                                    &accepted_char_quality);
                  rej_word = char_quality != word->reject_map.length();
                }
              } else {
                rej_word = true;
              }
              if (rej_word) {
                /*
                  Reject spacing if both current and prev words are rejected.
                  NOTE - this is NOT restricted to FUZZY spaces. - When tried
                  this generated more space errors.
                */
                if (tessedit_use_reject_spaces &&
                    prev_word_rejected &&
                    page_res_it.prev_row() == page_res_it.row() &&
                    word->word->space() == 1)
                  word->reject_spaces = true;
                word->reject_map.rej_word_row_rej();
              }
              prev_word_rejected = rej_word;
              page_res_it.forward();
            }
          } else {
            if (tessedit_debug_block_rejection) {
              tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
                      row_no, current_row->char_count, current_row->rej_count);
            }
            // Row is fine: just skip past it.
            while (page_res_it.word() != nullptr &&
                   page_res_it.row() == current_row)
              page_res_it.forward();
          }
        }
      }
    }
  }
}
+
+/*************************************************************************
+ * reject_whole_page()
+ * Don't believe any of it - set the reject map to 00..00 in all words
+ *
+ *************************************************************************/
+
+void reject_whole_page(PAGE_RES_IT &page_res_it) {
+ page_res_it.restart_page ();
+ while (page_res_it.word () != nullptr) {
+ page_res_it.word ()->reject_map.rej_word_doc_rej ();
+ page_res_it.forward ();
+ }
+ //whole page is rejected
+ page_res_it.page_res->rejected = true;
+}
+
// Marks garbage words for crunching (UNLV tilde output). Terrible words are
// crunched immediately; runs of merely "potential" garbage words are
// remembered (via copy_it) and crunched retroactively if a terrible word
// terminates the run. Any word with accepted characters resets the state.
void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  GARBAGE_LEVEL garbage_level;
  PAGE_RES_IT copy_it;  // start of the current run of potential crunches
  bool prev_potential_marked = false;
  bool found_terrible_word = false;
  bool ok_dict_word;

  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    // Non-text blocks are never crunched.
    POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
    if (pb != nullptr && !pb->IsText()) {
      page_res_it.forward();
      continue;
    }
    word = page_res_it.word();

    if (crunch_early_convert_bad_unlv_chs)
      convert_bad_unlv_chs(word);

    if (crunch_early_merge_tess_fails)
      word->merge_tess_fails();

    if (word->reject_map.accept_count() != 0) {
      // Word has accepted chars: it breaks any run of garbage.
      found_terrible_word = false;
      //Forget earlier potential crunches
      prev_potential_marked = false;
    }
    else {
      ok_dict_word = safe_dict_word(word);
      garbage_level = garbage_word(word, ok_dict_word);

      if ((garbage_level != G_NEVER_CRUNCH) &&
          (terrible_word_crunch(word, garbage_level))) {
        // Definitely crunch this word, and retroactively crunch any
        // pending run of potential-crunch words leading up to it.
        if (crunch_debug > 0) {
          tprintf("T CRUNCHING: \"%s\"\n",
                  word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
        if (prev_potential_marked) {
          while (copy_it.word() != word) {
            if (crunch_debug > 0) {
              tprintf("P1 CRUNCHING: \"%s\"\n",
                      copy_it.word()->best_choice->unichar_string().c_str());
            }
            copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
            copy_it.forward();
          }
          prev_potential_marked = false;
        }
        found_terrible_word = true;
      }
      else if ((garbage_level != G_NEVER_CRUNCH) &&
               (potential_word_crunch(word,
                                      garbage_level, ok_dict_word))) {
        if (found_terrible_word) {
          // Potential garbage following a terrible word: crunch it too.
          if (crunch_debug > 0) {
            tprintf("P2 CRUNCHING: \"%s\"\n",
                    word->best_choice->unichar_string().c_str());
          }
          word->unlv_crunch_mode = CR_KEEP_SPACE;
        }
        else if (!prev_potential_marked) {
          // Start of a potential run: remember where it begins.
          copy_it = page_res_it;
          prev_potential_marked = true;
          if (crunch_debug > 1) {
            tprintf("P3 CRUNCHING: \"%s\"\n",
                    word->best_choice->unichar_string().c_str());
          }
        }
      }
      else {
        found_terrible_word = false;
        //Forget earlier potential crunches
        prev_potential_marked = false;
        if (crunch_debug > 2) {
          tprintf("NO CRUNCH: \"%s\"\n",
                  word->best_choice->unichar_string().c_str());
        }
      }
    }
    page_res_it.forward();
  }
}
+
+
+bool Tesseract::terrible_word_crunch(WERD_RES* word,
+ GARBAGE_LEVEL garbage_level) {
+ float rating_per_ch;
+ int adjusted_len;
+ int crunch_mode = 0;
+
+ if ((word->best_choice->unichar_string().length() == 0) ||
+ (strspn(word->best_choice->unichar_string().c_str(), " ") ==
+ word->best_choice->unichar_string().unsigned_size()))
+ crunch_mode = 1;
+ else {
+ adjusted_len = word->reject_map.length ();
+ if (adjusted_len > crunch_rating_max)
+ adjusted_len = crunch_rating_max;
+ rating_per_ch = word->best_choice->rating () / adjusted_len;
+
+ if (rating_per_ch > crunch_terrible_rating)
+ crunch_mode = 2;
+ else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
+ crunch_mode = 3;
+ else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
+ (garbage_level != G_OK))
+ crunch_mode = 4;
+ else if ((rating_per_ch > crunch_poor_garbage_rate) &&
+ (garbage_level != G_OK))
+ crunch_mode = 5;
+ }
+ if (crunch_mode > 0) {
+ if (crunch_debug > 2) {
+ tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
+ crunch_mode, word->best_choice->unichar_string().c_str());
+ }
+ return true;
+ }
+ else
+ return false;
+}
+
// Returns true when enough independent "poor quality" indicators fire on
// the word to make it a potential crunch candidate.  The indicators are:
//   1. average rating per character above crunch_pot_poor_rate;
//   2. certainty below crunch_pot_poor_cert (counted only when the word is
//      "crunchable", i.e. not protected as an acceptable/dictionary word);
//   3. the garbage classifier returned anything other than G_OK.
// The word qualifies when at least crunch_pot_indicators indicators fire.
bool Tesseract::potential_word_crunch(WERD_RES* word,
                                      GARBAGE_LEVEL garbage_level,
                                      bool ok_dict_word) {
  float rating_per_ch;
  int adjusted_len;
  const char *str = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();
  bool word_crunchable;
  int poor_indicator_count = 0;

  // The certainty indicator is suppressed for words that are protected:
  // accept-string leaving enabled AND the word is long enough AND it is
  // either an acceptable string or a good dictionary word.
  word_crunchable = !crunch_leave_accept_strings ||
                    word->reject_map.length() < 3 ||
                    (acceptable_word_string(*word->uch_set,
                                            str, lengths) == AC_UNACCEPTABLE &&
                     !ok_dict_word);

  // Cap the divisor at 10 so long words don't dilute the per-char rating.
  // NOTE(review): assumes reject_map is non-empty here, otherwise the
  // division below is by zero — TODO confirm callers guarantee this.
  adjusted_len = word->reject_map.length();
  if (adjusted_len > 10)
    adjusted_len = 10;
  rating_per_ch = word->best_choice->rating() / adjusted_len;

  if (rating_per_ch > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
      tprintf("Potential poor rating on \"%s\"\n",
              word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }

  if (word_crunchable &&
      word->best_choice->certainty() < crunch_pot_poor_cert) {
    if (crunch_debug > 2) {
      tprintf("Potential poor cert on \"%s\"\n",
              word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }

  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
      tprintf("Potential garbage on \"%s\"\n",
              word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }
  return poor_indicator_count >= crunch_pot_indicators;
}
+
// Walks the whole page and marks deletable ("tilde crunch") words by setting
// their unlv_crunch_mode.  Deletion only happens at line edges: a deletable
// word at (or continuing from) the beginning of a line is deleted
// immediately; a deletable word at the end of a line retroactively deletes
// the run of deletable words recorded since the last non-deletable word.
// Deletable words in mid-line that never reach an EOL word are left alone.
void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  PAGE_RES_IT copy_it;                 // Start of a pending mid-line run.
  bool deleting_from_bol = false;
  bool marked_delete_point = false;
  int16_t debug_delete_mode;           // Reason code, for debug output only.
  CRUNCH_MODE delete_mode;
  int16_t x_debug_delete_mode;
  CRUNCH_MODE x_delete_mode;

  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    word = page_res_it.word();

    delete_mode = word_deletable (word, debug_delete_mode);
    if (delete_mode != CR_NONE) {
      if (word->word->flag (W_BOL) || deleting_from_bol) {
        // Deletable word starting, or continuing, a run from line start.
        if (crunch_debug > 0) {
          tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
                   debug_delete_mode,
                   word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = delete_mode;
        deleting_from_bol = true;
      } else if (word->word->flag(W_EOL)) {
        if (marked_delete_point) {
          // Retroactively crunch the pending run up to (not including) this
          // word; each run member is re-evaluated for its own mode.
          while (copy_it.word() != word) {
            x_delete_mode = word_deletable (copy_it.word (),
                                            x_debug_delete_mode);
            if (crunch_debug > 0) {
              tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
                       x_debug_delete_mode,
                       copy_it.word()->best_choice->unichar_string().c_str());
            }
            copy_it.word ()->unlv_crunch_mode = x_delete_mode;
            copy_it.forward ();
          }
        }
        if (crunch_debug > 0) {
          tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
                   debug_delete_mode,
                   word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = delete_mode;
        deleting_from_bol = false;
        marked_delete_point = false;
      }
      else {
        // Deletable word in mid-line: remember where the run starts in case
        // it extends to the end of the line.
        if (!marked_delete_point) {
          copy_it = page_res_it;
          marked_delete_point = true;
        }
      }
    }
    else {
      deleting_from_bol = false;
      //Forget earlier potential crunches
      marked_delete_point = false;
    }
    /*
      The following step has been left till now as the tess fails are used to
      determine if the word is deletable.
    */
    if (!crunch_early_merge_tess_fails)
      word->merge_tess_fails();
    page_res_it.forward ();
  }
}
+
+
+void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
+ int i;
+ UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
+ UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
+ UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
+ UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
+ for (i = 0; i < word_res->reject_map.length(); ++i) {
+ if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
+ word_res->best_choice->set_unichar_id(unichar_dash, i);
+ if (word_res->reject_map[i].accepted ())
+ word_res->reject_map[i].setrej_unlv_rej ();
+ }
+ if (word_res->best_choice->unichar_id(i) == unichar_pow) {
+ word_res->best_choice->set_unichar_id(unichar_space, i);
+ if (word_res->reject_map[i].accepted ())
+ word_res->reject_map[i].setrej_unlv_rej ();
+ }
+ }
+}
+
+GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
+ enum STATES
+ {
+ JUNK,
+ FIRST_UPPER,
+ FIRST_LOWER,
+ FIRST_NUM,
+ SUBSEQUENT_UPPER,
+ SUBSEQUENT_LOWER,
+ SUBSEQUENT_NUM
+ };
+ const char *str = word->best_choice->unichar_string().c_str();
+ const char *lengths = word->best_choice->unichar_lengths().c_str();
+ STATES state = JUNK;
+ int len = 0;
+ int isolated_digits = 0;
+ int isolated_alphas = 0;
+ int bad_char_count = 0;
+ int tess_rejs = 0;
+ int dodgy_chars = 0;
+ int ok_chars;
+ UNICHAR_ID last_char = -1;
+ int alpha_repetition_count = 0;
+ int longest_alpha_repetition_count = 0;
+ int longest_lower_run_len = 0;
+ int lower_string_count = 0;
+ int longest_upper_run_len = 0;
+ int upper_string_count = 0;
+ int total_alpha_count = 0;
+ int total_digit_count = 0;
+
+ for (; *str != '\0'; str += *(lengths++)) {
+ len++;
+ if (word->uch_set->get_isupper (str, *lengths)) {
+ total_alpha_count++;
+ switch (state) {
+ case SUBSEQUENT_UPPER:
+ case FIRST_UPPER:
+ state = SUBSEQUENT_UPPER;
+ upper_string_count++;
+ if (longest_upper_run_len < upper_string_count)
+ longest_upper_run_len = upper_string_count;
+ if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
+ alpha_repetition_count++;
+ if (longest_alpha_repetition_count < alpha_repetition_count) {
+ longest_alpha_repetition_count = alpha_repetition_count;
+ }
+ }
+ else {
+ last_char = word->uch_set->unichar_to_id(str, *lengths);
+ alpha_repetition_count = 1;
+ }
+ break;
+ case FIRST_NUM:
+ isolated_digits++;
+ // Fall through.
+ default:
+ state = FIRST_UPPER;
+ last_char = word->uch_set->unichar_to_id(str, *lengths);
+ alpha_repetition_count = 1;
+ upper_string_count = 1;
+ break;
+ }
+ }
+ else if (word->uch_set->get_islower (str, *lengths)) {
+ total_alpha_count++;
+ switch (state) {
+ case SUBSEQUENT_LOWER:
+ case FIRST_LOWER:
+ state = SUBSEQUENT_LOWER;
+ lower_string_count++;
+ if (longest_lower_run_len < lower_string_count)
+ longest_lower_run_len = lower_string_count;
+ if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
+ alpha_repetition_count++;
+ if (longest_alpha_repetition_count < alpha_repetition_count) {
+ longest_alpha_repetition_count = alpha_repetition_count;
+ }
+ }
+ else {
+ last_char = word->uch_set->unichar_to_id(str, *lengths);
+ alpha_repetition_count = 1;
+ }
+ break;
+ case FIRST_NUM:
+ isolated_digits++;
+ // Fall through.
+ default:
+ state = FIRST_LOWER;
+ last_char = word->uch_set->unichar_to_id(str, *lengths);
+ alpha_repetition_count = 1;
+ lower_string_count = 1;
+ break;
+ }
+ }
+ else if (word->uch_set->get_isdigit (str, *lengths)) {
+ total_digit_count++;
+ switch (state) {
+ case FIRST_NUM:
+ state = SUBSEQUENT_NUM;
+ case SUBSEQUENT_NUM:
+ break;
+ case FIRST_UPPER:
+ case FIRST_LOWER:
+ isolated_alphas++;
+ // Fall through.
+ default:
+ state = FIRST_NUM;
+ break;
+ }
+ }
+ else {
+ if (*lengths == 1 && *str == ' ')
+ tess_rejs++;
+ else
+ bad_char_count++;
+ switch (state) {
+ case FIRST_NUM:
+ isolated_digits++;
+ break;
+ case FIRST_UPPER:
+ case FIRST_LOWER:
+ isolated_alphas++;
+ default:
+ break;
+ }
+ state = JUNK;
+ }
+ }
+
+ switch (state) {
+ case FIRST_NUM:
+ isolated_digits++;
+ break;
+ case FIRST_UPPER:
+ case FIRST_LOWER:
+ isolated_alphas++;
+ default:
+ break;
+ }
+
+ if (crunch_include_numerals) {
+ total_alpha_count += total_digit_count - isolated_digits;
+ }
+
+ if (crunch_leave_ok_strings && len >= 4 &&
+ 2 * (total_alpha_count - isolated_alphas) > len &&
+ longest_alpha_repetition_count < crunch_long_repetitions) {
+ if ((crunch_accept_ok &&
+ acceptable_word_string(*word->uch_set, str, lengths) !=
+ AC_UNACCEPTABLE) ||
+ longest_lower_run_len > crunch_leave_lc_strings ||
+ longest_upper_run_len > crunch_leave_uc_strings)
+ return G_NEVER_CRUNCH;
+ }
+ if (word->reject_map.length() > 1 &&
+ strpbrk(str, " ") == nullptr &&
+ (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+ word->best_choice->permuter() == FREQ_DAWG_PERM ||
+ word->best_choice->permuter() == USER_DAWG_PERM ||
+ word->best_choice->permuter() == NUMBER_PERM ||
+ acceptable_word_string(*word->uch_set, str, lengths) !=
+ AC_UNACCEPTABLE || ok_dict_word))
+ return G_OK;
+
+ ok_chars = len - bad_char_count - isolated_digits -
+ isolated_alphas - tess_rejs;
+
+ if (crunch_debug > 3) {
+ tprintf("garbage_word: \"%s\"\n",
+ word->best_choice->unichar_string().c_str());
+ tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
+ len,
+ bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
+ }
+ if (bad_char_count == 0 &&
+ tess_rejs == 0 &&
+ (len > isolated_digits + isolated_alphas || len <= 2))
+ return G_OK;
+
+ if (tess_rejs > ok_chars ||
+ (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
+ return G_TERRIBLE;
+
+ if (len > 4) {
+ dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
+ isolated_alphas;
+ if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
+ return G_DODGY;
+ else
+ return G_OK;
+ } else {
+ dodgy_chars = 2 * tess_rejs + bad_char_count;
+ if ((len == 4 && dodgy_chars > 2) ||
+ (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
+ return G_DODGY;
+ else
+ return G_OK;
+ }
+}
+
+
+/*************************************************************************
+ * word_deletable()
+ * DELETE WERDS AT ENDS OF ROWS IF
+ * Word is crunched &&
+ * ( string length = 0 OR
+ * > 50% of chars are "|" (before merging) OR
+ * certainty < -10 OR
+ * rating /char > 60 OR
+ * TOP of word is more than 0.5 xht BELOW baseline OR
+ * BOTTOM of word is more than 0.5 xht ABOVE xht OR
+ * length of word < 3xht OR
+ * height of word < 0.7 xht OR
+ * height of word > 3.0 xht OR
+ * >75% of the outline BBs have longest dimension < 0.5xht
+ *************************************************************************/
+
// Decides whether a (crunched) word should be deleted or treated as loose
// space.  Sets delete_mode to a reason code (for debug output) and returns
// the crunch mode.  Words not already crunched (CR_NONE) are left alone.
CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
  int word_len = word->reject_map.length ();
  float rating_per_ch;
  TBOX box;                      //BB of word
  // NOTE(review): if rebuild_word is nullptr, box stays default-constructed
  // yet is still used by the geometry tests below (modes 9-11, 3) — confirm
  // that is intended for the Cube path.

  if (word->unlv_crunch_mode == CR_NONE) {
    delete_mode = 0;
    return CR_NONE;
  }

  if (word_len == 0) {
    delete_mode = 1;
    return CR_DELETE;
  }

  if (word->rebuild_word != nullptr) {
    // Cube leaves rebuild_word nullptr.
    box = word->rebuild_word->bounding_box();
    // Word too short to be real text.
    if (box.height () < crunch_del_min_ht * kBlnXHeight) {
      delete_mode = 4;
      return CR_DELETE;
    }

    // Mostly tiny outlines: probably noise.
    if (noise_outlines(word->rebuild_word)) {
      delete_mode = 5;
      return CR_DELETE;
    }
  }

  // More than ~2/3 of the characters are Tesseract failures.
  if ((failure_count (word) * 1.5) > word_len) {
    delete_mode = 2;
    return CR_LOOSE_SPACE;
  }

  if (word->best_choice->certainty () < crunch_del_cert) {
    delete_mode = 7;
    return CR_LOOSE_SPACE;
  }

  rating_per_ch = word->best_choice->rating () / word_len;

  if (rating_per_ch > crunch_del_rating) {
    delete_mode = 8;
    return CR_LOOSE_SPACE;
  }

  // Word sits too far below the baseline.
  if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    delete_mode = 9;
    return CR_LOOSE_SPACE;
  }

  // Word sits too far above the x-height.
  if (box.bottom () >
      kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    delete_mode = 10;
    return CR_LOOSE_SPACE;
  }

  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
    delete_mode = 11;
    return CR_LOOSE_SPACE;
  }

  if (box.width () < crunch_del_min_width * kBlnXHeight) {
    delete_mode = 3;
    return CR_LOOSE_SPACE;
  }

  delete_mode = 0;
  return CR_NONE;
}
+
+int16_t Tesseract::failure_count(WERD_RES *word) {
+ const char *str = word->best_choice->unichar_string().c_str();
+ int tess_rejs = 0;
+
+ for (; *str != '\0'; str++) {
+ if (*str == ' ')
+ tess_rejs++;
+ }
+ return tess_rejs;
+}
+
+
+bool Tesseract::noise_outlines(TWERD* word) {
+ TBOX box; // BB of outline
+ int16_t outline_count = 0;
+ int16_t small_outline_count = 0;
+ int16_t max_dimension;
+ float small_limit = kBlnXHeight * crunch_small_outlines_size;
+
+ for (int b = 0; b < word->NumBlobs(); ++b) {
+ TBLOB* blob = word->blobs[b];
+ for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
+ outline_count++;
+ box = ol->bounding_box();
+ if (box.height() > box.width())
+ max_dimension = box.height();
+ else
+ max_dimension = box.width();
+ if (max_dimension < small_limit)
+ small_outline_count++;
+ }
+ }
+ return small_outline_count >= outline_count;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/docqual.h b/tesseract/src/ccmain/docqual.h
new file mode 100644
index 00000000..57fa9aeb
--- /dev/null
+++ b/tesseract/src/ccmain/docqual.h
@@ -0,0 +1,43 @@
+/******************************************************************
+ * File: docqual.h (Formerly docqual.h)
+ * Description: Document Quality Metrics
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
#ifndef DOCQUAL_H
#define DOCQUAL_H

#include <cstdint> // for int16_t

namespace tesseract {

class PAGE_RES_IT;
class ROW;
class WERD_RES;

// Coarse classification of how garbled a recognized word looks, as computed
// by Tesseract::garbage_word() in docqual.cpp, ordered best to worst.
enum GARBAGE_LEVEL
{
  G_NEVER_CRUNCH,  // Word looks good: must never be crunched.
  G_OK,            // Word appears acceptable.
  G_DODGY,         // Word is suspicious.
  G_TERRIBLE       // Word is almost certainly garbage.
};

// Free functions implemented in docqual.cpp.
int16_t word_blob_quality(WERD_RES* word);
void reject_whole_page(PAGE_RES_IT &page_res_it);

} // namespace tesseract

#endif
diff --git a/tesseract/src/ccmain/equationdetect.cpp b/tesseract/src/ccmain/equationdetect.cpp
new file mode 100644
index 00000000..518468b4
--- /dev/null
+++ b/tesseract/src/ccmain/equationdetect.cpp
@@ -0,0 +1,1516 @@
+///////////////////////////////////////////////////////////////////////
+// File: equationdetect.cpp
+// Description: Helper classes to detect equations.
+// Author: Zongyi (Joe) Liu (joeliu@google.com)
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "equationdetect.h"
+
+#include "bbgrid.h"
+#include "classify.h"
+#include "colpartition.h"
+#include "colpartitiongrid.h"
+#include "colpartitionset.h"
+#include "ratngs.h"
+#include "tesseractclass.h"
+
+#include "helpers.h"
+
+#include <algorithm>
+#include <cfloat>
+#include <limits>
+#include <memory>
+
+namespace tesseract {
+
+// Config variables.
+static BOOL_VAR(equationdetect_save_bi_image, false, "Save input bi image");
+static BOOL_VAR(equationdetect_save_spt_image, false, "Save special character image");
+static BOOL_VAR(equationdetect_save_seed_image, false, "Save the seed image");
+static BOOL_VAR(equationdetect_save_merged_image, false, "Save the merged image");
+
+///////////////////////////////////////////////////////////////////////////
+// Utility ColParition sort functions.
+///////////////////////////////////////////////////////////////////////////
+static int SortCPByTopReverse(const void* p1, const void* p2) {
+ const ColPartition* cp1 = *static_cast<ColPartition* const*>(p1);
+ const ColPartition* cp2 = *static_cast<ColPartition* const*>(p2);
+ ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);
+ const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
+ return box2.top() - box1.top();
+}
+
+static int SortCPByBottom(const void* p1, const void* p2) {
+ const ColPartition* cp1 = *static_cast<ColPartition* const*>(p1);
+ const ColPartition* cp2 = *static_cast<ColPartition* const*>(p2);
+ ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);
+ const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
+ return box1.bottom() - box2.bottom();
+}
+
+static int SortCPByHeight(const void* p1, const void* p2) {
+ const ColPartition* cp1 = *static_cast<ColPartition* const*>(p1);
+ const ColPartition* cp2 = *static_cast<ColPartition* const*>(p2);
+ ASSERT_HOST(cp1 != nullptr && cp2 != nullptr);
+ const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
+ return box1.height() - box2.height();
+}
+
// TODO(joeliu): we may want to parameterize these constants.
// Density/count thresholds used by the equation seed detection below
// (see IdentifySeedParts and the CheckSeed* helpers).
const float kMathDigitDensityTh1 = 0.25;
const float kMathDigitDensityTh2 = 0.1;
const float kMathItalicDensityTh = 0.5;
const float kUnclearDensityTh = 0.25;
const int kSeedBlobsCountTh = 10;
const int kLeftIndentAlignmentCountTh = 1;
+
+// Returns true if PolyBlockType is of text type or equation type.
+inline bool IsTextOrEquationType(PolyBlockType type) {
+ return PTIsTextType(type) || type == PT_EQUATION;
+}
+
+inline bool IsLeftIndented(const EquationDetect::IndentType type) {
+ return type == EquationDetect::LEFT_INDENT ||
+ type == EquationDetect::BOTH_INDENT;
+}
+
+inline bool IsRightIndented(const EquationDetect::IndentType type) {
+ return type == EquationDetect::RIGHT_INDENT ||
+ type == EquationDetect::BOTH_INDENT;
+}
+
// Constructs the detector and loads the equation traineddata ("equ" by
// default) from equ_datapath.  A load failure only produces a warning;
// lang_tesseract_ must be supplied later via SetLangTesseract().
EquationDetect::EquationDetect(const char* equ_datapath,
                               const char* equ_name) {
  const char* default_name = "equ";
  if (equ_name == nullptr) {
    equ_name = default_name;
  }
  lang_tesseract_ = nullptr;
  resolution_ = 0;
  page_count_ = 0;

  if (equ_tesseract_.init_tesseract(equ_datapath, equ_name,
                                    OEM_TESSERACT_ONLY)) {
    tprintf("Warning: equation region detection requested,"
            " but %s failed to load from %s\n", equ_name, equ_datapath);
  }

  cps_super_bbox_ = nullptr;
}
+
+EquationDetect::~EquationDetect() { delete (cps_super_bbox_); }
+
// Stores the language-specific Tesseract instance (not owned; the destructor
// does not free it).  Must be set before FindEquationParts() is called.
void EquationDetect::SetLangTesseract(Tesseract* lang_tesseract) {
  lang_tesseract_ = lang_tesseract;
}
+
// Records the source image resolution used by partition-type computations.
void EquationDetect::SetResolution(const int resolution) {
  resolution_ = resolution;
}
+
// Resets the special text type of every blob (both the normal and the large
// blob lists) in to_block to BSTT_NONE.  Returns 0 on success, -1 if
// to_block is nullptr.
int EquationDetect::LabelSpecialText(TO_BLOCK* to_block) {
  if (to_block == nullptr) {
    tprintf("Warning: input to_block is nullptr!\n");
    return -1;
  }

  GenericVector<BLOBNBOX_LIST*> blob_lists;
  blob_lists.push_back(&(to_block->blobs));
  blob_lists.push_back(&(to_block->large_blobs));
  for (int i = 0; i < blob_lists.size(); ++i) {
    BLOBNBOX_IT bbox_it(blob_lists[i]);
    for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
         bbox_it.forward()) {
      bbox_it.data()->set_special_text_type(BSTT_NONE);
    }
  }

  return 0;
}
+
// Classifies a single blob's special text type by running both the equation
// classifier and the language classifier on a normalized copy and comparing
// their best certainties.  Blobs shorter than height_th are set to BSTT_NONE
// without classification.
void EquationDetect::IdentifySpecialText(
    BLOBNBOX *blobnbox, const int height_th) {
  ASSERT_HOST(blobnbox != nullptr);
  if (blobnbox->bounding_box().height() < height_th && height_th > 0) {
    // For small blob, we simply set to BSTT_NONE.
    blobnbox->set_special_text_type(BSTT_NONE);
    return;
  }

  BLOB_CHOICE_LIST ratings_equ, ratings_lang;
  C_BLOB* blob = blobnbox->cblob();
  // TODO(joeliu/rays) Fix this. We may have to normalize separately for
  // each classifier here, as they may require different PolygonalCopy.
  TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
  const TBOX& box = tblob->bounding_box();

  // Normalize the blob. Set the origin to the place we want to be the
  // bottom-middle, and scaling is to make the height the x-height.
  const float scaling = static_cast<float>(kBlnXHeight) / box.height();
  const float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
  std::unique_ptr<TBLOB> normed_blob(new TBLOB(*tblob));
  normed_blob->Normalize(nullptr, nullptr, nullptr, x_orig, y_orig, scaling, scaling,
                         0.0f, static_cast<float>(kBlnBaselineOffset),
                         false, nullptr);
  equ_tesseract_.AdaptiveClassifier(normed_blob.get(), &ratings_equ);
  lang_tesseract_->AdaptiveClassifier(normed_blob.get(), &ratings_lang);
  delete tblob;

  // Get the best choice from ratings_lang and rating_equ. As the choice in the
  // list has already been sorted by the certainty, we simply use the first
  // choice.
  BLOB_CHOICE *lang_choice = nullptr, *equ_choice = nullptr;
  if (ratings_lang.length() > 0) {
    BLOB_CHOICE_IT choice_it(&ratings_lang);
    lang_choice = choice_it.data();
  }
  if (ratings_equ.length() > 0) {
    BLOB_CHOICE_IT choice_it(&ratings_equ);
    equ_choice = choice_it.data();
  }

  // Missing choices score -FLT_MAX so they can never win a comparison.
  const float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX;
  const float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX;

  const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;
  // The scores here are negative, so the max/min == fabs(min/max).
  // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score);
  const float diff = fabs(lang_score - equ_score);
  BlobSpecialTextType type = BSTT_NONE;

  // Classification.
  if (fmax(lang_score, equ_score) < kConfScoreTh) {
    // If both score are very small, then mark it as unclear.
    type = BSTT_UNCLEAR;
  } else if (diff > kConfDiffTh && equ_score > lang_score) {
    // If equ_score is significantly higher, then we classify this character as
    // math symbol.
    type = BSTT_MATH;
  } else if (lang_choice) {
    // For other cases: lang_score is similar or significantly higher.
    type = EstimateTypeForUnichar(
        lang_tesseract_->unicharset, lang_choice->unichar_id());
  }

  // type can only still be BSTT_NONE via the lang_choice branch above, so
  // lang_choice is non-null whenever this dereference happens.
  if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get(
      lang_choice->fontinfo_id()).is_italic()) {
    // For text symbol, we still check if it is italic.
    blobnbox->set_special_text_type(BSTT_ITALIC);
  } else {
    blobnbox->set_special_text_type(type);
  }
}
+
// Maps a unichar id to a special text type based on its character class:
// alphas -> BSTT_NONE; punctuation -> BSTT_MATH unless in an exclusion list
// of quote/bracket-like characters; digits (and "|") -> BSTT_DIGIT;
// everything else -> BSTT_MATH.
BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(
    const UNICHARSET& unicharset, const UNICHAR_ID id) const {
  const STRING s = unicharset.id_to_unichar(id);
  if (unicharset.get_isalpha(id)) {
    return BSTT_NONE;
  }

  if (unicharset.get_ispunctuation(id)) {
    // Exclude some special texts that are likely to be confused as math symbol.
    // NOTE(review): the lazy init of this static vector is not obviously
    // thread-safe if this method is called concurrently — confirm.
    static GenericVector<UNICHAR_ID> ids_to_exclude;
    if (ids_to_exclude.empty()) {
      static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
          "〈", "〉", "《", "》", "」", "「", ""};
      int i = 0;
      while (kCharsToEx[i] != "") {
        ids_to_exclude.push_back(
            unicharset.unichar_to_id(kCharsToEx[i++].c_str()));
      }
      ids_to_exclude.sort();
    }
    return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
  }

  // Check if it is digit. In addition to the isdigit attribute, we also check
  // if this character belongs to those likely to be confused with a digit.
  static const STRING kDigitsChars = "|";
  if (unicharset.get_isdigit(id) ||
      (s.length() == 1 && kDigitsChars.contains(s[0]))) {
    return BSTT_DIGIT;
  } else {
    return BSTT_MATH;
  }
}
+
// Pass 0 of equation detection: classifies the special text type of every
// blob in every text/equation partition.  Temporarily reconfigures both
// classifiers (CN matching on for equ, pruner multipliers zeroed for lang)
// and restores the lang settings afterwards.
void EquationDetect::IdentifySpecialText() {
  // Set configuration for Tesseract::AdaptiveClassifier.
  equ_tesseract_.tess_cn_matching.set_value(1);  // turn it on
  equ_tesseract_.tess_bn_matching.set_value(0);

  // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
  const int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;
  const int classify_integer_matcher =
      lang_tesseract_->classify_integer_matcher_multiplier;
  lang_tesseract_->classify_class_pruner_multiplier.set_value(0);
  lang_tesseract_->classify_integer_matcher_multiplier.set_value(0);

  ColPartitionGridSearch gsearch(part_grid_);
  ColPartition *part = nullptr;
  gsearch.StartFullSearch();
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    if (!IsTextOrEquationType(part->type())) {
      continue;
    }
    IdentifyBlobsToSkip(part);
    BLOBNBOX_C_IT bbox_it(part->boxes());
    // Compute the height threshold: 2/3 of the median non-skipped height.
    // NOTE(review): if every blob were BSTT_SKIP, blob_heights would be
    // empty and the median index below would be out of range — confirm
    // IdentifyBlobsToSkip cannot mark all blobs.
    GenericVector<int> blob_heights;
    for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
         bbox_it.forward()) {
      if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
        blob_heights.push_back(bbox_it.data()->bounding_box().height());
      }
    }
    blob_heights.sort();
    const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
    for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
         bbox_it.forward()) {
      if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
        IdentifySpecialText(bbox_it.data(), height_th);
      }
    }
  }

  // Set the multiplier values back.
  lang_tesseract_->classify_class_pruner_multiplier.set_value(
      classify_class_pruner);
  lang_tesseract_->classify_integer_matcher_multiplier.set_value(
      classify_integer_matcher);

  if (equationdetect_save_spt_image) {  // For debug.
    STRING outfile;
    GetOutputTiffName("_spt", &outfile);
    PaintSpecialTexts(outfile);
  }
}
+
// Marks as BSTT_SKIP any group of horizontally overlapping, similarly sized
// blobs in the partition (e.g. diacritic pieces stacked over a base), so the
// per-blob classification skips them.
void EquationDetect::IdentifyBlobsToSkip(ColPartition* part) {
  ASSERT_HOST(part);
  BLOBNBOX_C_IT blob_it(part->boxes());

  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    // At this moment, no blob should have been joined.
    ASSERT_HOST(!blob_it.data()->joined_to_prev());
  }
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX* blob = blob_it.data();
    if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {
      continue;
    }
    TBOX blob_box = blob->bounding_box();

    // Search if any blob can be merged into blob. If found, then we mark all
    // these blobs as BSTT_SKIP.
    BLOBNBOX_C_IT blob_it2 = blob_it;
    bool found = false;
    while (!blob_it2.at_last()) {
      BLOBNBOX* nextblob = blob_it2.forward();
      const TBOX& nextblob_box = nextblob->bounding_box();
      // Blobs are in x order, so once the gap opens we can stop.
      if (nextblob_box.left() >= blob_box.right()) {
        break;
      }
      const float kWidthR = 0.4, kHeightR = 0.3;
      const bool xoverlap = blob_box.major_x_overlap(nextblob_box),
          yoverlap = blob_box.y_overlap(nextblob_box);
      const float widthR = static_cast<float>(
          std::min(nextblob_box.width(), blob_box.width())) /
          std::max(nextblob_box.width(), blob_box.width());
      const float heightR = static_cast<float>(
          std::min(nextblob_box.height(), blob_box.height())) /
          std::max(nextblob_box.height(), blob_box.height());

      if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
        // Found one, set nextblob type and recompute blob_box.
        found = true;
        nextblob->set_special_text_type(BSTT_SKIP);
        blob_box += nextblob_box;
      }
    }
    if (found) {
      blob->set_special_text_type(BSTT_SKIP);
    }
  }
}
+
// Main entry point: runs the multi-pass equation detection over the page's
// partition grid.  Returns 0 on success, -1 when lang_tesseract_ or the
// inputs are missing.  Passes: 0) classify blob special-text types,
// 1) merge overlapping partitions, 2) find seed partitions, 3) iteratively
// expand block-equation seeds, 4) absorb satellite text partitions.
int EquationDetect::FindEquationParts(
    ColPartitionGrid* part_grid, ColPartitionSet** best_columns) {
  if (!lang_tesseract_) {
    tprintf("Warning: lang_tesseract_ is nullptr!\n");
    return -1;
  }
  if (!part_grid || !best_columns) {
    tprintf("part_grid/best_columns is nullptr!!\n");
    return -1;
  }
  cp_seeds_.clear();
  part_grid_ = part_grid;
  best_columns_ = best_columns;
  resolution_ = lang_tesseract_->source_resolution();
  STRING outfile;
  page_count_++;

  if (equationdetect_save_bi_image) {
    GetOutputTiffName("_bi", &outfile);
    pixWrite(outfile.c_str(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
  }

  // Pass 0: Compute special text type for blobs.
  IdentifySpecialText();

  // Pass 1: Merge parts by overlap.
  MergePartsByLocation();

  // Pass 2: compute the math blob density and find the seed partition.
  IdentifySeedParts();
  // We still need separate seed into block seed and inline seed partition.
  IdentifyInlineParts();

  if (equationdetect_save_seed_image) {
    GetOutputTiffName("_seed", &outfile);
    PaintColParts(outfile);
  }

  // Pass 3: expand block equation seeds.
  // Each round expands what it can; expanded seeds are re-inserted and
  // retried until no seed expands any further.
  while (!cp_seeds_.empty()) {
    GenericVector<ColPartition*> seeds_expanded;
    for (int i = 0; i < cp_seeds_.size(); ++i) {
      if (ExpandSeed(cp_seeds_[i])) {
        // If this seed is expanded, then we add it into seeds_expanded. Note
        // this seed has been removed from part_grid_ if it is expanded.
        seeds_expanded.push_back(cp_seeds_[i]);
      }
    }
    // Add seeds_expanded back into part_grid_ and reset cp_seeds_.
    for (int i = 0; i < seeds_expanded.size(); ++i) {
      InsertPartAfterAbsorb(seeds_expanded[i]);
    }
    cp_seeds_ = seeds_expanded;
  }

  // Pass 4: find math block satellite text partitions and merge them.
  ProcessMathBlockSatelliteParts();

  if (equationdetect_save_merged_image) {  // For debug.
    GetOutputTiffName("_merged", &outfile);
    PaintColParts(outfile);
  }

  return 0;
}
+
// Repeatedly absorbs text/equation partitions that overlap each other
// (per SearchByOverlap) until a full sweep of the grid merges nothing.
void EquationDetect::MergePartsByLocation() {
  while (true) {
    ColPartition* part = nullptr;
    // partitions that have been updated.
    GenericVector<ColPartition*> parts_updated;
    ColPartitionGridSearch gsearch(part_grid_);
    gsearch.StartFullSearch();
    while ((part = gsearch.NextFullSearch()) != nullptr) {
      if (!IsTextOrEquationType(part->type())) {
        continue;
      }
      GenericVector<ColPartition*> parts_to_merge;
      SearchByOverlap(part, &parts_to_merge);
      if (parts_to_merge.empty()) {
        continue;
      }

      // Merge parts_to_merge with part, and remove them from part_grid_.
      part_grid_->RemoveBBox(part);
      for (int i = 0; i < parts_to_merge.size(); ++i) {
        ASSERT_HOST(parts_to_merge[i] != nullptr && parts_to_merge[i] != part);
        part->Absorb(parts_to_merge[i], nullptr);
      }
      // The grid was mutated while iterating: resync the search iterator.
      gsearch.RepositionIterator();

      parts_updated.push_back(part);
    }

    if (parts_updated.empty()) {  // Exit the loop
      break;
    }

    // Re-insert parts_updated into part_grid_.
    for (int i = 0; i < parts_updated.size(); ++i) {
      InsertPartAfterAbsorb(parts_updated[i]);
    }
  }
}
+
// Collects, into parts_overlap, the text/equation partitions near seed that
// should be merged into it: any partition almost fully overlapping the seed,
// or (when the seed is already PT_EQUATION) any with moderate x/y overlap.
// Collected partitions are removed from the grid by the search.
void EquationDetect::SearchByOverlap(
    ColPartition* seed,
    GenericVector<ColPartition*>* parts_overlap) {
  ASSERT_HOST(seed != nullptr && parts_overlap != nullptr);
  if (!IsTextOrEquationType(seed->type())) {
    return;
  }
  ColPartitionGridSearch search(part_grid_);
  const TBOX& seed_box(seed->bounding_box());
  const int kRadNeighborCells = 30;
  search.StartRadSearch((seed_box.left() + seed_box.right()) / 2,
                        (seed_box.top() + seed_box.bottom()) / 2,
                        kRadNeighborCells);
  search.SetUniqueMode(true);

  // Search iteratively.
  ColPartition *part;
  GenericVector<ColPartition*> parts;
  const float kLargeOverlapTh = 0.95;
  const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
  while ((part = search.NextRadSearch()) != nullptr) {
    if (part == seed || !IsTextOrEquationType(part->type())) {
      continue;
    }
    const TBOX& part_box(part->bounding_box());
    bool merge = false;

    const float x_overlap_fraction = part_box.x_overlap_fraction(seed_box),
        y_overlap_fraction = part_box.y_overlap_fraction(seed_box);

    // If part is large overlapped with seed, then set merge to true.
    if (x_overlap_fraction >= kLargeOverlapTh &&
        y_overlap_fraction >= kLargeOverlapTh) {
      merge = true;
    } else if (seed->type() == PT_EQUATION &&
               IsTextOrEquationType(part->type())) {
      // Equation seeds absorb neighbors with weaker overlap in one axis.
      if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||
          (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {
        merge = true;
      }
    }

    if (merge) {  // Remove the part from search and put it into parts.
      search.RemoveBBox();
      parts_overlap->push_back(part);
    }
  }
}
+
// Re-inserts a partition into part_grid_ after it absorbed others:
// recomputes its column attributes via SetPartitionType while preserving its
// original type/blob_type/flow.
void EquationDetect::InsertPartAfterAbsorb(ColPartition* part) {
  ASSERT_HOST(part);

  // Before insert part back into part_grid_, we will need re-compute some
  // of its attributes such as first_column_, last_column_. However, we still
  // want to preserve its type.
  BlobTextFlowType flow_type = part->flow();
  PolyBlockType part_type = part->type();
  BlobRegionType blob_type = part->blob_type();

  // Call SetPartitionType to re-compute the attributes of part.
  const TBOX& part_box(part->bounding_box());
  int grid_x, grid_y;
  part_grid_->GridCoords(
      part_box.left(), part_box.bottom(), &grid_x, &grid_y);
  part->SetPartitionType(resolution_, best_columns_[grid_y]);

  // Reset the types back.
  part->set_type(part_type);
  part->set_blob_type(blob_type);
  part->set_flow(flow_type);
  part->SetBlobTypes();

  // Insert into part_grid_.
  part_grid_->InsertBBox(true, true, part);
}
+
// Pass 2: scans all text/equation partitions, computes their special-blob
// densities, and selects equation seeds.  High-density candidates (seeds1)
// become PT_EQUATION or PT_INLINE_EQUATION; lower-density but left-indented
// candidates (seeds2) are verified by CheckForSeed2.  Non-seed text
// partitions contribute the statistics (left-indent positions, foreground
// density) used to verify the candidates.
void EquationDetect::IdentifySeedParts() {
  ColPartitionGridSearch gsearch(part_grid_);
  ColPartition *part = nullptr;
  gsearch.StartFullSearch();

  GenericVector<ColPartition*> seeds1, seeds2;
  // The left coordinates of indented text partitions.
  GenericVector<int> indented_texts_left;
  // The foreground density of text partitions.
  GenericVector<float> texts_foreground_density;
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    if (!IsTextOrEquationType(part->type())) {
      continue;
    }
    part->ComputeSpecialBlobsDensity();
    const bool blobs_check = CheckSeedBlobsCount(part);
    const int kTextBlobsTh = 20;

    if (CheckSeedDensity(kMathDigitDensityTh1, kMathDigitDensityTh2, part) &&
        blobs_check) {
      // Passed high density threshold test, save into seeds1.
      seeds1.push_back(part);
    } else {
      IndentType indent = IsIndented(part);
      if (IsLeftIndented(indent) && blobs_check &&
          CheckSeedDensity(kMathDigitDensityTh2, kMathDigitDensityTh2, part)) {
        // Passed low density threshold test and is indented, save into seeds2.
        seeds2.push_back(part);
      } else if (!IsRightIndented(indent) &&
                 part->boxes_count() > kTextBlobsTh) {
        // This is likely to be a text part, save the features.
        const TBOX&box = part->bounding_box();
        if (IsLeftIndented(indent)) {
          indented_texts_left.push_back(box.left());
        }
        texts_foreground_density.push_back(ComputeForegroundDensity(box));
      }
    }
  }

  // Sort the features collected from text regions.
  indented_texts_left.sort();
  texts_foreground_density.sort();
  float foreground_density_th = 0.15;  // Default value.
  if (!texts_foreground_density.empty()) {
    // Use the median of the texts_foreground_density.
    foreground_density_th = 0.8 * texts_foreground_density[
        texts_foreground_density.size() / 2];
  }

  for (int i = 0; i < seeds1.size(); ++i) {
    const TBOX& box = seeds1[i]->bounding_box();
    // A dense candidate whose left edge aligns with indented text is more
    // likely an indented paragraph than a displayed equation.
    if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) &&
        !(IsLeftIndented(IsIndented(seeds1[i])) &&
          CountAlignment(indented_texts_left, box.left()) >=
          kLeftIndentAlignmentCountTh)) {
      // Mark as PT_EQUATION type.
      seeds1[i]->set_type(PT_EQUATION);
      cp_seeds_.push_back(seeds1[i]);
    } else {  // Mark as PT_INLINE_EQUATION type.
      seeds1[i]->set_type(PT_INLINE_EQUATION);
    }
  }

  for (int i = 0; i < seeds2.size(); ++i) {
    if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) {
      seeds2[i]->set_type(PT_EQUATION);
      cp_seeds_.push_back(seeds2[i]);
    }
  }
}
+
+float EquationDetect::ComputeForegroundDensity(const TBOX& tbox) {
+ Pix *pix_bi = lang_tesseract_->pix_binary();
+ const int pix_height = pixGetHeight(pix_bi);
+ Box* box = boxCreate(tbox.left(), pix_height - tbox.top(),
+ tbox.width(), tbox.height());
+ Pix *pix_sub = pixClipRectangle(pix_bi, box, nullptr);
+ l_float32 fract;
+ pixForegroundFraction(pix_sub, &fract);
+ pixDestroy(&pix_sub);
+ boxDestroy(&box);
+
+ return fract;
+}
+
+bool EquationDetect::CheckSeedFgDensity(const float density_th,
+ ColPartition* part) {
+ ASSERT_HOST(part);
+
+ // Split part horizontall, and check for each sub part.
+ GenericVector<TBOX> sub_boxes;
+ SplitCPHorLite(part, &sub_boxes);
+ float parts_passed = 0.0;
+ for (int i = 0; i < sub_boxes.size(); ++i) {
+ const float density = ComputeForegroundDensity(sub_boxes[i]);
+ if (density < density_th) {
+ parts_passed++;
+ }
+ }
+
+ // If most sub parts passed, then we return true.
+ const float kSeedPartRatioTh = 0.3;
+ bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);
+
+ return retval;
+}
+
// Splits part into pieces at horizontal gaps wider than 3x its median blob
// width, storing the resulting ColPartitions (which own their boxes' list
// nodes but not the blobs) into parts_splitted. part itself becomes the
// leftmost piece's source via CopyButDontOwnBlobs, so part is not modified.
void EquationDetect::SplitCPHor(ColPartition* part,
                                GenericVector<ColPartition*>* parts_splitted) {
  ASSERT_HOST(part && parts_splitted);
  // Nothing sensible to split: no width statistics or no blobs.
  if (part->median_width() == 0 || part->boxes_count() == 0) {
    return;
  }

  // Make a copy of part, and reset parts_splitted.
  ColPartition* right_part = part->CopyButDontOwnBlobs();
  parts_splitted->delete_data_pointers();
  parts_splitted->clear();

  // A gap wider than 3 median blob widths separates two pieces.
  const double kThreshold = part->median_width() * 3.0;
  bool found_split = true;
  while (found_split) {
    found_split = false;
    BLOBNBOX_C_IT box_it(right_part->boxes());
    // Blobs are sorted left side first. If blobs overlap,
    // the previous blob may have a "more right" right side.
    // Account for this by always keeping the largest "right"
    // so far.
    int previous_right = INT32_MIN;

    // Look for the next split in the partition.
    for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
      const TBOX& box = box_it.data()->bounding_box();
      if (previous_right != INT32_MIN &&
          box.left() - previous_right > kThreshold) {
        // We have a split position. Split the partition in two pieces.
        // Insert the left piece in the grid and keep processing the right.
        const int mid_x = (box.left() + previous_right) / 2;
        ColPartition* left_part = right_part;
        right_part = left_part->SplitAt(mid_x);

        parts_splitted->push_back(left_part);
        left_part->ComputeSpecialBlobsDensity();
        // Restart the scan on the remaining right piece.
        found_split = true;
        break;
      }

      // The right side of the previous blobs.
      previous_right = std::max(previous_right, static_cast<int>(box.right()));
    }
  }

  // Add the last piece.
  right_part->ComputeSpecialBlobsDensity();
  parts_splitted->push_back(right_part);
}
+
// Lightweight variant of SplitCPHor: computes only the bounding boxes of
// the pieces that a horizontal split of part would produce (gaps wider
// than 3x the median blob width), without creating any ColPartitions.
void EquationDetect::SplitCPHorLite(ColPartition* part,
                                    GenericVector<TBOX>* splitted_boxes) {
  ASSERT_HOST(part && splitted_boxes);
  splitted_boxes->clear();
  // No width statistics means no sensible split threshold.
  if (part->median_width() == 0) {
    return;
  }

  const double kThreshold = part->median_width() * 3.0;

  // Blobs are sorted left side first. If blobs overlap,
  // the previous blob may have a "more right" right side.
  // Account for this by always keeping the largest "right"
  // so far.
  TBOX union_box;
  int previous_right = INT32_MIN;
  BLOBNBOX_C_IT box_it(part->boxes());
  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
    const TBOX& box = box_it.data()->bounding_box();
    if (previous_right != INT32_MIN &&
        box.left() - previous_right > kThreshold) {
      // We have a split position: emit the accumulated piece and reset.
      splitted_boxes->push_back(union_box);
      previous_right = INT32_MIN;
    }
    if (previous_right == INT32_MIN) {
      // Start a new piece with the current blob.
      union_box = box;
    } else {
      union_box += box;
    }
    // The right side of the previous blobs.
    previous_right = std::max(previous_right, static_cast<int>(box.right()));
  }

  // Add the last piece.
  if (previous_right != INT32_MIN) {
    splitted_boxes->push_back(union_box);
  }
}
+
+bool EquationDetect::CheckForSeed2(
+ const GenericVector<int>& indented_texts_left,
+ const float foreground_density_th,
+ ColPartition* part) {
+ ASSERT_HOST(part);
+ const TBOX& box = part->bounding_box();
+
+ // Check if it is aligned with any indented_texts_left.
+ if (!indented_texts_left.empty() &&
+ CountAlignment(indented_texts_left, box.left()) >=
+ kLeftIndentAlignmentCountTh) {
+ return false;
+ }
+
+ // Check the foreground density.
+ if (ComputeForegroundDensity(box) > foreground_density_th) {
+ return false;
+ }
+
+ return true;
+}
+
// Returns the number of elements in sorted_vec whose value is within
// kDistTh of val. From the position returned by binary_search, it scans
// leftwards and then rightwards until the distance threshold is exceeded.
int EquationDetect::CountAlignment(
    const GenericVector<int>& sorted_vec, const int val) const {
  if (sorted_vec.empty()) {
    return 0;
  }
  // Alignment tolerance: 0.03 inch at the image resolution.
  const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
  const int pos = sorted_vec.binary_search(val);
  int count = 0;

  // Search left side. Note the post-decrement: the element at index is
  // tested before index moves left.
  int index = pos;
  while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
    count++;
  }

  // Search right side (elements right of pos are >= val, so no abs needed).
  index = pos + 1;
  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
    count++;
  }

  return count;
}
+
// Filters cp_seeds_ down to displayed-equation candidates by detecting and
// re-typing inline equations: first horizontally (seeds embedded in a text
// line), then vertically in both directions using the estimated text line
// spacing.
void EquationDetect::IdentifyInlineParts() {
  ComputeCPsSuperBBox();
  IdentifyInlinePartsHorizontal();
  const int textparts_linespacing = EstimateTextPartLineSpacing();
  IdentifyInlinePartsVertical(true, textparts_linespacing);
  IdentifyInlinePartsVertical(false, textparts_linespacing);
}
+
+void EquationDetect::ComputeCPsSuperBBox() {
+ ColPartitionGridSearch gsearch(part_grid_);
+ ColPartition *part = nullptr;
+ gsearch.StartFullSearch();
+ delete cps_super_bbox_;
+ cps_super_bbox_ = new TBOX();
+ while ((part = gsearch.NextFullSearch()) != nullptr) {
+ (*cps_super_bbox_) += part->bounding_box();
+ }
+}
+
// Scans cp_seeds_ for seeds that sit inside a text line: a seed that is
// left aligned with the page (or lies in the right half) is checked for a
// same-line text/equation neighbor on its open side. Depending on the
// neighbor found, the seed is either re-typed PT_INLINE_EQUATION or kept.
// Surviving seeds are written back into cp_seeds_.
void EquationDetect::IdentifyInlinePartsHorizontal() {
  ASSERT_HOST(cps_super_bbox_);
  GenericVector<ColPartition*> new_seeds;
  // Margin tolerance: 0.5 inch at source resolution.
  const int kMarginDiffTh = IntCastRounded(
      0.5 * lang_tesseract_->source_resolution());
  // Maximum horizontal gap to a same-line neighbor: 1 inch.
  const int kGapTh = static_cast<int>(roundf(
      1.0 * lang_tesseract_->source_resolution()));
  ColPartitionGridSearch search(part_grid_);
  search.SetUniqueMode(true);
  // The center x coordinate of the cp_super_bbox_.
  const int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;
  for (int i = 0; i < cp_seeds_.size(); ++i) {
    ColPartition* part = cp_seeds_[i];
    const TBOX& part_box(part->bounding_box());
    const int left_margin = part_box.left() - cps_super_bbox_->left(),
        right_margin = cps_super_bbox_->right() - part_box.right();
    bool right_to_left;
    if (left_margin + kMarginDiffTh < right_margin &&
        left_margin < kMarginDiffTh) {
      // part is left aligned, so we search if it has any right neighbor.
      search.StartSideSearch(
          part_box.right(), part_box.top(), part_box.bottom());
      right_to_left = false;
    } else if (left_margin > cps_cx) {
      // part locates on the right half on image, so search if it has any left
      // neighbor.
      search.StartSideSearch(
          part_box.left(), part_box.top(), part_box.bottom());
      right_to_left = true;
    } else {  // part is not an inline equation.
      new_seeds.push_back(part);
      continue;
    }
    ColPartition* neighbor = nullptr;
    bool side_neighbor_found = false;
    while ((neighbor = search.NextSideSearch(right_to_left)) != nullptr) {
      const TBOX& neighbor_box(neighbor->bounding_box());
      // Neighbor must be text/equation, close enough, on the same line
      // (major y overlap) and not overlapping part horizontally.
      if (!IsTextOrEquationType(neighbor->type()) ||
          part_box.x_gap(neighbor_box) > kGapTh ||
          !part_box.major_y_overlap(neighbor_box) ||
          part_box.major_x_overlap(neighbor_box)) {
        continue;
      }
      // We have found one. Set the side_neighbor_found flag.
      side_neighbor_found = true;
      break;
    }
    if (!side_neighbor_found) {  // Mark part as PT_INLINE_EQUATION.
      part->set_type(PT_INLINE_EQUATION);
    } else {
      // Check the geometric feature of neighbor.
      const TBOX& neighbor_box(neighbor->bounding_box());
      if (neighbor_box.width() > part_box.width() &&
          neighbor->type() != PT_EQUATION) {  // Mark as PT_INLINE_EQUATION.
        part->set_type(PT_INLINE_EQUATION);
      } else {  // part is not an inline equation type.
        new_seeds.push_back(part);
      }
    }
  }

  // Reset the cp_seeds_ using the new_seeds.
  cp_seeds_ = new_seeds;
}
+
// Estimates the typical vertical spacing between consecutive text
// partitions. Returns the mean of the smaller half of the observed y gaps,
// or -1 if fewer than 8 gaps were collected. Note: this relies on the grid
// search order pairing vertically adjacent partitions as prev/current.
int EquationDetect::EstimateTextPartLineSpacing() {
  ColPartitionGridSearch gsearch(part_grid_);

  // Get the y gap between text partitions;
  ColPartition *current = nullptr, *prev = nullptr;
  gsearch.StartFullSearch();
  GenericVector<int> ygaps;
  while ((current = gsearch.NextFullSearch()) != nullptr) {
    if (!PTIsTextType(current->type())) {
      continue;
    }
    if (prev != nullptr) {
      const TBOX &current_box = current->bounding_box();
      const TBOX &prev_box = prev->bounding_box();
      // prev and current should be x major overlap and non y overlap.
      if (current_box.major_x_overlap(prev_box) &&
          !current_box.y_overlap(prev_box)) {
        int gap = current_box.y_gap(prev_box);
        if (gap < std::min(current_box.height(), prev_box.height())) {
          // The gap should be smaller than the height of the bounding boxes.
          ygaps.push_back(gap);
        }
      }
    }
    prev = current;
  }

  if (ygaps.size() < 8) {  // We do not have enough data.
    return -1;
  }

  // Compute the line spacing from ygaps: use the mean of the first half.
  ygaps.sort();
  int spacing = 0, count;
  for (count = 0; count < ygaps.size() / 2; count++) {
    spacing += ygaps[count];
  }
  return spacing / count;
}
+
// Re-types as PT_INLINE_EQUATION any seed whose nearest vertical text
// neighbor makes it look like a line inside a paragraph. The seeds are
// processed in sorted order (top-to-bottom or bottom-to-top) so that
// chains of connected inline regions are peeled off one end at a time.
// Surviving seeds are written back into cp_seeds_.
void EquationDetect::IdentifyInlinePartsVertical(
    const bool top_to_bottom, const int textparts_linespacing) {
  if (cp_seeds_.empty()) {
    return;
  }

  // Sort cp_seeds_.
  if (top_to_bottom) {  // From top to bottom.
    cp_seeds_.sort(&SortCPByTopReverse);
  } else {  // From bottom to top.
    cp_seeds_.sort(&SortCPByBottom);
  }

  GenericVector<ColPartition*> new_seeds;
  for (int i = 0; i < cp_seeds_.size(); ++i) {
    ColPartition* part = cp_seeds_[i];
    // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
    // for its top neighbors, so that if two/more inline regions are connected
    // to each other, then we will identify the top one, and then use it to
    // identify the bottom one.
    if (IsInline(!top_to_bottom, textparts_linespacing, part)) {
      part->set_type(PT_INLINE_EQUATION);
    } else {
      new_seeds.push_back(part);
    }
  }
  cp_seeds_ = new_seeds;
}
+
// Returns true if part looks like an inline equation: it has a nearby text
// neighbor in the searched vertical direction that overlaps it in x, lies
// within the expected line spacing, and has a similar height.
bool EquationDetect::IsInline(const bool search_bottom,
                              const int textparts_linespacing,
                              ColPartition* part) {
  ASSERT_HOST(part != nullptr);
  // Look for its nearest vertical neighbor that hardly overlaps in y but
  // largely overlaps in x.
  ColPartitionGridSearch search(part_grid_);
  ColPartition *neighbor = nullptr;
  const TBOX& part_box(part->bounding_box());
  // Stop searching once the y gap exceeds the smaller box height.
  const float kYGapRatioTh = 1.0;

  if (search_bottom) {
    search.StartVerticalSearch(part_box.left(), part_box.right(),
                               part_box.bottom());
  } else {
    search.StartVerticalSearch(part_box.left(), part_box.right(),
                               part_box.top());
  }
  search.SetUniqueMode(true);
  while ((neighbor = search.NextVerticalSearch(search_bottom)) != nullptr) {
    const TBOX& neighbor_box(neighbor->bounding_box());
    if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
        std::min(part_box.height(), neighbor_box.height())) {
      // Finished searching.
      break;
    }
    if (!PTIsTextType(neighbor->type())) {
      continue;
    }

    // Check if neighbor and part is inline similar.
    const float kHeightRatioTh = 0.5;
    // Line-spacing threshold: use the estimate plus a small slack when
    // available, otherwise a resolution-based default.
    const int kYGapTh = textparts_linespacing > 0 ?
        textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)):
        static_cast<int>(roundf(0.05 * resolution_));  // Default value.
    if (part_box.x_overlap(neighbor_box) &&  // Location feature.
        part_box.y_gap(neighbor_box) <= kYGapTh &&  // Line spacing.
        // Geo feature.
        static_cast<float>(std::min(part_box.height(), neighbor_box.height())) /
        std::max(part_box.height(), neighbor_box.height()) > kHeightRatioTh) {
      return true;
    }
  }

  return false;
}
+
+bool EquationDetect::CheckSeedBlobsCount(ColPartition* part) {
+ if (!part) {
+ return false;
+ }
+ const int kSeedMathBlobsCount = 2;
+ const int kSeedMathDigitBlobsCount = 5;
+
+ const int blobs = part->boxes_count(),
+ math_blobs = part->SpecialBlobsCount(BSTT_MATH),
+ digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT);
+ if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||
+ math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {
+ return false;
+ }
+
+ return true;
+}
+
+bool EquationDetect::CheckSeedDensity(
+ const float math_density_high,
+ const float math_density_low,
+ const ColPartition* part) const {
+ ASSERT_HOST(part);
+ float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH)
+ + part->SpecialBlobsDensity(BSTT_DIGIT);
+ float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC);
+ if (math_digit_density > math_density_high) {
+ return true;
+ }
+ if (math_digit_density + italic_density > kMathItalicDensityTh &&
+ math_digit_density > math_density_low) {
+ return true;
+ }
+
+ return false;
+}
+
// Determines whether part is indented relative to its vertical neighbors,
// using a radius search around part's center. Returns NO_INDENT early if a
// close same-line neighbor is found (part is then likely a fragment of an
// over-segmented partition), otherwise LEFT/RIGHT/BOTH_INDENT based on how
// the neighbors' edges extend beyond part's.
EquationDetect::IndentType EquationDetect::IsIndented(ColPartition* part) {
  ASSERT_HOST(part);

  ColPartitionGridSearch search(part_grid_);
  ColPartition *neighbor = nullptr;
  const TBOX& part_box(part->bounding_box());
  // Thresholds in pixels: 0.5 inch x/y gaps, 3 inch search radius.
  const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
  const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
  const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));

  // Here we use a simple approximation algorithm: from the center of part, We
  // perform the radius search, and check if we can find a neighboring partition
  // that locates on the top/bottom left of part.
  search.StartRadSearch((part_box.left() + part_box.right()) / 2,
                        (part_box.top() + part_box.bottom()) / 2, kRadiusTh);
  search.SetUniqueMode(true);
  bool left_indented = false, right_indented = false;
  while ((neighbor = search.NextRadSearch()) != nullptr &&
         (!left_indented || !right_indented)) {
    if (neighbor == part) {
      continue;
    }
    const TBOX& neighbor_box(neighbor->bounding_box());

    if (part_box.major_y_overlap(neighbor_box) &&
        part_box.x_gap(neighbor_box) < kXGapTh) {
      // When this happens, it is likely part is a fragment of an
      // over-segmented colpartition. So we return false.
      return NO_INDENT;
    }

    if (!IsTextOrEquationType(neighbor->type())) {
      continue;
    }

    // The neighbor should be above/below part, and overlap in x direction.
    if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {
      continue;
    }

    if (part_box.y_gap(neighbor_box) < kYGapTh) {
      const int left_gap = part_box.left() - neighbor_box.left();
      const int right_gap = neighbor_box.right() - part_box.right();
      if (left_gap > kXGapTh) {
        left_indented = true;
      }
      if (right_gap > kXGapTh) {
        right_indented = true;
      }
    }
  }

  if (left_indented && right_indented) {
    return BOTH_INDENT;
  }
  if (left_indented) {
    return LEFT_INDENT;
  }
  if (right_indented) {
    return RIGHT_INDENT;
  }
  return NO_INDENT;
}
+
// Grows seed by absorbing neighboring partitions in all four directions
// plus any partitions that overlap it. Returns true if anything was
// merged. The merged partitions have already been removed from part_grid_
// by the helper searches; seed itself is removed before absorbing (its
// box grows) and the caller is expected to re-insert it.
bool EquationDetect::ExpandSeed(ColPartition* seed) {
  if (seed == nullptr ||  // This seed has been absorbed by other seeds.
      seed->IsVerticalType()) {  // We skip vertical type right now.
    return false;
  }

  // Expand in four directions.
  GenericVector<ColPartition*> parts_to_merge;
  ExpandSeedHorizontal(true, seed, &parts_to_merge);
  ExpandSeedHorizontal(false, seed, &parts_to_merge);
  ExpandSeedVertical(true, seed, &parts_to_merge);
  ExpandSeedVertical(false, seed, &parts_to_merge);
  SearchByOverlap(seed, &parts_to_merge);

  if (parts_to_merge.empty()) {  // We don't find any partition to merge.
    return false;
  }

  // Merge all partitions in parts_to_merge with seed. We first remove seed
  // from part_grid_ as its bounding box is going to expand. Then we add it
  // back after it absorbs all parts_to_merge partitions.
  part_grid_->RemoveBBox(seed);
  for (int i = 0; i < parts_to_merge.size(); ++i) {
    ColPartition* part = parts_to_merge[i];
    if (part->type() == PT_EQUATION) {
      // If part is in cp_seeds_, then we mark it as nullptr so that we won't
      // process it again.
      for (int j = 0; j < cp_seeds_.size(); ++j) {
        if (part == cp_seeds_[j]) {
          cp_seeds_[j] = nullptr;
          break;
        }
      }
    }

    // part has already been removed from part_grid_ in function
    // ExpandSeedHorizontal/ExpandSeedVertical.
    seed->Absorb(part, nullptr);
  }

  return true;
}
+
// Searches to the left or right of seed for partitions that can be merged
// into it: nearby equation partitions with sufficient y overlap, or small
// near neighbors with math-like blob density. Matching partitions are
// removed from part_grid_ and appended to parts_to_merge.
void EquationDetect::ExpandSeedHorizontal(
    const bool search_left,
    ColPartition* seed,
    GenericVector<ColPartition*>* parts_to_merge) {
  ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr);
  const float kYOverlapTh = 0.6;
  // Maximum horizontal gap: 0.2 inch at the image resolution.
  const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));

  ColPartitionGridSearch search(part_grid_);
  const TBOX& seed_box(seed->bounding_box());
  const int x = search_left ? seed_box.left() : seed_box.right();
  search.StartSideSearch(x, seed_box.bottom(), seed_box.top());
  search.SetUniqueMode(true);

  // Search iteratively.
  ColPartition *part = nullptr;
  while ((part = search.NextSideSearch(search_left)) != nullptr) {
    if (part == seed) {
      continue;
    }
    const TBOX& part_box(part->bounding_box());
    if (part_box.x_gap(seed_box) > kXGapTh) {  // Out of scope.
      break;
    }

    // Check part location: it must lie on the searched side of seed.
    if ((part_box.left() >= seed_box.left() && search_left) ||
        (part_box.right() <= seed_box.right() && !search_left)) {
      continue;
    }

    if (part->type() != PT_EQUATION) {  // Non-equation type.
      // Skip PT_INLINE_EQUATION and non text type.
      if (part->type() == PT_INLINE_EQUATION ||
          (!IsTextOrEquationType(part->type()) &&
           part->blob_type() != BRT_HLINE)) {
        continue;
      }
      // For other types, it should be the near small neighbor of seed.
      if (!IsNearSmallNeighbor(seed_box, part_box) ||
          !CheckSeedNeighborDensity(part)) {
        continue;
      }
    } else {  // Equation type, check the y overlap.
      if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&
          seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {
        continue;
      }
    }

    // Passed the check, delete it from search and add into parts_to_merge.
    search.RemoveBBox();
    parts_to_merge->push_back(part);
  }
}
+
// Searches above or below seed for partitions that can be merged into it,
// analogous to ExpandSeedHorizontal but over the full width of
// cps_super_bbox_. Candidates are collected first, then filtered so that a
// candidate separated from seed by a skipped (rejected) partition is not
// merged across it. Accepted partitions are removed from part_grid_ and
// appended to parts_to_merge.
void EquationDetect::ExpandSeedVertical(
    const bool search_bottom,
    ColPartition* seed,
    GenericVector<ColPartition*>* parts_to_merge) {
  ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr &&
              cps_super_bbox_ != nullptr);
  const float kXOverlapTh = 0.4;
  // Maximum vertical gap: 0.2 inch at the image resolution.
  const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));

  ColPartitionGridSearch search(part_grid_);
  const TBOX& seed_box(seed->bounding_box());
  const int y = search_bottom ? seed_box.bottom() : seed_box.top();
  search.StartVerticalSearch(
      cps_super_bbox_->left(), cps_super_bbox_->right(), y);
  search.SetUniqueMode(true);

  // Search iteratively.
  ColPartition *part = nullptr;
  GenericVector<ColPartition*> parts;
  // Extremes of the skipped partitions, used for the blocking check below.
  int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1;
  while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) {
    if (part == seed) {
      continue;
    }
    const TBOX& part_box(part->bounding_box());

    if (part_box.y_gap(seed_box) > kYGapTh) {  // Out of scope.
      break;
    }

    // Check part location: it must lie on the searched side of seed.
    if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||
        (part_box.top() <= seed_box.top() && !search_bottom)) {
      continue;
    }

    bool skip_part = false;
    if (part->type() != PT_EQUATION) {  // Non-equation type.
      // Skip PT_INLINE_EQUATION and non text type.
      if (part->type() == PT_INLINE_EQUATION ||
          (!IsTextOrEquationType(part->type()) &&
           part->blob_type() != BRT_HLINE)) {
        skip_part = true;
      } else if (!IsNearSmallNeighbor(seed_box, part_box) ||
                 !CheckSeedNeighborDensity(part)) {
        // For other types, it should be the near small neighbor of seed.
        skip_part = true;
      }
    } else {  // Equation type, check the x overlap.
      if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&
          seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {
        skip_part = true;
      }
    }
    if (skip_part) {
      // Remember how far the skipped non-equation partitions extend, so
      // that candidates lying beyond them are rejected below.
      if (part->type() != PT_EQUATION) {
        if (skipped_min_top > part_box.top()) {
          skipped_min_top = part_box.top();
        }
        if (skipped_max_bottom < part_box.bottom()) {
          skipped_max_bottom = part_box.bottom();
        }
      }
    } else {
      parts.push_back(part);
    }
  }

  // For every part in parts, we need verify it is not above skipped_min_top
  // when search top, or not below skipped_max_bottom when search bottom. I.e.,
  // we will skip a part if it looks like:
  //   search bottom      |   search top
  // seed:    ******************   | part:    **********
  // skipped: xxx        |         skipped: xxx
  // part:    **********  |         seed:    ***********
  for (int i = 0; i < parts.size(); i++) {
    const TBOX& part_box(parts[i]->bounding_box());
    if ((search_bottom && part_box.top() <= skipped_max_bottom) ||
        (!search_bottom && part_box.bottom() >= skipped_min_top)) {
      continue;
    }
    // Add parts[i] into parts_to_merge, and delete it from part_grid_.
    parts_to_merge->push_back(parts[i]);
    part_grid_->RemoveBBox(parts[i]);
  }
}
+
+bool EquationDetect::IsNearSmallNeighbor(const TBOX& seed_box,
+ const TBOX& part_box) const {
+ const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
+ const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
+
+ // Check geometric feature.
+ if (part_box.height() > seed_box.height() ||
+ part_box.width() > seed_box.width()) {
+ return false;
+ }
+
+ // Check overlap and distance.
+ if ((!part_box.major_x_overlap(seed_box) ||
+ part_box.y_gap(seed_box) > kYGapTh) &&
+ (!part_box.major_y_overlap(seed_box) ||
+ part_box.x_gap(seed_box) > kXGapTh)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool EquationDetect::CheckSeedNeighborDensity(const ColPartition* part) const {
+ ASSERT_HOST(part);
+ if (part->boxes_count() < kSeedBlobsCountTh) {
+ // Too few blobs, skip the check.
+ return true;
+ }
+
+ // We check the math blobs density and the unclear blobs density.
+ if (part->SpecialBlobsDensity(BSTT_MATH) +
+ part->SpecialBlobsDensity(BSTT_DIGIT) > kMathDigitDensityTh1 ||
+ part->SpecialBlobsDensity(BSTT_UNCLEAR) > kUnclearDensityTh) {
+ return true;
+ }
+
+ return false;
+}
+
// Finds small text partitions that are "satellites" of equation blocks
// (e.g. a short trailing line of a displayed equation) and merges them
// into their neighboring PT_EQUATION partitions. Only text parts no taller
// than the median text height are considered.
void EquationDetect::ProcessMathBlockSatelliteParts() {
  // Iterate over part_grid_, and find all parts that are text type but not
  // equation type.
  ColPartition *part = nullptr;
  GenericVector<ColPartition*> text_parts;
  ColPartitionGridSearch gsearch(part_grid_);
  gsearch.StartFullSearch();
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) {
      text_parts.push_back(part);
    }
  }
  if (text_parts.empty()) {
    return;
  }

  // Compute the medium height of the text_parts.
  text_parts.sort(&SortCPByHeight);
  const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box();
  int med_height = text_box.height();
  if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
    // Even count: average the two middle heights.
    const TBOX& text_box =
        text_parts[text_parts.size() / 2 - 1]->bounding_box();
    med_height = static_cast<int>(roundf(
        0.5 * (text_box.height() + med_height)));
  }

  // Iterate every text_parts and check if it is a math block satellite.
  for (int i = 0; i < text_parts.size(); ++i) {
    const TBOX& text_box(text_parts[i]->bounding_box());
    if (text_box.height() > med_height) {
      continue;
    }
    GenericVector<ColPartition*> math_blocks;
    if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
      continue;
    }

    // Found. merge text_parts[i] with math_blocks.
    part_grid_->RemoveBBox(text_parts[i]);
    text_parts[i]->set_type(PT_EQUATION);
    for (int j = 0; j < math_blocks.size(); ++j) {
      part_grid_->RemoveBBox(math_blocks[j]);
      text_parts[i]->Absorb(math_blocks[j], nullptr);
    }
    // Re-insert the expanded partition with recomputed attributes.
    InsertPartAfterAbsorb(text_parts[i]);
  }
}
+
// Returns true if part is a satellite of one or two equation blocks: its
// nearest vertical neighbors are found above and below, part must lie
// within their horizontal extent, and at least the closer neighbor must be
// a near equation (IsNearMathNeighbor). The qualifying equation neighbors
// are returned in math_blocks.
bool EquationDetect::IsMathBlockSatellite(
    ColPartition* part, GenericVector<ColPartition*>* math_blocks) {
  ASSERT_HOST(part != nullptr && math_blocks != nullptr);
  math_blocks->clear();
  const TBOX& part_box(part->bounding_box());
  // Find the top/bottom nearest neighbor of part.
  ColPartition *neighbors[2];
  int y_gaps[2] = {std::numeric_limits<int>::max(), std::numeric_limits<int>::max()};
  // The horizontal boundary of the neighbors.
  int neighbors_left = std::numeric_limits<int>::max(), neighbors_right = 0;
  for (int i = 0; i < 2; ++i) {
    // i == 0 searches the top neighbor, i == 1 the bottom one.
    neighbors[i] = SearchNNVertical(i != 0, part);
    if (neighbors[i]) {
      const TBOX& neighbor_box = neighbors[i]->bounding_box();
      y_gaps[i] = neighbor_box.y_gap(part_box);
      if (neighbor_box.left() < neighbors_left) {
        neighbors_left = neighbor_box.left();
      }
      if (neighbor_box.right() > neighbors_right) {
        neighbors_right = neighbor_box.right();
      }
    }
  }
  if (neighbors[0] == neighbors[1]) {
    // This happens when part is inside neighbor.
    neighbors[1] = nullptr;
    y_gaps[1] = std::numeric_limits<int>::max();
  }

  // Check if part is within [neighbors_left, neighbors_right].
  if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {
    return false;
  }

  // Get the index of the near one in neighbors.
  int index = y_gaps[0] < y_gaps[1] ? 0 : 1;

  // Check the near one.
  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
    math_blocks->push_back(neighbors[index]);
  } else {
    // If the near one failed the check, then we skip checking the far one.
    return false;
  }

  // Check the far one.
  index = 1 - index;
  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
    math_blocks->push_back(neighbors[index]);
  }

  return true;
}
+
// Returns the nearest text/equation partition above (search_bottom false)
// or below (search_bottom true) part that has major x overlap with it and
// lies within 0.5 inch vertically, or nullptr if none is found.
ColPartition* EquationDetect::SearchNNVertical(
    const bool search_bottom, const ColPartition* part) {
  ASSERT_HOST(part);
  ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr;
  // Search cut-off: 0.5 inch at the image resolution.
  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));

  ColPartitionGridSearch search(part_grid_);
  search.SetUniqueMode(true);
  const TBOX& part_box(part->bounding_box());
  int y = search_bottom ? part_box.bottom() : part_box.top();
  search.StartVerticalSearch(part_box.left(), part_box.right(), y);
  int min_y_gap = std::numeric_limits<int>::max();
  while ((neighbor = search.NextVerticalSearch(search_bottom)) != nullptr) {
    if (neighbor == part || !IsTextOrEquationType(neighbor->type())) {
      continue;
    }
    const TBOX& neighbor_box(neighbor->bounding_box());
    int y_gap = neighbor_box.y_gap(part_box);
    if (y_gap > kYGapTh) {  // Out of scope.
      break;
    }
    // Require major x overlap and that neighbor lies strictly on the
    // searched side of part.
    if (!neighbor_box.major_x_overlap(part_box) ||
        (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||
        (!search_bottom && neighbor_box.top() < part_box.top())) {
      continue;
    }
    if (y_gap < min_y_gap) {
      // Keep the closest qualifying neighbor seen so far.
      min_y_gap = y_gap;
      nearest_neighbor = neighbor;
    }
  }

  return nearest_neighbor;
}
+
+bool EquationDetect::IsNearMathNeighbor(
+ const int y_gap, const ColPartition *neighbor) const {
+ if (!neighbor) {
+ return false;
+ }
+ const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
+ return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
+}
+
// Builds an output image name of the form
// "<imagebasename><4-digit page number><name>.tif" into image_name.
void EquationDetect::GetOutputTiffName(const char* name,
                                       STRING* image_name) const {
  ASSERT_HOST(image_name && name);
  char page[50];
  snprintf(page, sizeof(page), "%04d", page_count_);
  *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
}
+
// Debug output: renders the special text type of every blob in every
// partition onto a 32-bit copy of the binary image and writes it to
// outfile as TIFF.
void EquationDetect::PaintSpecialTexts(const STRING& outfile) const {
  Pix *pix = nullptr, *pixBi = lang_tesseract_->pix_binary();
  pix = pixConvertTo32(pixBi);
  ColPartitionGridSearch gsearch(part_grid_);
  ColPartition* part = nullptr;
  gsearch.StartFullSearch();
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    BLOBNBOX_C_IT blob_it(part->boxes());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      RenderSpecialText(pix, blob_it.data());
    }
  }

  pixWrite(outfile.c_str(), pix, IFF_TIFF_LZW);
  pixDestroy(&pix);
}
+
// Debug output: draws each partition's bounding box onto a 32-bit copy of
// the best image (red = equation, green = inline equation, blue = other)
// and writes the result to outfile as TIFF.
void EquationDetect::PaintColParts(const STRING& outfile) const {
  Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
  ColPartitionGridSearch gsearch(part_grid_);
  gsearch.StartFullSearch();
  ColPartition* part = nullptr;
  while ((part = gsearch.NextFullSearch()) != nullptr) {
    const TBOX& tbox = part->bounding_box();
    // Flip y for leptonica's top-left origin.
    Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(),
                         tbox.width(), tbox.height());
    if (part->type() == PT_EQUATION) {
      pixRenderBoxArb(pix, box, 5, 255, 0, 0);
    } else if (part->type() == PT_INLINE_EQUATION) {
      pixRenderBoxArb(pix, box, 5, 0, 255, 0);
    } else {
      pixRenderBoxArb(pix, box, 5, 0, 0, 255);
    }
    boxDestroy(&box);
  }

  pixWrite(outfile.c_str(), pix, IFF_TIFF_LZW);
  pixDestroy(&pix);
}
+
+void EquationDetect::PrintSpecialBlobsDensity(const ColPartition* part) const {
+ ASSERT_HOST(part);
+ TBOX box(part->bounding_box());
+ int h = pixGetHeight(lang_tesseract_->BestPix());
+ tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ",
+ h - box.top(), h - box.bottom());
+ box.print();
+ tprintf("blobs count = %d, density = ", part->boxes_count());
+ for (int i = 0; i < BSTT_COUNT; ++i) {
+ auto type = static_cast<BlobSpecialTextType>(i);
+ tprintf("%d:%f ", i, part->SpecialBlobsDensity(type));
+ }
+ tprintf("\n");
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/equationdetect.h b/tesseract/src/ccmain/equationdetect.h
new file mode 100644
index 00000000..ffa418fe
--- /dev/null
+++ b/tesseract/src/ccmain/equationdetect.h
@@ -0,0 +1,273 @@
+///////////////////////////////////////////////////////////////////////
+// File: equationdetect.h
+// Description: The equation detection class that inherits equationdetectbase.
+// Author: Zongyi (Joe) Liu (joeliu@google.com)
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_
+#define TESSERACT_CCMAIN_EQUATIONDETECT_H_
+
+#include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText...
+#include "equationdetectbase.h" // for EquationDetectBase
+#include "genericvector.h" // for GenericVector
+#include "tesseractclass.h" // for Tesseract
+#include <tesseract/unichar.h> // for UNICHAR_ID
+
+class TBOX;
+class UNICHARSET;
+
+namespace tesseract {
+
+class Tesseract;
+class ColPartition;
+class ColPartitionGrid;
+class ColPartitionSet;
+
+class TESS_API EquationDetect : public EquationDetectBase {
+ public:
+  EquationDetect(const char* equ_datapath,
+                 const char* equ_language);
+  ~EquationDetect() override;
+
+  // Classification of a partition's horizontal indentation relative to the
+  // surrounding text column.
+  enum IndentType {
+    NO_INDENT,
+    LEFT_INDENT,
+    RIGHT_INDENT,
+    BOTH_INDENT,
+    INDENT_TYPE_COUNT
+  };
+
+  // Reset the lang_tesseract_ pointer. This function should be called before we
+  // do any detector work.
+  void SetLangTesseract(Tesseract* lang_tesseract);
+
+  // Iterate over the blobs inside to_block, and set the blobs that we want to
+  // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function
+  // returns 0 upon success.
+  int LabelSpecialText(TO_BLOCK* to_block) override;
+
+  // Find possible equation partitions from part_grid. Should be called
+  // after the special_text_type of blobs are set.
+  // It returns 0 upon success.
+  int FindEquationParts(ColPartitionGrid* part_grid,
+                        ColPartitionSet** best_columns) override;
+
+  // Reset the resolution of the processing image. TEST only function.
+  void SetResolution(const int resolution);
+
+ protected:
+  // Identify the special text type for one blob, and update its field. When
+  // height_th is set (> 0), we will label the blob as BSTT_NONE if its height
+  // is less than height_th.
+  void IdentifySpecialText(BLOBNBOX *blob, const int height_th);
+
+  // Estimate the type for one unichar.
+  BlobSpecialTextType EstimateTypeForUnichar(
+      const UNICHARSET& unicharset, const UNICHAR_ID id) const;
+
+  // Compute special text type for each blobs in part_grid_.
+  void IdentifySpecialText();
+
+  // Identify blobs that we want to skip during special blob type
+  // classification.
+  void IdentifyBlobsToSkip(ColPartition* part);
+
+  // The ColPartitions in part_grid_ may be over-segmented, particularly in the
+  // block equation regions. So we like to identify these partitions and merge
+  // them before we do the searching.
+  void MergePartsByLocation();
+
+  // Starting from the seed center, we do radius search. And for partitions that
+  // have large overlaps with seed, we remove them from part_grid_ and add into
+  // parts_overlap. Note: this function may update the part_grid_, so if the
+  // caller is also running ColPartitionGridSearch, use the RepositionIterator
+  // to continue.
+  void SearchByOverlap(ColPartition* seed,
+                       GenericVector<ColPartition*>* parts_overlap);
+
+  // Insert part back into part_grid_, after it absorbs some other parts.
+  void InsertPartAfterAbsorb(ColPartition* part);
+
+  // Identify the colpartitions in part_grid_, label them as PT_EQUATION, and
+  // save them into cp_seeds_.
+  void IdentifySeedParts();
+
+  // Check the blobs count for a seed region candidate.
+  bool CheckSeedBlobsCount(ColPartition* part);
+
+  // Compute the foreground pixel density for a tbox area.
+  float ComputeForegroundDensity(const TBOX& tbox);
+
+  // Check if part qualifies for the seed2 label: low math density and left
+  // indented. We are using two checks:
+  // 1. If its left is aligned with any coordinates in indented_texts_left,
+  // which we assume have been sorted.
+  // 2. If its foreground density is over foreground_density_th.
+  bool CheckForSeed2(
+      const GenericVector<int>& indented_texts_left,
+      const float foreground_density_th,
+      ColPartition* part);
+
+  // Count the number of values in sorted_vec that is close to val, used to
+  // check if a partition is aligned with text partitions.
+  int CountAlignment(
+      const GenericVector<int>& sorted_vec, const int val) const;
+
+  // Check for a seed candidate using the foreground pixel density. And we
+  // return true if the density is below a certain threshold, because characters
+  // in equation regions usually are apart with more white spaces.
+  bool CheckSeedFgDensity(const float density_th, ColPartition* part);
+
+  // A light version of SplitCPHor: instead of really doing the part split, we
+  // simply compute the union bounding box of each split part.
+  void SplitCPHorLite(ColPartition* part, GenericVector<TBOX>* splitted_boxes);
+
+  // Split the part (horizontally), and save the split result into
+  // parts_splitted. Note that it is caller's responsibility to release the
+  // memory owned by parts_splitted. On the other hand, the part is unchanged
+  // during this process and still owns the blobs, so do NOT call DeleteBoxes
+  // when freeing the colpartitions in parts_splitted.
+  void SplitCPHor(ColPartition* part,
+                  GenericVector<ColPartition*>* parts_splitted);
+
+  // Check the density for a seed candidate (part) using its math density and
+  // italic density, returns true if the check passed.
+  bool CheckSeedDensity(const float math_density_high,
+                        const float math_density_low,
+                        const ColPartition* part) const;
+
+  // Check if part is indented.
+  IndentType IsIndented(ColPartition* part);
+
+  // Identify inline partitions from cp_seeds_, and re-label them.
+  void IdentifyInlineParts();
+
+  // Compute the super bounding box for all colpartitions inside part_grid_.
+  void ComputeCPsSuperBBox();
+
+  // Identify inline partitions from cp_seeds_ using the horizontal search.
+  void IdentifyInlinePartsHorizontal();
+
+  // Estimate the line spacing between two text partitions. Returns -1 if not
+  // enough data.
+  int EstimateTextPartLineSpacing();
+
+  // Identify inline partitions from cp_seeds_ using vertical search.
+  void IdentifyInlinePartsVertical(const bool top_to_bottom,
+                                   const int textPartsLineSpacing);
+
+  // Check if part is an inline equation zone. This should be called after we
+  // identified the seed regions.
+  bool IsInline(const bool search_bottom,
+                const int textPartsLineSpacing,
+                ColPartition* part);
+
+  // For a given seed partition, we search the part_grid_ and see if there is
+  // any partition can be merged with it. It returns true if the seed has been
+  // expanded.
+  bool ExpandSeed(ColPartition* seed);
+
+  // Starting from the seed position, we search the part_grid_
+  // horizontally/vertically, find all partitions that can be
+  // merged with seed, remove them from part_grid_, and put them into
+  // parts_to_merge.
+  void ExpandSeedHorizontal(const bool search_left,
+                            ColPartition* seed,
+                            GenericVector<ColPartition*>* parts_to_merge);
+  void ExpandSeedVertical(const bool search_bottom,
+                          ColPartition* seed,
+                          GenericVector<ColPartition*>* parts_to_merge);
+
+  // Check if a part_box is the small neighbor of seed_box.
+  bool IsNearSmallNeighbor(const TBOX& seed_box,
+                           const TBOX& part_box) const;
+
+  // Perform the density check for part, which we assume is nearing a seed
+  // partition. It returns true if the check passed.
+  bool CheckSeedNeighborDensity(const ColPartition* part) const;
+
+  // After identifying the math blocks, we do one more scanning on all text
+  // partitions, and check if any of them is the satellite of:
+  // math blocks: here a p is the satellite of q if:
+  // 1. q is the nearest vertical neighbor of p, and
+  // 2. y_gap(p, q) is less than a threshold, and
+  // 3. x_overlap(p, q) is over a threshold.
+  // Note that p can be the satellites of two blocks: its top neighbor and
+  // bottom neighbor.
+  void ProcessMathBlockSatelliteParts();
+
+  // Check if part is the satellite of one/two math blocks. If it is, we return
+  // true, and save the blocks into math_blocks.
+  bool IsMathBlockSatellite(
+      ColPartition* part, GenericVector<ColPartition*>* math_blocks);
+
+  // Search the nearest neighbor of part in one vertical direction as defined in
+  // search_bottom. It returns the neighbor found that major x overlap with it,
+  // or nullptr when not found.
+  ColPartition* SearchNNVertical(const bool search_bottom,
+                                 const ColPartition* part);
+
+  // Check if the neighbor with vertical distance of y_gap is a near and math
+  // block partition.
+  bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
+
+  // Generate the tiff file name for output/debug file.
+  void GetOutputTiffName(const char* name, STRING* image_name) const;
+
+  // Debugger function that renders ColPartitions on the input image, where:
+  // parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
+  // will be painted in green, and other parts will be painted in blue.
+  void PaintColParts(const STRING& outfile) const;
+
+  // Debugger function that renders the blobs in part_grid_ over the input
+  // image.
+  void PaintSpecialTexts(const STRING& outfile) const;
+
+  // Debugger function that print the math blobs density values for a
+  // ColPartition object.
+  void PrintSpecialBlobsDensity(const ColPartition* part) const;
+
+  // The tesseract engine initialized from equation training data.
+  Tesseract equ_tesseract_;
+
+  // The tesseract engine used for OCR. This pointer is passed in by the caller,
+  // so do NOT destroy it in this class.
+  Tesseract* lang_tesseract_;
+
+  // The ColPartitionGrid that we are processing. This pointer is passed in from
+  // the caller, so do NOT destroy it in the class.
+  ColPartitionGrid* part_grid_ = nullptr;
+
+  // A simple array of pointers to the best assigned column division at
+  // each grid y coordinate. This pointer is passed in from the caller, so do
+  // NOT destroy it in the class.
+  ColPartitionSet** best_columns_ = nullptr;
+
+  // The super bounding box of all cps in the part_grid_.
+  // NOTE(review): no in-class initializer, unlike part_grid_/best_columns_ —
+  // presumably set in the constructor; confirm before relying on it.
+  TBOX* cps_super_bbox_;
+
+  // The seed ColPartition for equation region.
+  GenericVector<ColPartition*> cp_seeds_;
+
+  // The resolution (dpi) of the processing image.
+  // NOTE(review): not initialized in-class — presumably set in the
+  // constructor or via SetResolution(); confirm.
+  int resolution_;
+
+  // The number of pages we have processed.
+  // NOTE(review): not initialized in-class — presumably set in the
+  // constructor; confirm.
+  int page_count_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H_
diff --git a/tesseract/src/ccmain/fixspace.cpp b/tesseract/src/ccmain/fixspace.cpp
new file mode 100644
index 00000000..c15e99d3
--- /dev/null
+++ b/tesseract/src/ccmain/fixspace.cpp
@@ -0,0 +1,885 @@
+/******************************************************************
+ * File: fixspace.cpp (Formerly fixspace.c)
+ * Description: Implements a pass over the page res, exploring the alternative
+ * spacing possibilities, trying to use context to improve the
+ * word spacing
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "fixspace.h"
+
+#include "blobs.h" // for TWERD, TBLOB, TESSLINE
+#include "boxword.h" // for BoxWord
+#include "errcode.h" // for ASSERT_HOST
+#include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
+#include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
+#include "params.h" // for IntParam, StringParam, BoolParam, Doub...
+#include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
+#include "rect.h" // for TBOX
+#include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
+#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
+#include "tessvars.h" // for debug_fp
+#include "tprintf.h" // for tprintf
+#include "unicharset.h" // for UNICHARSET
+#include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
+
+#include <tesseract/ocrclass.h> // for ETEXT_DESC
+#include "strngs.h" // for STRING
+#include <tesseract/unichar.h> // for UNICHAR_ID
+
+#include <cstdint> // for INT16_MAX, int16_t, int32_t
+
+namespace tesseract {
+
+class BLOCK;
+class ROW;
+
+#define PERFECT_WERDS 999
+
+/**********************************************************************
+ * c_blob_comparator()
+ *
+ * Blob comparator used to sort a blob list so that blobs are in increasing
+ * order of left edge.
+ **********************************************************************/
+
+static int c_blob_comparator( // sort blobs
+    const void *blob1p,       // ptr to ptr to blob1
+    const void *blob2p        // ptr to ptr to blob2
+    ) {
+  // Order C_BLOBs by ascending left edge of their bounding boxes.
+  const C_BLOB *lhs = *reinterpret_cast<const C_BLOB* const*>(blob1p);
+  const C_BLOB *rhs = *reinterpret_cast<const C_BLOB* const*>(blob2p);
+  return lhs->bounding_box().left() - rhs->bounding_box().left();
+}
+
+/**
+ * @name fix_fuzzy_spaces()
+ * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
+ * them as a sublist, process the sublist to find the optimal arrangement of
+ * spaces then replace the sublist in the ROW_RES.
+ *
+ * @param monitor progress monitor
+ * @param word_count count of words in doc
+ * @param[out] page_res
+ */
+void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
+                                 int32_t word_count,
+                                 PAGE_RES *page_res) {
+  BLOCK_RES_IT block_res_it;
+  ROW_RES_IT row_res_it;
+  WERD_RES_IT word_res_it_from;
+  WERD_RES_IT word_res_it_to;
+  WERD_RES *word_res;
+  WERD_RES_LIST fuzzy_space_words;
+  int16_t new_length;
+  bool prevent_null_wd_fixsp;  // DON'T process blobless wds
+  int32_t word_index;          // current word
+
+  block_res_it.set_to_list(&page_res->block_res_list);
+  word_index = 0;
+  // Walk every row of every block on the page.
+  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
+       block_res_it.forward()) {
+    row_res_it.set_to_list(&block_res_it.data()->row_res_list);
+    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
+         row_res_it.forward()) {
+      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
+      while (!word_res_it_from.at_last()) {
+        word_res = word_res_it_from.data();
+        // Skip forward over words NOT followed by a fuzzy space/non-space,
+        // applying the noise-blob split fix to each word as we pass it.
+        while (!word_res_it_from.at_last() &&
+               !(word_res->combination ||
+                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
+                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
+          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
+                         block_res_it.data()->block);
+          word_res = word_res_it_from.forward();
+          word_index++;
+          // Keep the progress monitor alive and honor deadline/cancellation.
+          if (monitor != nullptr) {
+            monitor->ocr_alive = true;
+            monitor->progress = 90 + 5 * word_index / word_count;
+            if (monitor->deadline_exceeded() ||
+                (monitor->cancel != nullptr &&
+                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
+              return;
+          }
+        }
+
+        if (!word_res_it_from.at_last()) {
+          // word_res_it_from now sits on the first word of a fuzzy-joined
+          // sequence; advance word_res_it_to to cover the whole sequence.
+          word_res_it_to = word_res_it_from;
+          prevent_null_wd_fixsp =
+              word_res->word->cblob_list()->empty();
+          if (check_debug_pt(word_res, 60))
+            debug_fix_space_level.set_value(10);
+          word_res_it_to.forward();
+          word_index++;
+          if (monitor != nullptr) {
+            monitor->ocr_alive = true;
+            monitor->progress = 90 + 5 * word_index / word_count;
+            if (monitor->deadline_exceeded() ||
+                (monitor->cancel != nullptr &&
+                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
+              return;
+          }
+          while (!word_res_it_to.at_last () &&
+                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
+                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
+            if (check_debug_pt(word_res, 60))
+              debug_fix_space_level.set_value(10);
+            // A blobless word anywhere in the sequence disables respacing.
+            if (word_res->word->cblob_list()->empty())
+              prevent_null_wd_fixsp = true;
+            word_res = word_res_it_to.forward();
+          }
+          if (check_debug_pt(word_res, 60))
+            debug_fix_space_level.set_value(10);
+          if (word_res->word->cblob_list()->empty())
+            prevent_null_wd_fixsp = true;
+          if (prevent_null_wd_fixsp) {
+            word_res_it_from = word_res_it_to;
+          } else {
+            // Extract the fuzzy sequence into a sublist, find its best
+            // spacing, then splice the (possibly re-arranged) words back in
+            // and step the iterator past them.
+            fuzzy_space_words.assign_to_sublist(&word_res_it_from,
+                                                &word_res_it_to);
+            fix_fuzzy_space_list(fuzzy_space_words,
+                                 row_res_it.data()->row,
+                                 block_res_it.data()->block);
+            new_length = fuzzy_space_words.length();
+            word_res_it_from.add_list_before(&fuzzy_space_words);
+            for (;
+                 !word_res_it_from.at_last() && new_length > 0;
+                 new_length--) {
+              word_res_it_from.forward();
+            }
+          }
+          if (test_pt)
+            debug_fix_space_level.set_value(0);
+        }
+        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
+                       block_res_it.data()->block);
+        // Last word in row
+      }
+    }
+  }
+}
+
+void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
+                                     ROW *row,
+                                     BLOCK* block) {
+  // Hill-climb over the space permutations of best_perm: repeatedly
+  // re-classify the current permutation, score its word spacing, keep the
+  // best-scoring arrangement seen so far, and advance to the next
+  // permutation until the score is perfect or no permutations remain.
+  WERD_RES_LIST current_perm;
+  bool improved = false;
+
+  int16_t best_score = eval_word_spacing(best_perm);  // default score
+  dump_words(best_perm, best_score, 1, improved);
+
+  if (best_score != PERFECT_WERDS)
+    initialise_search(best_perm, current_perm);
+
+  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
+    match_current_words(current_perm, row, block);
+    const int16_t trial_score = eval_word_spacing(current_perm);
+    dump_words(current_perm, trial_score, 2, improved);
+    if (trial_score > best_score) {
+      best_score = trial_score;
+      improved = true;
+      best_perm.clear();
+      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
+    }
+    if (trial_score < PERFECT_WERDS)
+      transform_to_next_perm(current_perm);
+  }
+  dump_words(best_perm, best_score, 3, improved);
+}
+
+void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
+  // Seed new_list with deep copies of every non-combination word in
+  // src_list; the copies are plain words (combination and part_of_combo
+  // both cleared).
+  WERD_RES_IT src_it(&src_list);
+  WERD_RES_IT new_it(&new_list);
+
+  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
+    WERD_RES *src_wd = src_it.data();
+    if (src_wd->combination)
+      continue;  // combinations are not copied
+    WERD_RES *copied_wd = WERD_RES::deep_copy(src_wd);
+    copied_wd->combination = false;
+    copied_wd->part_of_combo = false;
+    new_it.add_after_then_move(copied_wd);
+  }
+}
+
+void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
+                                    BLOCK* block) {
+  // Run pass-2 classification on every word in the list that still needs
+  // it: members of a combination are skipped, and a non-null box_word marks
+  // a word as already classified.
+  WERD_RES_IT word_it(&words);
+  WERD_RES *word;
+  // Since we are not using PAGE_RES to iterate over words, we need to update
+  // prev_word_best_choice_ before calling classify_word_pass2().
+  prev_word_best_choice_ = nullptr;
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    word = word_it.data();
+    if ((!word->part_of_combo) && (word->box_word == nullptr)) {
+      WordData word_data(block, row, word);
+      SetupWordPassN(2, &word_data);
+      classify_word_and_language(2, nullptr, &word_data);
+    }
+    prev_word_best_choice_ = word->best_choice;
+  }
+}
+
+/**
+ * @name eval_word_spacing()
+ * The basic measure is the number of characters in contextually confirmed
+ * words. (I.e the word is done)
+ * If all words are contextually confirmed the evaluation is deemed perfect.
+ *
+ * Some fiddles are done to handle "1"s as these are VERY frequent causes of
+ * fuzzy spaces. The problem with the basic measure is that "561 63" would score
+ * the same as "56163", though given our knowledge that the space is fuzzy, and
+ * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
+ * is preferred.
+ *
+ * The solution is to NOT COUNT the score of any word which has a digit at one
+ * end and a "1Il" as the character the other side of the space.
+ *
+ * Conversely, any character next to a "1" within a word is counted as a positive
+ * score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of
+ * the "1" joined). "56163" would score 7 - all chars in a numeric word + 2
+ * sides of a "1" joined.
+ *
+ * The joined 1 rule is applied to any word REGARDLESS of contextual
+ * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually
+ * confirmed). The only score is from the joined 1. "PS7a713/7a" scores 2.
+ *
+ */
+int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
+  WERD_RES_IT word_res_it(&word_res_list);
+  int16_t total_score = 0;
+  int16_t word_count = 0;
+  int16_t done_word_count = 0;
+  int16_t word_len;
+  int16_t i;
+  int16_t offset;
+  WERD_RES *word;                 // current word
+  int16_t prev_word_score = 0;
+  bool prev_word_done = false;
+  bool prev_char_1 = false;       // prev ch a "1/I/l"?
+  bool prev_char_digit = false;   // prev ch 2..9 or 0
+  bool current_char_1 = false;
+  bool current_word_ok_so_far;
+  STRING punct_chars = "!\"`',.:;";
+  bool prev_char_punct = false;
+  bool current_char_punct = false;
+  bool word_done = false;
+
+  // The list is circular: iterate until we wrap back to the first word,
+  // skipping members of combinations. A word's score is only banked when
+  // the NEXT word is examined (or at the end), so the digit/"1" adjacency
+  // rules below can veto it.
+  do {
+    word = word_res_it.data();
+    word_done = fixspace_thinks_word_done(word);
+    word_count++;
+    if (word->tess_failed) {
+      // A failed word breaks the chain: bank the previous word's score and
+      // reset the digit/1 context.
+      total_score += prev_word_score;
+      if (prev_word_done)
+        done_word_count++;
+      prev_word_score = 0;
+      prev_char_1 = false;
+      prev_char_digit = false;
+      prev_word_done = false;
+    } else {
+      /*
+        Can we add the prev word score and potentially count this word?
+        Yes IF it didn't end in a 1 when the first char of this word is a digit
+        AND it didn't end in a digit when the first char of this word is a 1
+      */
+      word_len = word->reject_map.length();
+      current_word_ok_so_far = false;
+      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
+            (prev_char_digit && (
+                (word_done &&
+                 word->best_choice->unichar_lengths().c_str()[0] == 1 &&
+                 word->best_choice->unichar_string()[0] == '1') ||
+                (!word_done && STRING(conflict_set_I_l_1).contains(
+                    word->best_choice->unichar_string()[0])))))) {
+        total_score += prev_word_score;
+        if (prev_word_done)
+          done_word_count++;
+        current_word_ok_so_far = word_done;
+      }
+
+      if (current_word_ok_so_far) {
+        prev_word_done = true;
+        prev_word_score = word_len;
+      } else {
+        prev_word_done = false;
+        prev_word_score = 0;
+      }
+
+      /* Add 1 to total score for every joined 1 regardless of context and
+         rejection */
+      for (i = 0, prev_char_1 = false; i < word_len; i++) {
+        current_char_1 = word->best_choice->unichar_string()[i] == '1';
+        if (prev_char_1 || (current_char_1 && (i > 0)))
+          total_score++;
+        prev_char_1 = current_char_1;
+      }
+
+      /* Add 1 to total score for every joined punctuation regardless of
+         context and rejection */
+      if (tessedit_prefer_joined_punct) {
+        for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
+             offset += word->best_choice->unichar_lengths()[i++]) {
+          current_char_punct =
+              punct_chars.contains(word->best_choice->unichar_string()[offset]);
+          if (prev_char_punct || (current_char_punct && i > 0))
+            total_score++;
+          prev_char_punct = current_char_punct;
+        }
+      }
+      // Remember whether this word ENDS in a digit / "1"-like char, for the
+      // adjacency veto applied to the next word.
+      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
+      // Empty-bodied loop: advance offset to the byte offset of the last
+      // unichar in the (UTF-8) string.
+      for (i = 0, offset = 0; i < word_len - 1;
+           offset += word->best_choice->unichar_lengths()[i++]);
+      prev_char_1 =
+          ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
+           || (!word_done && STRING(conflict_set_I_l_1).contains(
+                   word->best_choice->unichar_string()[offset])));
+    }
+    /* Find next word */
+    do {
+      word_res_it.forward();
+    } while (word_res_it.data()->part_of_combo);
+  } while (!word_res_it.at_first());
+  // Bank the final word's score.
+  total_score += prev_word_score;
+  if (prev_word_done)
+    done_word_count++;
+  if (done_word_count == word_count)
+    return PERFECT_WERDS;
+  else
+    return total_score;
+}
+
+bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
+  // Returns true if the unichar at char_position is a digit, or — for words
+  // the permuter classified as numeric (NUMBER_PERM) — one of the
+  // numeric_punctuation characters.
+  int i;
+  int offset;
+
+  // Empty-bodied loop: advance offset to the byte offset (in the UTF-8
+  // string) of unichar number char_position.
+  for (i = 0, offset = 0; i < char_position;
+       offset += word->best_choice->unichar_lengths()[i++]);
+  return (
+      word->uch_set->get_isdigit(
+          word->best_choice->unichar_string().c_str() + offset,
+          word->best_choice->unichar_lengths()[i]) ||
+      (word->best_choice->permuter() == NUMBER_PERM &&
+       STRING(numeric_punctuation).contains(
+           word->best_choice->unichar_string().c_str()[offset])));
+}
+
+/**
+ * @name transform_to_next_perm()
+ * Examines the current word list to find the smallest word gap size. Then walks
+ * the word list closing any gaps of this size by either inserted new
+ * combination words, or extending existing ones.
+ *
+ * The routine COULD be limited to stop it building words longer than N blobs.
+ *
+ * If there are no more gaps then it DELETES the entire list and returns the
+ * empty list to cause termination.
+ */
+void transform_to_next_perm(WERD_RES_LIST &words) {
+ WERD_RES_IT word_it(&words);
+ WERD_RES_IT prev_word_it(&words);
+ WERD_RES *word;
+ WERD_RES *prev_word;
+ WERD_RES *combo;
+ WERD *copy_word;
+ int16_t prev_right = -INT16_MAX;
+ TBOX box;
+ int16_t gap;
+ int16_t min_gap = INT16_MAX;
+
+ for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+ word = word_it.data();
+ if (!word->part_of_combo) {
+ box = word->word->bounding_box();
+ if (prev_right > -INT16_MAX) {
+ gap = box.left() - prev_right;
+ if (gap < min_gap)
+ min_gap = gap;
+ }
+ prev_right = box.right();
+ }
+ }
+ if (min_gap < INT16_MAX) {
+ prev_right = -INT16_MAX; // back to start
+ word_it.set_to_list(&words);
+ // Note: we can't use cycle_pt due to inserted combos at start of list.
+ for (; (prev_right == -INT16_MAX) || !word_it.at_first();
+ word_it.forward()) {
+ word = word_it.data();
+ if (!word->part_of_combo) {
+ box = word->word->bounding_box();
+ if (prev_right > -INT16_MAX) {
+ gap = box.left() - prev_right;
+ if (gap <= min_gap) {
+ prev_word = prev_word_it.data();
+ if (prev_word->combination) {
+ combo = prev_word;
+ } else {
+ /* Make a new combination and insert before
+ * the first word being joined. */
+ copy_word = new WERD;
+ *copy_word = *(prev_word->word);
+ // deep copy
+ combo = new WERD_RES(copy_word);
+ combo->combination = true;
+ combo->x_height = prev_word->x_height;
+ prev_word->part_of_combo = true;
+ prev_word_it.add_before_then_move(combo);
+ }
+ combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
+ if (word->combination) {
+ combo->word->join_on(word->word);
+ // Move blobs to combo
+ // old combo no longer needed
+ delete word_it.extract();
+ } else {
+ // Copy current wd to combo
+ combo->copy_on(word);
+ word->part_of_combo = true;
+ }
+ combo->done = false;
+ combo->ClearResults();
+ } else {
+ prev_word_it = word_it; // catch up
+ }
+ }
+ prev_right = box.right();
+ }
+ }
+ } else {
+ words.clear(); // signal termination
+ }
+}
+
+void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score,
+                           int16_t mode, bool improved) {
+  // Debug dump of a word permutation, gated on debug_fix_space_level.
+  // mode 1 = extracted (also caches the original text in
+  // stats_.dump_words_str), 2 = tested, 3 = returned.
+  if (debug_fix_space_level <= 0) return;
+
+  // Print "text/permuter " for every top-level (non-combo-member) word,
+  // then the closing quote. Extracted from the two identical loops the
+  // original duplicated.
+  auto print_perm = [&perm]() {
+    WERD_RES_IT it(&perm);
+    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+      if (!it.data()->part_of_combo) {
+        tprintf("%s/%1d ",
+                it.data()->best_choice->unichar_string().c_str(),
+                static_cast<int>(it.data()->best_choice->permuter()));
+      }
+    }
+    tprintf("\"\n");
+  };
+
+  if (mode == 1) {
+    // Remember the pre-fix text so an improvement can be reported later.
+    stats_.dump_words_str = "";
+    WERD_RES_IT it(&perm);
+    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+      if (!it.data()->part_of_combo) {
+        stats_.dump_words_str += it.data()->best_choice->unichar_string();
+        stats_.dump_words_str += ' ';
+      }
+    }
+  }
+
+  if (debug_fix_space_level > 1) {
+    switch (mode) {
+      case 1:
+        tprintf("EXTRACTED (%d): \"", score);
+        break;
+      case 2:
+        tprintf("TESTED (%d): \"", score);
+        break;
+      case 3:
+        tprintf("RETURNED (%d): \"", score);
+        break;
+    }
+    print_perm();
+  } else if (improved) {
+    tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
+    print_perm();
+  }
+}
+
+bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
+  // A word is "done" for spacing purposes if Tesseract already accepted it,
+  // or the fixsp_done_mode heuristics below pass: the classification was
+  // accepted (or rejections are permitted by the mode), the text contains
+  // no space, and the permuter is one of the dictionary/number permuters.
+  if (word->done)
+    return true;
+
+  /*
+    Use all the standard pass 2 conditions for mode 5 in set_done() in
+    reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
+    CARE WHETHER WE HAVE of/at on/an etc.
+  */
+  // Idiom fix: return the condition directly instead of if/else true/false.
+  return fixsp_done_mode > 0 &&
+         (word->tess_accepted ||
+          (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
+          fixsp_done_mode == 3) &&
+         strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr &&
+         (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+          word->best_choice->permuter() == FREQ_DAWG_PERM ||
+          word->best_choice->permuter() == USER_DAWG_PERM ||
+          word->best_choice->permuter() == NUMBER_PERM);
+}
+
+
+/**
+ * @name fix_sp_fp_word()
+ * Test the current word to see if it can be split by deleting noise blobs. If
+ * so, do the business.
+ * Return with the iterator pointing to the same place if the word is unchanged,
+ * or the last of the replacement words.
+ */
+void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
+                               BLOCK* block) {
+  WERD_RES *word_res;
+  WERD_RES_LIST sub_word_list;
+  WERD_RES_IT sub_word_list_it(&sub_word_list);
+  int16_t blob_index;
+  int16_t new_length;
+  float junk;  // noise score of the worst blob — unused here
+
+  word_res = word_res_it.data();
+  // Only plain W_DONT_CHOP words are candidates; combinations, combo
+  // members and repeated-character words are left alone.
+  if (word_res->word->flag(W_REP_CHAR) ||
+      word_res->combination ||
+      word_res->part_of_combo ||
+      !word_res->word->flag(W_DONT_CHOP))
+    return;
+
+  // Nothing to do if no blob in the word looks like noise.
+  blob_index = worst_noise_blob(word_res, &junk);
+  if (blob_index < 0)
+    return;
+
+  if (debug_fix_space_level > 1) {
+    tprintf("FP fixspace working on \"%s\"\n",
+            word_res->best_choice->unichar_string().c_str());
+  }
+  // Keep rejected blobs ordered by left edge so they can later be
+  // redistributed between the split words.
+  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
+  // Move the word into a private sublist, split it there, then splice the
+  // replacement word(s) back in, leaving the iterator on the last of them.
+  sub_word_list_it.add_after_stay_put(word_res_it.extract());
+  fix_noisy_space_list(sub_word_list, row, block);
+  new_length = sub_word_list.length();
+  word_res_it.add_list_before(&sub_word_list);
+  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
+    word_res_it.forward();
+  }
+}
+
+void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
+                                     BLOCK* block) {
+  // Hill-climb like fix_fuzzy_space_list(), except permutations are
+  // generated by repeatedly deleting the noisiest blob and splitting the
+  // word there (break_noisiest_blob_word), scored with
+  // fp_eval_word_spacing().
+  int16_t best_score;
+  WERD_RES_IT best_perm_it(&best_perm);
+  WERD_RES_LIST current_perm;
+  WERD_RES_IT current_perm_it(&current_perm);
+  WERD_RES *old_word_res;
+  int16_t current_score;
+  bool improved = false;
+
+  best_score = fp_eval_word_spacing(best_perm);  // default score
+
+  dump_words(best_perm, best_score, 1, improved);
+
+  old_word_res = best_perm_it.data();
+  // Even deep_copy doesn't copy the underlying WERD unless its combination
+  // flag is true!
+  old_word_res->combination = true;   // Kludge to force deep copy
+  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
+  old_word_res->combination = false;  // Undo kludge
+
+  break_noisiest_blob_word(current_perm);
+
+  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
+    match_current_words(current_perm, row, block);
+    current_score = fp_eval_word_spacing(current_perm);
+    dump_words(current_perm, current_score, 2, improved);
+    if (current_score > best_score) {
+      best_perm.clear();
+      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
+      best_score = current_score;
+      improved = true;
+    }
+    if (current_score < PERFECT_WERDS) {
+      break_noisiest_blob_word(current_perm);
+    }
+  }
+  dump_words(best_perm, best_score, 3, improved);
+}
+
+
+/**
+ * break_noisiest_blob_word()
+ * Find the word with the blob which looks like the worst noise.
+ * Break the word into two, deleting the noise blob.
+ */
+void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
+  WERD_RES_IT word_it(&words);
+  WERD_RES_IT worst_word_it;
+  float worst_noise_score = 9999;
+  int worst_blob_index = -1;  // Noisiest blob of noisiest wd
+  int blob_index;             // of wds noisiest blob
+  float noise_score;          // of wds noisiest blob
+  WERD_RES *word_res;
+  C_BLOB_IT blob_it;
+  C_BLOB_IT rej_cblob_it;
+  C_BLOB_LIST new_blob_list;
+  C_BLOB_IT new_blob_it;
+  C_BLOB_IT new_rej_cblob_it;
+  WERD *new_word;
+  int16_t start_of_noise_blob;
+  int16_t i;
+
+  // Find the word containing the globally noisiest blob.
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    blob_index = worst_noise_blob(word_it.data(), &noise_score);
+    if (blob_index > -1 && worst_noise_score > noise_score) {
+      worst_noise_score = noise_score;
+      worst_blob_index = blob_index;
+      worst_word_it = word_it;
+    }
+  }
+  if (worst_blob_index < 0) {
+    words.clear();  // signal termination
+    return;
+  }
+
+  /* Now split the worst_word_it */
+
+  word_res = worst_word_it.data();
+
+  /* Move blobs before noise blob to a new bloblist */
+
+  new_blob_it.set_to_list(&new_blob_list);
+  blob_it.set_to_list(word_res->word->cblob_list());
+  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
+    new_blob_it.add_after_then_move(blob_it.extract());
+  }
+  start_of_noise_blob = blob_it.data()->bounding_box().left();
+  delete blob_it.extract();  // throw out noise blob
+
+  // The blobs left of the noise become a new word; the original word keeps
+  // the blobs to the right of the deleted noise blob.
+  new_word = new WERD(&new_blob_list, word_res->word);
+  new_word->set_flag(W_EOL, false);
+  word_res->word->set_flag(W_BOL, false);
+  word_res->word->set_blanks(1);  // After break
+
+  // Redistribute rejected blobs: those left of the deleted noise blob move
+  // to the new (left) word. Relies on rej_cblob_list being left-sorted.
+  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
+  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
+  for (;
+       (!rej_cblob_it.empty() &&
+        (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
+       rej_cblob_it.forward()) {
+    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
+  }
+
+  auto* new_word_res = new WERD_RES(new_word);
+  new_word_res->combination = true;
+  worst_word_it.add_before_then_move(new_word_res);
+
+  // The shortened right-hand word must be re-classified.
+  word_res->ClearResults();
+}
+
+int16_t Tesseract::worst_noise_blob(WERD_RES *word_res,
+ float *worst_noise_score) {
+ float noise_score[512];
+ int i;
+ int min_noise_blob; // 1st contender
+ int max_noise_blob; // last contender
+ int non_noise_count;
+ int worst_noise_blob; // Worst blob
+ float small_limit = kBlnXHeight * fixsp_small_outlines_size;
+ float non_noise_limit = kBlnXHeight * 0.8;
+
+ if (word_res->rebuild_word == nullptr)
+ return -1; // Can't handle cube words.
+
+ // Normalised.
+ int blob_count = word_res->box_word->length();
+ ASSERT_HOST(blob_count <= 512);
+ if (blob_count < 5)
+ return -1; // too short to split
+
+ /* Get the noise scores for all blobs */
+
+ #ifndef SECURE_NAMES
+ if (debug_fix_space_level > 5)
+ tprintf("FP fixspace Noise metrics for \"%s\": ",
+ word_res->best_choice->unichar_string().c_str());
+ #endif
+
+ for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
+ TBLOB* blob = word_res->rebuild_word->blobs[i];
+ if (word_res->reject_map[i].accepted())
+ noise_score[i] = non_noise_limit;
+ else
+ noise_score[i] = blob_noise_score(blob);
+
+ if (debug_fix_space_level > 5)
+ tprintf("%1.1f ", noise_score[i]);
+ }
+ if (debug_fix_space_level > 5)
+ tprintf("\n");
+
+ /* Now find the worst one which is far enough away from the end of the word */
+
+ non_noise_count = 0;
+ for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
+ if (noise_score[i] >= non_noise_limit) {
+ non_noise_count++;
+ }
+ }
+ if (non_noise_count < fixsp_non_noise_limit)
+ return -1;
+
+ min_noise_blob = i;
+
+ non_noise_count = 0;
+ for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
+ i--) {
+ if (noise_score[i] >= non_noise_limit) {
+ non_noise_count++;
+ }
+ }
+ if (non_noise_count < fixsp_non_noise_limit)
+ return -1;
+
+ max_noise_blob = i;
+
+ if (min_noise_blob > max_noise_blob)
+ return -1;
+
+ *worst_noise_score = small_limit;
+ worst_noise_blob = -1;
+ for (i = min_noise_blob; i <= max_noise_blob; i++) {
+ if (noise_score[i] < *worst_noise_score) {
+ worst_noise_blob = i;
+ *worst_noise_score = noise_score[i];
+ }
+ }
+ return worst_noise_blob;
+}
+
+float Tesseract::blob_noise_score(TBLOB *blob) {
+ TBOX box; // BB of outline
+ int16_t outline_count = 0;
+ int16_t max_dimension;
+ int16_t largest_outline_dimension = 0;
+
+ for (TESSLINE* ol = blob->outlines; ol != nullptr; ol= ol->next) {
+ outline_count++;
+ box = ol->bounding_box();
+ if (box.height() > box.width()) {
+ max_dimension = box.height();
+ } else {
+ max_dimension = box.width();
+ }
+
+ if (largest_outline_dimension < max_dimension)
+ largest_outline_dimension = max_dimension;
+ }
+
+ if (outline_count > 5) {
+ // penalise LOTS of blobs
+ largest_outline_dimension *= 2;
+ }
+
+ box = blob->bounding_box();
+ if (box.bottom() > kBlnBaselineOffset * 4 ||
+ box.top() < kBlnBaselineOffset / 2) {
+ // Be lax if the blob is unusually high or low.
+ largest_outline_dimension /= 2;
+ }
+
+ return largest_outline_dimension;
+}
+
+void fixspace_dbg(WERD_RES *word) {
+ TBOX box = word->word->bounding_box();
+ const bool show_map_detail = false;
+ int16_t i;
+
+ box.print();
+ tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
+ tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
+ word->word->cblob_list()->length(),
+ word->rebuild_word->NumBlobs(),
+ word->box_word->length());
+ word->reject_map.print(debug_fp);
+ tprintf("\n");
+ if (show_map_detail) {
+ tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
+ for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
+ tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
+ word->reject_map[i].full_print(debug_fp);
+ }
+ }
+
+ tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
+ tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
+}
+
+
+/**
+ * fp_eval_word_spacing()
+ * Evaluation function for fixed pitch word lists.
+ *
+ * Basically, count the number of "nice" characters - those which are in tess
+ * acceptable words or in dict words and are not rejected.
+ * Penalise any potential noise chars
+ */
+int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
+ WERD_RES_IT word_it(&word_res_list);
+ WERD_RES *word;
+ int16_t score = 0;
+ int16_t i;
+ float small_limit = kBlnXHeight * fixsp_small_outlines_size;
+
+ for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+ word = word_it.data();
+ if (word->rebuild_word == nullptr)
+ continue; // Can't handle cube words.
+ if (word->done ||
+ word->tess_accepted ||
+ word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
+ word->best_choice->permuter() == FREQ_DAWG_PERM ||
+ word->best_choice->permuter() == USER_DAWG_PERM ||
+ safe_dict_word(word) > 0) {
+ int num_blobs = word->rebuild_word->NumBlobs();
+ UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
+ for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
+ TBLOB* blob = word->rebuild_word->blobs[i];
+ if (word->best_choice->unichar_id(i) == space ||
+ blob_noise_score(blob) < small_limit) {
+ score -= 1; // penalise possibly erroneous non-space
+ } else if (word->reject_map[i].accepted()) {
+ score++;
+ }
+ }
+ }
+ }
+ if (score < 0)
+ score = 0;
+ return score;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/fixspace.h b/tesseract/src/ccmain/fixspace.h
new file mode 100644
index 00000000..fd49bf29
--- /dev/null
+++ b/tesseract/src/ccmain/fixspace.h
@@ -0,0 +1,36 @@
+/******************************************************************
+ * File: fixspace.h (Formerly fixspace.h)
+ * Description: Implements a pass over the page res, exploring the alternative
+ * spacing possibilities, trying to use context to improve the
+ * word spacing
+ * Author: Phil Cheatle
+ * Created: Thu Oct 21 11:38:43 BST 1993
+ *
+ * (C) Copyright 1993, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef FIXSPACE_H
+#define FIXSPACE_H
+
+namespace tesseract {
+
+class WERD_RES;
+class WERD_RES_LIST;
+
+void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);
+void transform_to_next_perm(WERD_RES_LIST &words);
+void fixspace_dbg(WERD_RES *word);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/ccmain/fixxht.cpp b/tesseract/src/ccmain/fixxht.cpp
new file mode 100644
index 00000000..716ac385
--- /dev/null
+++ b/tesseract/src/ccmain/fixxht.cpp
@@ -0,0 +1,216 @@
+/**********************************************************************
+ * File: fixxht.cpp (Formerly fixxht.c)
+ * Description: Improve x_ht and look out for case inconsistencies
+ * Author: Phil Cheatle
+ * Created: Thu Aug 5 14:11:08 BST 1993
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "params.h"
+#include "float2int.h"
+#include "tesseractclass.h"
+
+#include <algorithm>
+#include <cstring>
+#include <cctype>
+
+namespace tesseract {
+
+// Fixxht overview.
+// Premise: Initial estimate of x-height is adequate most of the time, but
+// occasionally it is incorrect. Most notable causes of failure are:
+// 1. Small caps, where the top of the caps is the same as the body text
+// xheight. For small caps words the xheight needs to be reduced to correctly
+// recognize the caps in the small caps word.
+// 2. All xheight lines, such as summer. Here the initial estimate will have
+// guessed that the blob tops are caps and will have placed the xheight too low.
+// 3. Noise/logos beside words, or changes in font size on a line. Such
+// things can blow the statistics and cause an incorrect estimate.
+// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
+// In this case the x-height is often still correct.
+//
+// Algorithm.
+// Compare the vertical position (top only) of alphanumerics in a word with
+// the range of positions in training data (in the unicharset).
+// See CountMisfitTops. If any characters disagree sufficiently with the
+// initial xheight estimate, then recalculate the xheight, re-run OCR on
+// the word, and if the number of vertical misfits goes down, along with
+// either the word rating or certainty, then keep the new xheight.
+// The new xheight is calculated as follows (see ComputeCompatibleXheight):
+// For each alphanumeric character that has a vertically misplaced top
+// (a misfit), yet its bottom is within the acceptable range (ie it is not
+// likely a sub-or super-script) calculate the range of acceptable xheight
+// positions from its range of tops, and give each value in the range a
+// number of votes equal to the distance of its top from its acceptance range.
+// The x-height position with the median of the votes becomes the new
+// x-height. This assumes that most characters will be correctly recognized
+// even if the x-height is incorrect. This is not a terrible assumption, but
+// it is not great. An improvement would be to use a classifier that does
+// not care about vertical position or scaling at all.
+// Separately collect stats on shifted baselines and apply the same logic to
+// computing a best-fit shift to fix the error. If the baseline needs to be
+// shifted, but the x-height is OK, returns the original x-height along with
+// the baseline shift to indicate that recognition needs to re-run.
+
+// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
+// then the char top cannot be used to judge misfits or suggest a new top.
+const int kMaxCharTopRange = 48;
+
+// Returns the number of misfit blob tops in this word.
+int Tesseract::CountMisfitTops(WERD_RES *word_res) {
+ int bad_blobs = 0;
+ int num_blobs = word_res->rebuild_word->NumBlobs();
+ for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
+ TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
+ UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
+ if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
+ int top = blob->bounding_box().top();
+ if (top >= INT_FEAT_RANGE)
+ top = INT_FEAT_RANGE - 1;
+ int min_bottom, max_bottom, min_top, max_top;
+ unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
+ &min_top, &max_top);
+ if (max_top - min_top > kMaxCharTopRange)
+ continue;
+ bool bad = top < min_top - x_ht_acceptance_tolerance ||
+ top > max_top + x_ht_acceptance_tolerance;
+ if (bad)
+ ++bad_blobs;
+ if (debug_x_ht_level >= 1) {
+ tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
+ unicharset.id_to_unichar(class_id),
+ bad ? "Misfit" : "OK", top, min_top, max_top,
+ static_cast<int>(x_ht_acceptance_tolerance));
+ }
+ }
+ }
+ return bad_blobs;
+}
+
+// Returns a new x-height maximally compatible with the result in word_res.
+// See comment above for overall algorithm.
+float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
+ float* baseline_shift) {
+ STATS top_stats(0, UINT8_MAX);
+ STATS shift_stats(-UINT8_MAX, UINT8_MAX);
+ int bottom_shift = 0;
+ int num_blobs = word_res->rebuild_word->NumBlobs();
+ do {
+ top_stats.clear();
+ shift_stats.clear();
+ for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
+ TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
+ UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
+ if (unicharset.get_isalpha(class_id) ||
+ unicharset.get_isdigit(class_id)) {
+ int top = blob->bounding_box().top() + bottom_shift;
+ // Clip the top to the limit of normalized feature space.
+ if (top >= INT_FEAT_RANGE)
+ top = INT_FEAT_RANGE - 1;
+ int bottom = blob->bounding_box().bottom() + bottom_shift;
+ int min_bottom, max_bottom, min_top, max_top;
+ unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
+ &min_top, &max_top);
+ // Chars with a wild top range would mess up the result so ignore them.
+ if (max_top - min_top > kMaxCharTopRange)
+ continue;
+ int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
+ top - (max_top + x_ht_acceptance_tolerance));
+ int height = top - kBlnBaselineOffset;
+ if (debug_x_ht_level >= 2) {
+ tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
+ unicharset.id_to_unichar(class_id),
+ height, min_bottom, max_bottom, min_top, max_top,
+ bottom, top);
+ }
+ // Use only chars that fit in the expected bottom range, and where
+ // the range of tops is sensibly near the xheight.
+ if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
+ bottom - x_ht_acceptance_tolerance <= max_bottom &&
+ min_top > kBlnBaselineOffset &&
+ max_top - kBlnBaselineOffset >= kBlnXHeight &&
+ misfit_dist > 0) {
+ // Compute the x-height position using proportionality between the
+ // actual height and expected height.
+ int min_xht = DivRounded(height * kBlnXHeight,
+ max_top - kBlnBaselineOffset);
+ int max_xht = DivRounded(height * kBlnXHeight,
+ min_top - kBlnBaselineOffset);
+ if (debug_x_ht_level >= 2) {
+ tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
+ }
+ // The range of expected heights gets a vote equal to the distance
+ // of the actual top from the expected top.
+ for (int y = min_xht; y <= max_xht; ++y)
+ top_stats.add(y, misfit_dist);
+ } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
+ bottom - x_ht_acceptance_tolerance > max_bottom) &&
+ bottom_shift == 0) {
+ // Get the range of required bottom shift.
+ int min_shift = min_bottom - bottom;
+ int max_shift = max_bottom - bottom;
+ if (debug_x_ht_level >= 2) {
+ tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
+ }
+ // The range of expected shifts gets a vote equal to the min distance
+ // of the actual bottom from the expected bottom, spread over the
+ // range of its acceptance.
+ int misfit_weight = abs(min_shift);
+ if (max_shift > min_shift)
+ misfit_weight /= max_shift - min_shift;
+ for (int y = min_shift; y <= max_shift; ++y)
+ shift_stats.add(y, misfit_weight);
+ } else {
+ if (bottom_shift == 0) {
+ // Things with bottoms that are already ok need to say so, on the
+ // 1st iteration only.
+ shift_stats.add(0, kBlnBaselineOffset);
+ }
+ if (debug_x_ht_level >= 2) {
+ tprintf(" already OK\n");
+ }
+ }
+ }
+ }
+ if (shift_stats.get_total() > top_stats.get_total()) {
+ bottom_shift = IntCastRounded(shift_stats.median());
+ if (debug_x_ht_level >= 2) {
+ tprintf("Applying bottom shift=%d\n", bottom_shift);
+ }
+ }
+ } while (bottom_shift != 0 &&
+ top_stats.get_total() < shift_stats.get_total());
+ // Baseline shift is opposite sign to the bottom shift.
+ *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
+ if (debug_x_ht_level >= 2) {
+ tprintf("baseline shift=%g\n", *baseline_shift);
+ }
+ if (top_stats.get_total() == 0)
+ return bottom_shift != 0 ? word_res->x_height : 0.0f;
+ // The new xheight is just the median vote, which is then scaled out
+ // of BLN space back to pixel space to get the x-height in pixel space.
+ float new_xht = top_stats.median();
+ if (debug_x_ht_level >= 2) {
+ tprintf("Median xht=%f\n", new_xht);
+ tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
+ new_xht, new_xht / word_res->denorm.y_scale());
+ }
+ // The xheight must change by at least x_ht_min_change to be used.
+ if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
+ return new_xht / word_res->denorm.y_scale();
+ else
+ return bottom_shift != 0 ? word_res->x_height : 0.0f;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/linerec.cpp b/tesseract/src/ccmain/linerec.cpp
new file mode 100644
index 00000000..4db50e03
--- /dev/null
+++ b/tesseract/src/ccmain/linerec.cpp
@@ -0,0 +1,307 @@
+///////////////////////////////////////////////////////////////////////
+// File: linerec.cpp
+// Description: Top-level line-based recognition module for Tesseract.
+// Author: Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include "allheaders.h"
+#include "boxread.h"
+#include "imagedata.h"
+#include "lstmrecognizer.h"
+#include "recodebeam.h"
+#include "pageres.h"
+#include "tprintf.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
+// Scale factor to make certainty more comparable to Tesseract.
+const float kCertaintyScale = 7.0f;
+// Worst acceptable certainty for a dictionary word.
+const float kWorstDictCertainty = -25.0f;
+
+// Generates training data for training a line recognizer, eg LSTM.
+// Breaks the page into lines, according to the boxes, and writes them to a
+// serialized DocumentData based on output_basename.
+// Return true if successful, false if an error occurred.
+bool Tesseract::TrainLineRecognizer(const char* input_imagename,
+ const STRING& output_basename,
+ BLOCK_LIST *block_list) {
+ STRING lstmf_name = output_basename + ".lstmf";
+ DocumentData images(lstmf_name);
+ if (applybox_page > 0) {
+ // Load existing document for the previous pages.
+ if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
+ tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
+ return false;
+ }
+ }
+ std::vector<TBOX> boxes;
+ std::vector<STRING> texts;
+ // Get the boxes for this page, if there are any.
+ if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
+ nullptr) ||
+ boxes.empty()) {
+ tprintf("Failed to read boxes from %s\n", input_imagename);
+ return false;
+ }
+ TrainFromBoxes(boxes, texts, block_list, &images);
+ if (images.PagesSize() == 0) {
+ tprintf("Failed to read pages from %s\n", input_imagename);
+ return false;
+ }
+ images.Shuffle();
+ if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
+ tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
+ return false;
+ }
+ return true;
+}
+
+// Generates training data for training a line recognizer, eg LSTM.
+// Breaks the boxes into lines, normalizes them, converts to ImageData and
+// appends them to the given training_data.
+void Tesseract::TrainFromBoxes(const std::vector<TBOX>& boxes,
+ const std::vector<STRING>& texts,
+ BLOCK_LIST *block_list,
+ DocumentData* training_data) {
+ int box_count = boxes.size();
+ // Process all the text lines in this page, as defined by the boxes.
+ int end_box = 0;
+ // Don't let \t, which marks newlines in the box file, get into the line
+ // content, as that makes the line unusable in training.
+ while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
+ for (int start_box = end_box; start_box < box_count; start_box = end_box) {
+ // Find the textline of boxes starting at start and their bounding box.
+ TBOX line_box = boxes[start_box];
+ STRING line_str = texts[start_box];
+ for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
+ ++end_box) {
+ line_box += boxes[end_box];
+ line_str += texts[end_box];
+ }
+ // Find the most overlapping block.
+ BLOCK* best_block = nullptr;
+ int best_overlap = 0;
+ BLOCK_IT b_it(block_list);
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ BLOCK* block = b_it.data();
+ if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText())
+ continue; // Not a text block.
+ TBOX block_box = block->pdblk.bounding_box();
+ block_box.rotate(block->re_rotation());
+ if (block_box.major_overlap(line_box)) {
+ TBOX overlap_box = line_box.intersection(block_box);
+ if (overlap_box.area() > best_overlap) {
+ best_overlap = overlap_box.area();
+ best_block = block;
+ }
+ }
+ }
+ ImageData* imagedata = nullptr;
+ if (best_block == nullptr) {
+ tprintf("No block overlapping textline: %s\n", line_str.c_str());
+ } else {
+ imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
+ *best_block);
+ }
+ if (imagedata != nullptr)
+ training_data->AddPageToDocument(imagedata);
+ // Don't let \t, which marks newlines in the box file, get into the line
+ // content, as that makes the line unusable in training.
+ while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
+ }
+}
+
+// Returns an Imagedata containing the image of the given box,
+// and ground truth boxes/truth text if available in the input.
+// The image is not normalized in any way.
+ImageData* Tesseract::GetLineData(const TBOX& line_box,
+ const std::vector<TBOX>& boxes,
+ const std::vector<STRING>& texts,
+ int start_box, int end_box,
+ const BLOCK& block) {
+ TBOX revised_box;
+ ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
+ &revised_box);
+ if (image_data == nullptr) return nullptr;
+ image_data->set_page_number(applybox_page);
+ // Copy the boxes and shift them so they are relative to the image.
+ FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
+ ICOORD shift = -revised_box.botleft();
+ std::vector<TBOX> line_boxes;
+ std::vector<STRING> line_texts;
+ for (int b = start_box; b < end_box; ++b) {
+ TBOX box = boxes[b];
+ box.rotate(block_rotation);
+ box.move(shift);
+ line_boxes.push_back(box);
+ line_texts.push_back(texts[b]);
+ }
+ std::vector<int> page_numbers;
+ page_numbers.resize(line_boxes.size(), applybox_page);
+ image_data->AddBoxes(line_boxes, line_texts, page_numbers);
+ return image_data;
+}
+
+// Helper gets the image of a rectangle, using the block.re_rotation() if
+// needed to get to the image, and rotating the result back to horizontal
+// layout. (CJK characters will be on their left sides) The vertical text flag
+// is set in the returned ImageData if the text was originally vertical, which
+// can be used to invoke a different CJK recognition engine. The revised_box
+// is also returned to enable calculation of output bounding boxes.
+ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
+ int padding, TBOX* revised_box) const {
+ TBOX wbox = box;
+ wbox.pad(padding, padding);
+ *revised_box = wbox;
+ // Number of clockwise 90 degree rotations needed to get back to tesseract
+ // coords from the clipped image.
+ int num_rotations = 0;
+ if (block.re_rotation().y() > 0.0f)
+ num_rotations = 1;
+ else if (block.re_rotation().x() < 0.0f)
+ num_rotations = 2;
+ else if (block.re_rotation().y() < 0.0f)
+ num_rotations = 3;
+ // Handle two cases automatically: (1) the box came from the block, (2) the
+ // box came from a box file and refers to the image, which the block may not.
+ if (block.pdblk.bounding_box().major_overlap(*revised_box))
+ revised_box->rotate(block.re_rotation());
+ // Now revised_box always refers to the image.
+ // BestPix is never colormapped, but may be of any depth.
+ Pix* pix = BestPix();
+ int width = pixGetWidth(pix);
+ int height = pixGetHeight(pix);
+ TBOX image_box(0, 0, width, height);
+ // Clip to image bounds.
+ *revised_box &= image_box;
+ if (revised_box->null_box()) return nullptr;
+ Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
+ revised_box->width(), revised_box->height());
+ Pix* box_pix = pixClipRectangle(pix, clip_box, nullptr);
+ boxDestroy(&clip_box);
+ if (box_pix == nullptr) return nullptr;
+ if (num_rotations > 0) {
+ Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
+ pixDestroy(&box_pix);
+ box_pix = rot_pix;
+ }
+ // Convert sub-8-bit images to 8 bit.
+ int depth = pixGetDepth(box_pix);
+ if (depth < 8) {
+ Pix* grey;
+ grey = pixConvertTo8(box_pix, false);
+ pixDestroy(&box_pix);
+ box_pix = grey;
+ }
+ bool vertical_text = false;
+ if (num_rotations > 0) {
+ // Rotate the clipped revised box back to internal coordinates.
+ FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
+ revised_box->rotate(rotation);
+ if (num_rotations != 2)
+ vertical_text = true;
+ }
+ return new ImageData(vertical_text, box_pix);
+}
+
+// Recognizes a word or group of words, converting to WERD_RES in *words.
+// Analogous to classify_word_pass1, but can handle a group of words as well.
+void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
+ PointerVector<WERD_RES>* words) {
+ TBOX word_box = word->word->bounding_box();
+ // Get the word image - no frills.
+ if (tessedit_pageseg_mode == PSM_SINGLE_WORD ||
+ tessedit_pageseg_mode == PSM_RAW_LINE) {
+ // In single word mode, use the whole image without any other row/word
+ // interpretation.
+ word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
+ } else {
+ float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
+ if (baseline + row->descenders() < word_box.bottom())
+ word_box.set_bottom(baseline + row->descenders());
+ if (baseline + row->x_height() + row->ascenders() > word_box.top())
+ word_box.set_top(baseline + row->x_height() + row->ascenders());
+ }
+ ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
+ if (im_data == nullptr) return;
+
+ bool do_invert = tessedit_do_invert;
+ lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
+ kWorstDictCertainty / kCertaintyScale,
+ word_box, words, lstm_choice_mode,
+ lstm_choice_iterations);
+ delete im_data;
+ SearchWords(words);
+}
+
+// Apply segmentation search to the given set of words, within the constraints
+// of the existing ratings matrix. If there is already a best_choice on a word
+// leaves it untouched and just sets the done/accepted etc flags.
+void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
+ // Run the segmentation search on the network outputs and make a BoxWord
+ // for each of the output words.
+ // If we drop a word as junk, then there is always a space in front of the
+ // next.
+ const Dict* stopper_dict = lstm_recognizer_->GetDict();
+ if (stopper_dict == nullptr) stopper_dict = &getDict();
+ bool any_nonspace_delimited = false;
+ for (int w = 0; w < words->size(); ++w) {
+ WERD_RES* word = (*words)[w];
+ if (word->best_choice != nullptr &&
+ word->best_choice->ContainsAnyNonSpaceDelimited()) {
+ any_nonspace_delimited = true;
+ break;
+ }
+ }
+ for (int w = 0; w < words->size(); ++w) {
+ WERD_RES* word = (*words)[w];
+ if (word->best_choice == nullptr) {
+ // It is a dud.
+ word->SetupFake(lstm_recognizer_->GetUnicharset());
+ } else {
+ // Set the best state.
+ for (int i = 0; i < word->best_choice->length(); ++i) {
+ int length = word->best_choice->state(i);
+ word->best_state.push_back(length);
+ }
+ word->reject_map.initialise(word->best_choice->length());
+ word->tess_failed = false;
+ word->tess_accepted = true;
+ word->tess_would_adapt = false;
+ word->done = true;
+ word->tesseract = this;
+ float word_certainty = std::min(word->space_certainty,
+ word->best_choice->certainty());
+ word_certainty *= kCertaintyScale;
+ if (getDict().stopper_debug_level >= 1) {
+ tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
+ word->best_choice->certainty(), word->space_certainty,
+ std::min(word->space_certainty, word->best_choice->certainty()) *
+ kCertaintyScale,
+ word_certainty);
+ word->best_choice->print();
+ }
+ word->best_choice->set_certainty(word_certainty);
+
+ word->tess_accepted = stopper_dict->AcceptableResult(word);
+ }
+ }
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/ltrresultiterator.cpp b/tesseract/src/ccmain/ltrresultiterator.cpp
new file mode 100644
index 00000000..5b6cfaf5
--- /dev/null
+++ b/tesseract/src/ccmain/ltrresultiterator.cpp
@@ -0,0 +1,492 @@
+///////////////////////////////////////////////////////////////////////
+// File: ltrresultiterator.cpp
+// Description: Iterator for tesseract results in strict left-to-right
+// order that avoids using tesseract internal data structures.
+// Author: Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/ltrresultiterator.h>
+
+#include "pageres.h"
+#include "tesseractclass.h"
+
+#include "allheaders.h"
+
+#include "strngs.h"
+
+namespace tesseract {
+
+// Constructor: forwards all geometry/scale arguments to the PageIterator
+// base and initializes the line and paragraph separators to "\n".
+LTRResultIterator::LTRResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
+ int scale, int scaled_yres, int rect_left,
+ int rect_top, int rect_width,
+ int rect_height)
+ : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top,
+ rect_width, rect_height),
+ line_separator_("\n"),
+ paragraph_separator_("\n") {}
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+LTRResultIterator::~LTRResultIterator() = default;
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// object at the given level. Use delete [] to free after use.
+// For RIL_SYMBOL/RIL_WORD the text comes straight from the best choice;
+// for higher levels the nested loops concatenate words with spaces,
+// lines with line_separator_ and paragraphs with paragraph_separator_.
+char* LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+ if (it_->word() == nullptr)
+ return nullptr; // Already at the end!
+ STRING text;
+ PAGE_RES_IT res_it(*it_);
+ WERD_CHOICE* best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ if (level == RIL_SYMBOL) {
+ text = res_it.word()->BestUTF8(blob_index_, false);
+ } else if (level == RIL_WORD) {
+ text = best_choice->unichar_string();
+ } else {
+ bool eol = false; // end of line?
+ bool eop = false; // end of paragraph?
+ do { // for each paragraph in a block
+ do { // for each text line in a paragraph
+ do { // for each word in a text line
+ best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ text += best_choice->unichar_string();
+ text += " ";
+ res_it.forward();
+ eol = res_it.row() != res_it.prev_row();
+ } while (!eol);
+ // Drop the trailing space added after the last word of the line.
+ text.truncate_at(text.length() - 1);
+ text += line_separator_;
+ eop = res_it.block() != res_it.prev_block() ||
+ res_it.row()->row->para() != res_it.prev_row()->row->para();
+ } while (level != RIL_TEXTLINE && !eop);
+ if (eop)
+ text += paragraph_separator_;
+ } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());
+ }
+ // Copy into a caller-owned buffer (length includes the terminating NUL).
+ int length = text.length() + 1;
+ char* result = new char[length];
+ strncpy(result, text.c_str(), length);
+ return result;
+}
+
+// Set the string inserted at the end of each text line. "\n" by default.
+// The string is copied into the iterator, so the argument need not outlive it.
+void LTRResultIterator::SetLineSeparator(const char* new_line) {
+ line_separator_ = new_line;
+}
+
+// Set the string inserted at the end of each paragraph. "\n" by default.
+// The string is copied into the iterator, so the argument need not outlive it.
+void LTRResultIterator::SetParagraphSeparator(const char* new_para) {
+ paragraph_separator_ = new_para;
+}
+
+// Returns the mean confidence of the current object at the given level.
+// The number should be interpreted as a percent probability. (0.0f-100.0f)
+// Internally the per-word certainties (negative log-ish scores) are averaged
+// and mapped to a percentage via 100 + 5 * certainty, clipped to [0, 100].
+float LTRResultIterator::Confidence(PageIteratorLevel level) const {
+ if (it_->word() == nullptr)
+ return 0.0f; // Already at the end!
+ float mean_certainty = 0.0f;
+ int certainty_count = 0;
+ PAGE_RES_IT res_it(*it_);
+ WERD_CHOICE* best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ switch (level) {
+ case RIL_BLOCK:
+ // Average over every word remaining in the current block.
+ do {
+ best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ mean_certainty += best_choice->certainty();
+ ++certainty_count;
+ res_it.forward();
+ } while (res_it.block() == res_it.prev_block());
+ break;
+ case RIL_PARA:
+ // Average over every word in the current paragraph.
+ do {
+ best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ mean_certainty += best_choice->certainty();
+ ++certainty_count;
+ res_it.forward();
+ } while (res_it.block() == res_it.prev_block() &&
+ res_it.row()->row->para() == res_it.prev_row()->row->para());
+ break;
+ case RIL_TEXTLINE:
+ // Average over every word in the current text line.
+ do {
+ best_choice = res_it.word()->best_choice;
+ ASSERT_HOST(best_choice != nullptr);
+ mean_certainty += best_choice->certainty();
+ ++certainty_count;
+ res_it.forward();
+ } while (res_it.row() == res_it.prev_row());
+ break;
+ case RIL_WORD:
+ mean_certainty += best_choice->certainty();
+ ++certainty_count;
+ break;
+ case RIL_SYMBOL:
+ // Certainty of the single blob under the iterator.
+ mean_certainty += best_choice->certainty(blob_index_);
+ ++certainty_count;
+ }
+ if (certainty_count > 0) {
+ mean_certainty /= certainty_count;
+ return ClipToRange(100 + 5 * mean_certainty, 0.0f, 100.0f);
+ }
+ return 0.0f;
+}
+
+// Returns metrics (in pixels) of the current row: total height
+// (x-height + ascender rise - descender drop; descenders() is negative),
+// plus the raw descender and ascender values.
+void LTRResultIterator::RowAttributes(float* row_height, float* descenders,
+ float* ascenders) const {
+ *row_height = it_->row()->row->x_height() + it_->row()->row->ascenders() -
+ it_->row()->row->descenders();
+ *descenders = it_->row()->row->descenders();
+ *ascenders = it_->row()->row->ascenders();
+}
+
+// Returns the font attributes of the current word. If iterating at a higher
+// level object than words, eg textlines, then this will return the
+// attributes of the first word in that textline.
+// The actual return value is a string representing a font name. It points
+// to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+// the iterator itself, ie rendered invalid by various members of
+// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+// Pointsize is returned in printers points (1/72 inch.)
+// Returns nullptr (and zeroes/falses all outputs) when no font information
+// is available, e.g. at the end of the page or with the legacy engine
+// disabled.
+const char* LTRResultIterator::WordFontAttributes(
+ bool* is_bold, bool* is_italic, bool* is_underlined, bool* is_monospace,
+ bool* is_serif, bool* is_smallcaps, int* pointsize, int* font_id) const {
+ const char* result = nullptr;
+
+ if (it_->word() == nullptr) {
+ // Already at the end!
+ *pointsize = 0;
+ } else {
+ // Row height in pixels (descenders() is negative, hence the subtraction).
+ float row_height = it_->row()->row->x_height() +
+ it_->row()->row->ascenders() -
+ it_->row()->row->descenders();
+ // Convert from pixels to printers points.
+ *pointsize =
+ scaled_yres_ > 0
+ ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5)
+ : 0;
+
+ #ifndef DISABLED_LEGACY_ENGINE
+ const FontInfo* font_info = it_->word()->fontinfo;
+ if (font_info) {
+ // Font information available.
+ *font_id = font_info->universal_id;
+ *is_bold = font_info->is_bold();
+ *is_italic = font_info->is_italic();
+ *is_underlined = false; // TODO(rays) fix this!
+ *is_monospace = font_info->is_fixed_pitch();
+ *is_serif = font_info->is_serif();
+ result = font_info->name;
+ }
+ #endif // ndef DISABLED_LEGACY_ENGINE
+
+ *is_smallcaps = it_->word()->small_caps;
+ }
+
+ // No font info found: reset all outputs to defined values.
+ if (!result) {
+ *is_bold = false;
+ *is_italic = false;
+ *is_underlined = false;
+ *is_monospace = false;
+ *is_serif = false;
+ *is_smallcaps = false;
+ *font_id = -1;
+ }
+
+ return result;
+}
+
+// Returns the name of the language used to recognize this word.
+// The pointer belongs to the Tesseract instance that recognized the word;
+// returns nullptr at end of page or if no recognizer is attached.
+const char* LTRResultIterator::WordRecognitionLanguage() const {
+ if (it_->word() == nullptr || it_->word()->tesseract == nullptr)
+ return nullptr;
+ return it_->word()->tesseract->lang.c_str();
+}
+
+// Return the overall directionality of this word.
+// Classified from the presence of RTL vs LTR characters:
+// only RTL -> DIR_RIGHT_TO_LEFT, only LTR -> DIR_LEFT_TO_RIGHT,
+// neither -> DIR_NEUTRAL, both -> DIR_MIX.
+StrongScriptDirection LTRResultIterator::WordDirection() const {
+ if (it_->word() == nullptr)
+ return DIR_NEUTRAL;
+ bool has_rtl = it_->word()->AnyRtlCharsInWord();
+ bool has_ltr = it_->word()->AnyLtrCharsInWord();
+ if (has_rtl && !has_ltr)
+ return DIR_RIGHT_TO_LEFT;
+ if (has_ltr && !has_rtl)
+ return DIR_LEFT_TO_RIGHT;
+ if (!has_ltr && !has_rtl)
+ return DIR_NEUTRAL;
+ return DIR_MIX;
+}
+
+// Returns true if the current word was found in a dictionary.
+// Checks the permuter of the best choice against the three dictionary
+// permuter types (system, frequent-word and user dawgs).
+bool LTRResultIterator::WordIsFromDictionary() const {
+ if (it_->word() == nullptr)
+ return false; // Already at the end!
+ int permuter = it_->word()->best_choice->permuter();
+ return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
+ permuter == USER_DAWG_PERM;
+}
+
+// Returns the number of blanks before the current word.
+// At the end of the page, reports a single blank.
+int LTRResultIterator::BlanksBeforeWord() const {
+ if (it_->word() == nullptr)
+ return 1;
+ return it_->word()->word->space();
+}
+
+// Returns true if the current word is numeric.
+// True iff the best choice was produced by the number permuter.
+bool LTRResultIterator::WordIsNumeric() const {
+ if (it_->word() == nullptr)
+ return false; // Already at the end!
+ int permuter = it_->word()->best_choice->permuter();
+ return permuter == NUMBER_PERM;
+}
+
+// Returns true if the word contains blamer information.
+bool LTRResultIterator::HasBlamerInfo() const {
+ return it_->word() != nullptr && it_->word()->blamer_bundle != nullptr &&
+ it_->word()->blamer_bundle->HasDebugInfo();
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
+// of the current word.
+// Returned as void* so callers need not see the blamer headers; nullptr if
+// there is no word or no blamer bundle.
+const void* LTRResultIterator::GetParamsTrainingBundle() const {
+ return (it_->word() != nullptr && it_->word()->blamer_bundle != nullptr)
+ ? &(it_->word()->blamer_bundle->params_training_bundle())
+ : nullptr;
+}
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+// Returns the pointer to the string with blamer information for this word.
+// Assumes that the word's blamer_bundle is not nullptr.
+const char* LTRResultIterator::GetBlamerDebug() const {
+ return it_->word()->blamer_bundle->debug().c_str();
+}
+
+// Returns the pointer to the string with misadaption information for this word.
+// Assumes that the word's blamer_bundle is not nullptr.
+const char* LTRResultIterator::GetBlamerMisadaptionDebug() const {
+ return it_->word()->blamer_bundle->misadaption_debug().c_str();
+}
+
+// Returns true if a truth string was recorded for the current word.
+bool LTRResultIterator::HasTruthString() const {
+ if (it_->word() == nullptr)
+ return false; // Already at the end!
+ if (it_->word()->blamer_bundle == nullptr ||
+ it_->word()->blamer_bundle->NoTruth()) {
+ return false; // no truth information for this word
+ }
+ return true;
+}
+
+// Returns true if the given string is equivalent to the truth string for
+// the current word.
+// Builds a WERD_CHOICE from str using the word's unicharset and delegates
+// the comparison to the blamer bundle.
+bool LTRResultIterator::EquivalentToTruth(const char* str) const {
+ if (!HasTruthString())
+ return false;
+ ASSERT_HOST(it_->word()->uch_set != nullptr);
+ WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
+ return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
+}
+
+// Returns the null terminated UTF-8 encoded truth string for the current word.
+// Use delete [] to free after use.
+char* LTRResultIterator::WordTruthUTF8Text() const {
+ if (!HasTruthString())
+ return nullptr;
+ STRING truth_text = it_->word()->blamer_bundle->TruthString();
+ // Copy into a caller-owned buffer (length includes the terminating NUL).
+ int length = truth_text.length() + 1;
+ char* result = new char[length];
+ strncpy(result, truth_text.c_str(), length);
+ return result;
+}
+
+// Returns the null terminated UTF-8 encoded normalized OCR string for the
+// current word. Use delete [] to free after use.
+// "Normalized" here means each unichar is rendered via the unicharset's
+// get_normed_unichar rather than the raw recognized form.
+char* LTRResultIterator::WordNormedUTF8Text() const {
+ if (it_->word() == nullptr)
+ return nullptr; // Already at the end!
+ STRING ocr_text;
+ WERD_CHOICE* best_choice = it_->word()->best_choice;
+ const UNICHARSET* unicharset = it_->word()->uch_set;
+ ASSERT_HOST(best_choice != nullptr);
+ for (int i = 0; i < best_choice->length(); ++i) {
+ ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
+ }
+ // Copy into a caller-owned buffer (length includes the terminating NUL).
+ int length = ocr_text.length() + 1;
+ char* result = new char[length];
+ strncpy(result, ocr_text.c_str(), length);
+ return result;
+}
+
+// Returns a pointer to serialized choice lattice.
+// Fills lattice_size with the number of bytes in lattice data.
+// Returns nullptr (leaving lattice_size untouched) if there is no word or
+// no blamer bundle. The returned pointer is owned by the blamer bundle.
+const char* LTRResultIterator::WordLattice(int* lattice_size) const {
+ if (it_->word() == nullptr)
+ return nullptr; // Already at the end!
+ if (it_->word()->blamer_bundle == nullptr)
+ return nullptr;
+ *lattice_size = it_->word()->blamer_bundle->lattice_size();
+ return it_->word()->blamer_bundle->lattice_data();
+}
+
+// Returns true if the current symbol is a superscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+// Only meaningful when iterating recognition results (cblob_it_ == nullptr);
+// returns false when walking raw blobs.
+bool LTRResultIterator::SymbolIsSuperscript() const {
+ if (cblob_it_ == nullptr && it_->word() != nullptr)
+ return it_->word()->best_choice->BlobPosition(blob_index_) ==
+ SP_SUPERSCRIPT;
+ return false;
+}
+
+// Returns true if the current symbol is a subscript.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsSubscript() const {
+ if (cblob_it_ == nullptr && it_->word() != nullptr)
+ return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
+ return false;
+}
+
+// Returns true if the current symbol is a dropcap.
+// If iterating at a higher level object than symbols, eg words, then
+// this will return the attributes of the first symbol in that word.
+bool LTRResultIterator::SymbolIsDropcap() const {
+ if (cblob_it_ == nullptr && it_->word() != nullptr)
+ return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
+ return false;
+}
+
+// Constructs a ChoiceIterator over the alternative choices for the symbol
+// the given result iterator currently points at. Depending on which engine
+// produced the word, it either walks the LSTM CTC symbol choices
+// (LSTM_choices_) or the legacy BLOB_CHOICE list (choice_it_).
+ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
+ ASSERT_HOST(result_it.it_->word() != nullptr);
+ word_res_ = result_it.it_->word();
+ oemLSTM_ = word_res_->tesseract->AnyLSTMLang();
+ // Is there legacy engine related trained data?
+ bool oemLegacy = word_res_->tesseract->AnyTessLang();
+ // Is lstm_choice_mode activated?
+ bool lstm_choice_mode = word_res_->tesseract->lstm_choice_mode;
+ rating_coefficient_ = word_res_->tesseract->lstm_rating_coefficient;
+ blanks_before_word_ = result_it.BlanksBeforeWord();
+ BLOB_CHOICE_LIST* choices = nullptr;
+ tstep_index_ = &result_it.blob_index_;
+ if (oemLSTM_ && !word_res_->CTC_symbol_choices.empty()) {
+ // If the first CTC choice is not a space, no leading blanks precede
+ // this word in the timestep sequence.
+ if (!word_res_->CTC_symbol_choices[0].empty() &&
+ strcmp(word_res_->CTC_symbol_choices[0][0].first, " ")) {
+ blanks_before_word_ = 0;
+ }
+ auto index = *tstep_index_;
+ index += blanks_before_word_;
+ if (index < word_res_->CTC_symbol_choices.size()) {
+ LSTM_choices_ = &word_res_->CTC_symbol_choices[index];
+ filterSpaces();
+ }
+ }
+ if ((oemLegacy || !lstm_choice_mode) && word_res_->ratings != nullptr)
+ choices = word_res_->GetBlobChoices(result_it.blob_index_);
+ if (choices != nullptr && !choices->empty()) {
+ choice_it_ = new BLOB_CHOICE_IT(choices);
+ choice_it_->mark_cycle_pt();
+ } else {
+ choice_it_ = nullptr;
+ }
+ // NOTE(review): LSTM_choices_ is read here even on paths that never
+ // assigned it — presumably it is null-initialized in the class
+ // declaration; confirm against the header.
+ if (LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+ LSTM_choice_it_ = LSTM_choices_->begin();
+ }
+}
+// Destructor: choice_it_ is owned by this iterator; LSTM_choices_ points
+// into the WERD_RES and is not owned.
+ChoiceIterator::~ChoiceIterator() {
+ delete choice_it_;
+}
+
+// Moves to the next choice for the symbol and returns false if there
+// are none left.
+// Uses the LSTM choice list when available, otherwise the legacy
+// BLOB_CHOICE cyclic list.
+bool ChoiceIterator::Next() {
+ if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+ // Stop before advancing past the last element.
+ if (LSTM_choice_it_ != LSTM_choices_->end() &&
+ next(LSTM_choice_it_) == LSTM_choices_->end()) {
+ return false;
+ } else {
+ ++LSTM_choice_it_;
+ return true;
+ }
+ } else {
+ if (choice_it_ == nullptr)
+ return false;
+ choice_it_->forward();
+ return !choice_it_->cycled_list();
+ }
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// choice. Do NOT use delete [] to free after use.
+// The pointer is owned by the LSTM choice list or the word's unicharset.
+const char* ChoiceIterator::GetUTF8Text() const {
+ if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+ std::pair<const char*, float> choice = *LSTM_choice_it_;
+ return choice.first;
+ } else {
+ if (choice_it_ == nullptr)
+ return nullptr;
+ UNICHAR_ID id = choice_it_->data()->unichar_id();
+ return word_res_->uch_set->id_to_unichar_ext(id);
+ }
+}
+
+// Returns the confidence of the current choice depending on the used language
+// data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
+// choices for one symbol should roughly add up to 1.0f.
+// If only traineddata of the legacy engine is used, the number should be
+// interpreted as a percent probability. (0.0f-100.0f) In this case
+// probabilities won't add up to 100. Each one stands on its own.
+float ChoiceIterator::Confidence() const {
+ float confidence;
+ if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
+ std::pair<const char*, float> choice = *LSTM_choice_it_;
+ // LSTM ratings are converted via the tunable lstm_rating_coefficient.
+ confidence = 100 - rating_coefficient_ * choice.second;
+ } else {
+ if (choice_it_ == nullptr)
+ return 0.0f;
+ // Legacy certainty -> percent mapping, same scale as
+ // LTRResultIterator::Confidence.
+ confidence = 100 + 5 * choice_it_->data()->certainty();
+ }
+ return ClipToRange(confidence, 0.0f, 100.0f);
+}
+
+// Returns the set of timesteps which belong to the current symbol
+// (LSTM engine only). Returns nullptr if the offset is out of range or the
+// LSTM engine is not in use. The returned pointer is owned by the WERD_RES.
+std::vector<std::vector<std::pair<const char*, float>>>*
+ChoiceIterator::Timesteps() const {
+ int offset = *tstep_index_ + blanks_before_word_;
+ // NOTE(review): signed offset compared against an unsigned size().
+ if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {
+ return nullptr;
+ }
+ return &word_res_->segmented_timesteps[offset];
+}
+
+// Removes all pure-space entries from the current LSTM choice list in
+// place, so that space never appears as an alternative for a visible symbol.
+void ChoiceIterator::filterSpaces() {
+ if (LSTM_choices_->empty())
+ return;
+ std::vector<std::pair<const char*, float>>::iterator it;
+ for (it = LSTM_choices_->begin(); it != LSTM_choices_->end();) {
+ if (!strcmp(it->first, " ")) {
+ // erase() returns the iterator to the next element.
+ it = LSTM_choices_->erase(it);
+ } else {
+ ++it;
+ }
+ }
+}
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/mutableiterator.cpp b/tesseract/src/ccmain/mutableiterator.cpp
new file mode 100644
index 00000000..a472df18
--- /dev/null
+++ b/tesseract/src/ccmain/mutableiterator.cpp
@@ -0,0 +1,24 @@
+///////////////////////////////////////////////////////////////////////
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "mutableiterator.h"
+
+namespace tesseract {
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of weak vtables in every compilation unit.
+// The class itself is declared in mutableiterator.h.
+MutableIterator::~MutableIterator() = default;
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/mutableiterator.h b/tesseract/src/ccmain/mutableiterator.h
new file mode 100644
index 00000000..de3a3612
--- /dev/null
+++ b/tesseract/src/ccmain/mutableiterator.h
@@ -0,0 +1,63 @@
+///////////////////////////////////////////////////////////////////////
+// File: mutableiterator.h
+// Description: Iterator for tesseract results providing access to
+// both high-level API and Tesseract internal data structures.
+// Author: David Eger
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H_
+#define TESSERACT_CCMAIN_MUTABLEITERATOR_H_
+
+#include <tesseract/resultiterator.h>
+
+class BLOB_CHOICE_IT;
+
+namespace tesseract {
+
+class Tesseract;
+
+// Class to iterate over tesseract results, providing access to all levels
+// of the page hierarchy, without including any tesseract headers or having
+// to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See tesseract/publictypes.h for the definition of PageIteratorLevel.
+// See also base class PageIterator, which contains the bulk of the interface.
+// ResultIterator adds text-specific methods for access to OCR output.
+// MutableIterator adds access to internal data structures.
+
+class TESS_API MutableIterator : public ResultIterator {
+ public:
+ // See argument descriptions in ResultIterator()
+ MutableIterator(PAGE_RES* page_res, Tesseract* tesseract,
+ int scale, int scaled_yres,
+ int rect_left, int rect_top,
+ int rect_width, int rect_height)
+ : ResultIterator(
+ LTRResultIterator(page_res, tesseract, scale, scaled_yres, rect_left,
+ rect_top, rect_width, rect_height)) {}
+ ~MutableIterator() override;
+
+ // See PageIterator and ResultIterator for most calls.
+
+ // Return access to Tesseract internals.
+ // The returned PAGE_RES_IT is owned by the iterator and shares its
+ // lifetime constraints (see the class comment above).
+ const PAGE_RES_IT *PageResIt() const { return it_; }
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H_
diff --git a/tesseract/src/ccmain/osdetect.cpp b/tesseract/src/ccmain/osdetect.cpp
new file mode 100644
index 00000000..99a5362c
--- /dev/null
+++ b/tesseract/src/ccmain/osdetect.cpp
@@ -0,0 +1,579 @@
+///////////////////////////////////////////////////////////////////////
+// File: osdetect.cpp
+// Description: Orientation and script detection.
+// Author: Samuel Charron
+// Ranjith Unnikrishnan
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/osdetect.h>
+
+#include "blobbox.h"
+#include "blread.h"
+#include "colfind.h"
+#include "fontinfo.h"
+#include "imagefind.h"
+#include "linefind.h"
+#include "oldlist.h"
+#include "qrsequence.h"
+#include "ratngs.h"
+#include "tabvector.h"
+#include "tesseractclass.h"
+#include "textord.h"
+
+#include "strngs.h"
+
+#include <algorithm>
+#include <cmath> // for std::fabs
+#include <memory>
+
+namespace tesseract {
+
+const float kSizeRatioToReject = 2.0;
+const int kMinAcceptableBlobHeight = 10;
+
+const float kScriptAcceptRatio = 1.3;
+
+const float kHanRatioInKorean = 0.7;
+const float kHanRatioInJapanese = 0.3;
+
+const float kNonAmbiguousMargin = 1.0;
+
+// General scripts
+static const char* han_script = "Han";
+static const char* latin_script = "Latin";
+static const char* katakana_script = "Katakana";
+static const char* hiragana_script = "Hiragana";
+static const char* hangul_script = "Hangul";
+
+// Pseudo-scripts Name
+const char* ScriptDetector::korean_script_ = "Korean";
+const char* ScriptDetector::japanese_script_ = "Japanese";
+const char* ScriptDetector::fraktur_script_ = "Fraktur";
+
+// Recomputes best_result from the four orientation scores: picks the
+// highest-scoring orientation id and records the margin over the runner-up
+// as the orientation confidence.
+void OSResults::update_best_orientation() {
+ float first = orientations[0];
+ float second = orientations[1];
+ best_result.orientation_id = 0;
+ if (orientations[0] < orientations[1]) {
+ first = orientations[1];
+ second = orientations[0];
+ best_result.orientation_id = 1;
+ }
+ // Scan the remaining orientations, tracking the top two scores.
+ for (int i = 2; i < 4; ++i) {
+ if (orientations[i] > first) {
+ second = first;
+ first = orientations[i];
+ best_result.orientation_id = i;
+ } else if (orientations[i] > second) {
+ second = orientations[i];
+ }
+ }
+ // Store difference of top two orientation scores.
+ best_result.oconfidence = first - second;
+}
+
+// Forces the best orientation to the given id with zero confidence,
+// bypassing the score-based selection.
+void OSResults::set_best_orientation(int orientation_id) {
+ best_result.orientation_id = orientation_id;
+ best_result.oconfidence = 0;
+}
+
+// Recomputes best_result.script_id/sconfidence for the given orientation
+// by finding the top two script scores; confidence is the normalized lead
+// of the winner over the runner-up (2.0 when the runner-up scored zero).
+void OSResults::update_best_script(int orientation) {
+ // We skip index 0 to ignore the "Common" script.
+ float first = scripts_na[orientation][1];
+ float second = scripts_na[orientation][2];
+ best_result.script_id = 1;
+ if (scripts_na[orientation][1] < scripts_na[orientation][2]) {
+ first = scripts_na[orientation][2];
+ second = scripts_na[orientation][1];
+ best_result.script_id = 2;
+ }
+ for (int i = 3; i < kMaxNumberOfScripts; ++i) {
+ if (scripts_na[orientation][i] > first) {
+ best_result.script_id = i;
+ second = first;
+ first = scripts_na[orientation][i];
+ } else if (scripts_na[orientation][i] > second) {
+ second = scripts_na[orientation][i];
+ }
+ }
+ // Normalize the winner's lead by the acceptance-ratio margin.
+ best_result.sconfidence = (second == 0.0f) ? 2.0f :
+ (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
+}
+
+// Returns the script id with the highest score for the given orientation,
+// ignoring the "Common" and "NULL" pseudo-scripts, or -1 if none qualify.
+int OSResults::get_best_script(int orientation_id) const {
+ int max_id = -1;
+ for (int j = 0; j < kMaxNumberOfScripts; ++j) {
+ const char *script = unicharset->get_script_from_script_id(j);
+ if (strcmp(script, "Common") && strcmp(script, "NULL")) {
+ if (max_id == -1 ||
+ scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id])
+ max_id = j;
+ }
+ }
+ return max_id;
+}
+
+// Print the script scores for all possible orientations.
+void OSResults::print_scores(void) const {
+ for (int i = 0; i < 4; ++i) {
+ // NOTE(review): no "\n" after the header, so the first script line
+ // continues on the same row — confirm this formatting is intended.
+ tprintf("Orientation id #%d", i);
+ print_scores(i);
+ }
+}
+
+// Print the script scores for the given candidate orientation.
+// Scripts with a zero score are skipped.
+void OSResults::print_scores(int orientation_id) const {
+ for (int j = 0; j < kMaxNumberOfScripts; ++j) {
+ if (scripts_na[orientation_id][j]) {
+ tprintf("%12s\t: %f\n", unicharset->get_script_from_script_id(j),
+ scripts_na[orientation_id][j]);
+ }
+ }
+}
+
+// Accumulate scores with given OSResults instance and update the best script.
+// Adds the other instance's orientation and per-script scores element-wise,
+// adopts its unicharset pointer, then refreshes best_result.
+void OSResults::accumulate(const OSResults& osr) {
+ for (int i = 0; i < 4; ++i) {
+ orientations[i] += osr.orientations[i];
+ for (int j = 0; j < kMaxNumberOfScripts; ++j)
+ scripts_na[i][j] += osr.scripts_na[i][j];
+ }
+ unicharset = osr.unicharset;
+ update_best_orientation();
+ update_best_script(best_result.orientation_id);
+}
+
+// Detect and erase horizontal/vertical lines and picture regions from the
+// image, so that non-text blobs are removed from consideration.
+// Modifies tess->pix_binary() in place and fills to_blocks via
+// find_components.
+static void remove_nontext_regions(tesseract::Tesseract *tess,
+ BLOCK_LIST *blocks,
+ TO_BLOCK_LIST *to_blocks) {
+ Pix *pix = tess->pix_binary();
+ ASSERT_HOST(pix != nullptr);
+ int vertical_x = 0;
+ int vertical_y = 1;
+ tesseract::TabVector_LIST v_lines;
+ tesseract::TabVector_LIST h_lines;
+ int resolution;
+ // Fall back to a minimum credible resolution if the image metadata is bad.
+ if (kMinCredibleResolution > pixGetXRes(pix)) {
+ resolution = kMinCredibleResolution;
+ tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n",
+ pixGetXRes(pix), resolution);
+ } else {
+ resolution = pixGetXRes(pix);
+ }
+
+ tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix,
+ &vertical_x, &vertical_y,
+ nullptr, &v_lines, &h_lines);
+ // Subtract detected image regions from the binary page before
+ // connected-component analysis.
+ Pix* im_pix = tesseract::ImageFind::FindImages(pix, nullptr);
+ if (im_pix != nullptr) {
+ pixSubtract(pix, pix, im_pix);
+ pixDestroy(&im_pix);
+ }
+ tess->mutable_textord()->find_components(tess->pix_binary(),
+ blocks, to_blocks);
+}
+
+// Find connected components in the page and process a subset until finished or
+// a stopping criterion is met.
+// Returns the number of blobs used in making the estimate. 0 implies failure.
+int orientation_and_script_detection(const char* filename,
+ OSResults* osr,
+ tesseract::Tesseract* tess) {
+ std::string name = filename; //truncated name
+
+ // Strip the extension so the matching .uzn block file can be looked up.
+ // NOTE(review): writing '\0' into a std::string embeds a NUL rather than
+ // shortening the string's length — confirm read_unlv_file treats name as a
+ // C string.
+ const char* lastdot = strrchr(name.c_str(), '.');
+ if (lastdot != nullptr)
+ name[lastdot-name.c_str()] = '\0';
+
+ ASSERT_HOST(tess->pix_binary() != nullptr);
+ int width = pixGetWidth(tess->pix_binary());
+ int height = pixGetHeight(tess->pix_binary());
+
+ // Use zone (UNLV) blocks when available, else one block covering the page.
+ BLOCK_LIST blocks;
+ if (!read_unlv_file(name, width, height, &blocks))
+ FullPageBlock(width, height, &blocks);
+
+ // Try to remove non-text regions from consideration.
+ TO_BLOCK_LIST land_blocks, port_blocks;
+ remove_nontext_regions(tess, &blocks, &port_blocks);
+
+ if (port_blocks.empty()) {
+ // page segmentation did not succeed, so we need to find_components first.
+ tess->mutable_textord()->find_components(tess->pix_binary(),
+ &blocks, &port_blocks);
+ } else {
+ TBOX page_box(0, 0, width, height);
+ // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
+ tess->mutable_textord()->filter_blobs(page_box.topright(),
+ &port_blocks, true);
+ }
+
+ return os_detect(&port_blocks, osr, tess);
+}
+
+// Filter and sample the blobs.
+// Returns a non-zero number of blobs if the page was successfully processed, or
+// zero if the page had too few characters to be reliable
+// Blobs from non-text blocks, blobs with zero width, blobs with an extreme
+// aspect ratio, and blobs below the minimum height are all excluded before
+// the list is handed to os_detect_blobs.
+int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
+ tesseract::Tesseract* tess) {
+ int blobs_total = 0;
+ TO_BLOCK_IT block_it;
+ block_it.set_to_list(port_blocks);
+
+ BLOBNBOX_CLIST filtered_list;
+ BLOBNBOX_C_IT filtered_it(&filtered_list);
+
+ for (block_it.mark_cycle_pt(); !block_it.cycled_list();
+ block_it.forward ()) {
+ TO_BLOCK* to_block = block_it.data();
+ // Skip blocks explicitly marked as non-text.
+ if (to_block->block->pdblk.poly_block() &&
+ !to_block->block->pdblk.poly_block()->IsText()) continue;
+ BLOBNBOX_IT bbox_it;
+ bbox_it.set_to_list(&to_block->blobs);
+ for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list ();
+ bbox_it.forward ()) {
+ BLOBNBOX* bbox = bbox_it.data();
+ C_BLOB* blob = bbox->cblob();
+ TBOX box = blob->bounding_box();
+ ++blobs_total;
+
+ // Catch illegal value of box width and avoid division by zero.
+ if (box.width() == 0) continue;
+ // TODO: Can height and width be negative? If not, remove fabs.
+ float y_x = std::fabs((box.height() * 1.0f) / box.width());
+ float x_y = 1.0f / y_x;
+ // Select a >= 1.0 ratio
+ float ratio = x_y > y_x ? x_y : y_x;
+ // Blob is ambiguous
+ if (ratio > kSizeRatioToReject) continue;
+ if (box.height() < kMinAcceptableBlobHeight) continue;
+ filtered_it.add_to_end(bbox);
+ }
+ }
+ return os_detect_blobs(nullptr, &filtered_list, osr, tess);
+}
+
+// Detect orientation and script from a list of blobs.
+// Returns a non-zero number of blobs if the list was successfully processed, or
+// zero if the list had too few characters to be reliable.
+// If allowed_scripts is non-null and non-empty, it is a list of scripts that
+// constrains both orientation and script detection to consider only scripts
+// from the list.
+// Blobs are sampled in quasi-random (QR-sequence) order so that evaluation
+// spreads across the page rather than scanning top-to-bottom.
+int os_detect_blobs(const std::vector<int>* allowed_scripts,
+ BLOBNBOX_CLIST* blob_list, OSResults* osr,
+ tesseract::Tesseract* tess) {
+ OSResults osr_;
+ int minCharactersToTry = tess->min_characters_to_try;
+ int maxCharactersToTry = 5 * minCharactersToTry;
+ // Use a local results object if the caller did not supply one.
+ if (osr == nullptr)
+ osr = &osr_;
+
+ osr->unicharset = &tess->unicharset;
+ OrientationDetector o(allowed_scripts, osr);
+ ScriptDetector s(allowed_scripts, osr, tess);
+
+ BLOBNBOX_C_IT filtered_it(blob_list);
+ int real_max = std::min(filtered_it.length(), maxCharactersToTry);
+ // tprintf("Total blobs found = %d\n", blobs_total);
+ // tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
+ // tprintf("Number of blobs to try = %d\n", real_max);
+
+ // If there are too few characters, skip this page entirely.
+ if (real_max < minCharactersToTry / 2) {
+ tprintf("Too few characters. Skipping this page\n");
+ return 0;
+ }
+
+ // Flatten the cyclic list into an array for random-access sampling.
+ auto** blobs = new BLOBNBOX*[filtered_it.length()];
+ int number_of_blobs = 0;
+ for (filtered_it.mark_cycle_pt (); !filtered_it.cycled_list ();
+ filtered_it.forward ()) {
+ blobs[number_of_blobs++] = filtered_it.data();
+ }
+ QRSequenceGenerator sequence(number_of_blobs);
+ int num_blobs_evaluated = 0;
+ for (int i = 0; i < real_max; ++i) {
+ // Stop early once the detector is confident, but only after the
+ // minimum number of characters has been evaluated.
+ if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
+ && i > minCharactersToTry) {
+ break;
+ }
+ ++num_blobs_evaluated;
+ }
+ delete [] blobs;
+
+ // Make sure the best_result is up-to-date
+ int orientation = o.get_orientation();
+ osr->update_best_script(orientation);
+ return num_blobs_evaluated;
+}
+
+// Processes a single blob to estimate script and orientation.
+// Return true if estimate of orientation and script satisfies stopping
+// criteria.
+bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
+ ScriptDetector* s, OSResults* osr,
+ tesseract::Tesseract* tess) {
+ tess->tess_cn_matching.set_value(true); // turn it on
+ tess->tess_bn_matching.set_value(false);
+ C_BLOB* blob = bbox->cblob();
+ TBLOB* tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob);
+ TBOX box = tblob->bounding_box();
+ FCOORD current_rotation(1.0f, 0.0f);
+ FCOORD rotation90(0.0f, 1.0f);
+ BLOB_CHOICE_LIST ratings[4];
+ // Test the 4 orientations
+ for (int i = 0; i < 4; ++i) {
+ // Normalize the blob. Set the origin to the place we want to be the
+ // bottom-middle after rotation.
+ // Scaling is to make the rotated height the x-height.
+ float scaling = static_cast<float>(kBlnXHeight) / box.height();
+ float x_origin = (box.left() + box.right()) / 2.0f;
+ float y_origin = (box.bottom() + box.top()) / 2.0f;
+ if (i == 0 || i == 2) {
+ // Rotation is 0 or 180.
+ y_origin = i == 0 ? box.bottom() : box.top();
+ } else {
+ // Rotation is 90 or 270.
+ scaling = static_cast<float>(kBlnXHeight) / box.width();
+ x_origin = i == 1 ? box.left() : box.right();
+ }
+ std::unique_ptr<TBLOB> rotated_blob(new TBLOB(*tblob));
+ rotated_blob->Normalize(nullptr, &current_rotation, nullptr,
+ x_origin, y_origin, scaling, scaling,
+ 0.0f, static_cast<float>(kBlnBaselineOffset),
+ false, nullptr);
+ tess->AdaptiveClassifier(rotated_blob.get(), ratings + i);
+ current_rotation.rotate(rotation90);
+ }
+ delete tblob;
+
+ bool stop = o->detect_blob(ratings);
+ s->detect_blob(ratings);
+ int orientation = o->get_orientation();
+ stop = s->must_stop(orientation) && stop;
+ return stop;
+}
+
+
+OrientationDetector::OrientationDetector(
+ const std::vector<int>* allowed_scripts, OSResults* osr) {
+ osr_ = osr;
+ allowed_scripts_ = allowed_scripts;
+}
+
+// Score the given blob and return true if it is now sure of the orientation
+// after adding this block.
+bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
+ float blob_o_score[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float total_blob_o_score = 0.0f;
+
+ for (int i = 0; i < 4; ++i) {
+ BLOB_CHOICE_IT choice_it(scores + i);
+ if (!choice_it.empty()) {
+ BLOB_CHOICE* choice = nullptr;
+ if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
+ // Find the top choice in an allowed script.
+ for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
+ choice == nullptr; choice_it.forward()) {
+ int choice_script = choice_it.data()->script_id();
+ int s = 0;
+ for (s = 0; s < allowed_scripts_->size(); ++s) {
+ if ((*allowed_scripts_)[s] == choice_script) {
+ choice = choice_it.data();
+ break;
+ }
+ }
+ }
+ } else {
+ choice = choice_it.data();
+ }
+ if (choice != nullptr) {
+ // The certainty score ranges between [-20,0]. This is converted here to
+ // [0,1], with 1 indicating best match.
+ blob_o_score[i] = 1 + 0.05 * choice->certainty();
+ total_blob_o_score += blob_o_score[i];
+ }
+ }
+ }
+ if (total_blob_o_score == 0.0) return false;
+ // Fill in any blanks with the worst score of the others. This is better than
+ // picking an arbitrary probability for it and way better than -inf.
+ float worst_score = 0.0f;
+ int num_good_scores = 0;
+ for (float f : blob_o_score) {
+ if (f > 0.0f) {
+ ++num_good_scores;
+ if (worst_score == 0.0f || f < worst_score)
+ worst_score = f;
+ }
+ }
+ if (num_good_scores == 1) {
+ // Lower worst if there is only one.
+ worst_score /= 2.0f;
+ }
+ for (float& f : blob_o_score) {
+ if (f == 0.0f) {
+ f = worst_score;
+ total_blob_o_score += worst_score;
+ }
+ }
+ // Normalize the orientation scores for the blob and use them to
+ // update the aggregated orientation score.
+ for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {
+ osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
+ }
+
+ // TODO(ranjith) Add an early exit test, based on min_orientation_margin,
+ // as used in pagesegmain.cpp.
+ return false;
+}
+
// Recomputes the best orientation from the accumulated scores and returns
// its id (0..3; convert to degrees with OrientationIdToValue).
int OrientationDetector::get_orientation() {
  osr_->update_best_orientation();
  return osr_->best_result.orientation_id;
}
+
+
// Builds a script detector that accumulates per-orientation script scores
// into *osr, optionally restricted to allowed_scripts.
// The ids of the scripts needing special handling (the Fraktur workaround
// and the Japanese/Korean pseudo-scripts) are resolved once here via the
// unicharset, so detect_blob can compare plain ints.
ScriptDetector::ScriptDetector(const std::vector<int>* allowed_scripts,
                               OSResults* osr, tesseract::Tesseract* tess) {
  osr_ = osr;
  tess_ = tess;
  allowed_scripts_ = allowed_scripts;
  katakana_id_ = tess_->unicharset.add_script(katakana_script);
  hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
  han_id_ = tess_->unicharset.add_script(han_script);
  hangul_id_ = tess_->unicharset.add_script(hangul_script);
  japanese_id_ = tess_->unicharset.add_script(japanese_script_);
  korean_id_ = tess_->unicharset.add_script(korean_script_);
  latin_id_ = tess_->unicharset.add_script(latin_script);
  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
}
+
+
// Score the given blob and return true if it is now sure of the script after
// adding this blob.
// For each of the 4 orientations, finds the best classifier choice per
// script; if exactly one script lies within kNonAmbiguousMargin of the best
// certainty, that script's score for the orientation is incremented.
void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
  for (int i = 0; i < 4; ++i) {
    // Marks scripts already seen in this orientation, so only the best
    // (first-encountered) choice per script is considered.
    bool done[kMaxNumberOfScripts] = { false };

    BLOB_CHOICE_IT choice_it;
    choice_it.set_to_list(scores + i);

    float prev_score = -1;      // Negated certainty of the first accepted match.
    int script_count = 0;       // Scripts within kNonAmbiguousMargin of it.
    int prev_id = -1;           // Script id of the first accepted match.
    int prev_fontinfo_id = -1;  // Font of the first accepted match.
    const char* prev_unichar = "";
    const char* unichar = "";

    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      int id = choice->script_id();
      if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
        // Check that the choice is in an allowed script.
        int s = 0;
        for (s = 0; s < allowed_scripts_->size(); ++s) {
          if ((*allowed_scripts_)[s] == id) break;
        }
        if (s == allowed_scripts_->size()) continue;  // Not found in list.
      }
      // Script already processed before.
      if (done[id]) continue;
      done[id] = true;

      unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
      // Save data from the first match
      if (prev_score < 0) {
        prev_score = -choice->certainty();
        script_count = 1;
        prev_id = id;
        prev_unichar = unichar;
        prev_fontinfo_id = choice->fontinfo_id();
      } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
        ++script_count;
      }

      // NOTE(review): once the top match is a single character and a digit
      // turns up, scoring of this orientation stops early — presumably
      // because digits are shared across scripts; confirm.
      if (strlen(prev_unichar) == 1)
        if (unichar[0] >= '0' && unichar[0] <= '9')
          break;

      // if script_count is >= 2, character is ambiguous, skip other matches
      // since they are useless.
      if (script_count >= 2)
        break;
    }
    // Character is non ambiguous
    if (script_count == 1) {
      // Update the score of the winning script
      osr_->scripts_na[i][prev_id] += 1.0;

      // Workaround for Fraktur
      if (prev_id == latin_id_) {
        if (prev_fontinfo_id >= 0) {
          const tesseract::FontInfo &fi =
              tess_->get_fontinfo_table().get(prev_fontinfo_id);
          //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
          //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
          //       fi.is_serif(), fi.is_fraktur(),
          //       prev_unichar);
          if (fi.is_fraktur()) {
            // Re-attribute the hit from Latin to the Fraktur pseudo-script.
            osr_->scripts_na[i][prev_id] -= 1.0;
            osr_->scripts_na[i][fraktur_id_] += 1.0;
          }
        }
      }

      // Update Japanese / Korean pseudo-scripts
      if (prev_id == katakana_id_)
        osr_->scripts_na[i][japanese_id_] += 1.0;
      if (prev_id == hiragana_id_)
        osr_->scripts_na[i][japanese_id_] += 1.0;
      if (prev_id == hangul_id_)
        osr_->scripts_na[i][korean_id_] += 1.0;
      if (prev_id == han_id_) {
        // Han characters also count fractionally toward Japanese/Korean.
        osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
        osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
      }
    }
  }  // iterate over each orientation
}
+
// Returns true once the script confidence for the given orientation is high
// enough (> 1) that evaluating further blobs is unnecessary.
bool ScriptDetector::must_stop(int orientation) {
  osr_->update_best_script(orientation);
  return osr_->best_result.sconfidence > 1;
}
+
// Helper method to convert an orientation index to its value in degrees.
// The value represents the amount of clockwise rotation in degrees that must
// be applied for the text to be upright (readable).
int OrientationIdToValue(const int& id) {
  // Ids 0..3 map to clockwise rotations of 0, 270, 180 and 90 degrees;
  // any other id is unknown and yields -1.
  static const int kRotationDegrees[4] = {0, 270, 180, 90};
  return (id >= 0 && id < 4) ? kRotationDegrees[id] : -1;
}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/output.cpp b/tesseract/src/ccmain/output.cpp
new file mode 100644
index 00000000..d043e03a
--- /dev/null
+++ b/tesseract/src/ccmain/output.cpp
@@ -0,0 +1,418 @@
+/******************************************************************
+ * File: output.cpp (Formerly output.c)
+ * Description: Output pass
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "output.h"
+
+#include "control.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "docqual.h"
+#include "reject.h"
+#endif
+
+#include "helpers.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+
+#define CTRL_NEWLINE '\012' //newline
+#define CTRL_HARDLINE '\015' //cr
+
+namespace tesseract {
// Final output pass: walks every recognized word on the page and hands it to
// write_results together with the newline type that should follow it.
// If target_word_box is non-null, only words whose bounding-box center lies
// inside the box are output.
void Tesseract::output_pass(  //Tess output pass //send to api
    PAGE_RES_IT &page_res_it,
    const TBOX *target_word_box) {
  BLOCK_RES *block_of_last_word;
  bool force_eol;               //During output
  BLOCK *nextblock;             //block of next word
  WERD *nextword;               //next word

  page_res_it.restart_page ();
  block_of_last_word = nullptr;
  while (page_res_it.word () != nullptr) {
    check_debug_pt (page_res_it.word (), 120);

    // Skip words outside the target box (when one is given).
    if (target_word_box) {
      TBOX current_word_box = page_res_it.word()->word->bounding_box();
      FCOORD center_pt(
          (current_word_box.right() + current_word_box.left()) / 2,
          (current_word_box.bottom() + current_word_box.top()) / 2);
      if (!target_word_box->contains(center_pt)) {
        page_res_it.forward();
        continue;
      }
    }
    if (tessedit_write_block_separators &&
        block_of_last_word != page_res_it.block ()) {
      block_of_last_word = page_res_it.block ();
    }

    // Force an end-of-line at block boundaries and after the last word.
    force_eol = (tessedit_write_block_separators &&
                 (page_res_it.block () != page_res_it.next_block ())) ||
                (page_res_it.next_word () == nullptr);

    if (page_res_it.next_word () != nullptr)
      nextword = page_res_it.next_word ()->word;
    else
      nextword = nullptr;
    if (page_res_it.next_block () != nullptr)
      nextblock = page_res_it.next_block ()->block;
    else
      nextblock = nullptr;
    //regardless of tilde crunching
    write_results(page_res_it,
                  determine_newline_type(page_res_it.word()->word,
                                         page_res_it.block()->block,
                                         nextword, nextblock), force_eol);
    page_res_it.forward();
  }
}
+
+
/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file - giving the final best choices with NO highlighting
 *   .raw file - giving the tesseract top choice output for each word
 *   .map file - showing how the .txt file has been rejected in the .ep file
 *   epchoice list - a list of one element per word, containing the text for
 *                   the epaper. Reject strings are inserted.
 *   inset list - a list of bounding boxes of reject insets - indexed by the
 *                reject strings in the epchoice text.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT& page_res_it,
                              char newline_type,  // type of newline
                              bool force_eol) {   // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  int i;
  bool need_reject = false;
  UNICHAR_ID space = uchset.unichar_to_id(" ");

  // TILDE-CRUNCHED WORDS: crunched (or empty) words are not output normally;
  // only the stats_ flags are maintained, so a single reject character per
  // crunched run is represented.
  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)))) {
      if (!word->word->flag (W_BOL) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)) {
        // A real (unfuzzy) space ends the current tilde run.
        stats_.last_char_was_tilde = false;
      }
      need_reject = true;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = true;
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      // Reset the run-tracking flags at every emitted line end.
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
       words have been removed */
    word->MergeAdjacentBlobs(0);
  }
  if (newline_type ||
      (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length () > 0) {
      // NOTE(review): a trailing space sets the tilde flag — presumably
      // spaces count as tilde-like for the adjacency rule above; confirm.
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    }
    else if (word->word->space () > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt (word, 120);
  if (tessedit_rejection_debug) {
    tprintf ("Dict word: \"%s\": %d\n",
             word->best_choice->debug_string().c_str(),
             dict_word(*(word->best_choice)));
  }
  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}
+
+/**********************************************************************
+ * determine_newline_type
+ *
+ * Find whether we have a wrapping or hard newline.
+ * Return false if not at end of line.
+ **********************************************************************/
+
+char determine_newline_type( //test line ends
+ WERD *word, //word to do
+ BLOCK *block, //current block
+ WERD *next_word, //next word
+ BLOCK *next_block //block of next word
+ ) {
+ int16_t end_gap; //to right edge
+ int16_t width; //of next word
+ TBOX word_box; //bounding
+ TBOX next_box; //next word
+ TBOX block_box; //block bounding
+
+ if (!word->flag (W_EOL))
+ return false; //not end of line
+ if (next_word == nullptr || next_block == nullptr || block != next_block)
+ return CTRL_NEWLINE;
+ if (next_word->space () > 0)
+ return CTRL_HARDLINE; //it is tabbed
+ word_box = word->bounding_box ();
+ next_box = next_word->bounding_box ();
+ block_box = block->pdblk.bounding_box ();
+ //gap to eol
+ end_gap = block_box.right () - word_box.right ();
+ end_gap -= static_cast<int32_t>(block->space ());
+ width = next_box.right () - next_box.left ();
+ // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
+ // block_box.right(),word_box.right(),end_gap,
+ // next_box.right(),next_box.left(),width,
+ // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
+ return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
+}
+
+/*************************************************************************
+ * get_rep_char()
+ * Return the first accepted character from the repetition string. This is the
+ * character which is repeated - as determined earlier by fix_rep_char()
+ *************************************************************************/
+UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
+ int i;
+ for (i = 0; ((i < word->reject_map.length()) &&
+ (word->reject_map[i].rejected())); ++i);
+
+ if (i < word->reject_map.length()) {
+ return word->best_choice->unichar_id(i);
+ } else {
+ return word->uch_set->unichar_to_id(unrecognised_char.c_str());
+ }
+}
+
/*************************************************************************
 * SUSPECT LEVELS
 *
 * 0 - don't reject ANYTHING
 * 1,2 - partial rejection
 * 3 - BEST
 *
 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
 * tessedit_minimal_rejection.
 *************************************************************************/
void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
  int len = word_res->reject_map.length();
  const WERD_CHOICE &word = *(word_res->best_choice);
  const UNICHARSET &uchset = *word.unicharset();
  int i;
  float rating_per_ch;

  if (suspect_level == 0) {
    // Level 0: accept every previously rejected character.
    for (i = 0; i < len; i++) {
      if (word_res->reject_map[i].rejected())
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
    return;
  }

  if (suspect_level >= 3)
    return;  //Use defaults

  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

  if (safe_dict_word(word_res) &&
      (count_alphas(word) > suspect_short_words)) {
    /* Unreject alphas in dictionary words */
    for (i = 0; i < len; ++i) {
      if (word_res->reject_map[i].rejected() &&
          uchset.get_isalpha(word.unichar_id(i)))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  // NOTE(review): len == 0 would divide by zero here (float -> +inf);
  // presumably callers never pass an empty reject map — confirm.
  rating_per_ch = word.rating() / word_res->reject_map.length();

  if (rating_per_ch >= suspect_rating_per_ch)
    return;  // Don't touch bad ratings

  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
    for (i = 0; i < len; ++i) {
      if (word_res->reject_map[i].rejected() &&
          (!uchset.eq(word.unichar_id(i), " ")))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  // Document-, block- and row-level rejections are unrejected at levels 1-2.
  for (i = 0; i < len; i++) {
    if (word_res->reject_map[i].rejected()) {
      if (word_res->reject_map[i].flag(R_DOC_REJ))
        word_res->reject_map[i].setrej_minimal_rej_accept();
      if (word_res->reject_map[i].flag(R_BLOCK_REJ))
        word_res->reject_map[i].setrej_minimal_rej_accept();
      if (word_res->reject_map[i].flag(R_ROW_REJ))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  if (suspect_level == 2)
    return;

  /* Everything below applies to level 1 only. */

  if (!suspect_constrain_1Il ||
      (word_res->reject_map.length() <= suspect_short_words)) {
    // Unreject 1/I/l-confusion and post-NN rejections on short words (or
    // everywhere when the constraint is disabled).
    for (i = 0; i < len; i++) {
      if (word_res->reject_map[i].rejected()) {
        if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
             word_res->reject_map[i].flag(R_POSTNN_1IL)))
          word_res->reject_map[i].setrej_minimal_rej_accept();

        if (!suspect_constrain_1Il &&
            word_res->reject_map[i].flag(R_MM_REJECT))
          word_res->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }

  if (acceptable_word_string(*word_res->uch_set,
                             word.unichar_string().c_str(),
                             word.unichar_lengths().c_str()) !=
          AC_UNACCEPTABLE ||
      acceptable_number_string(word.unichar_string().c_str(),
                               word.unichar_lengths().c_str())) {
    if (word_res->reject_map.length() > suspect_short_words) {
      // Long acceptable word/number: unreject everything except permanent
      // rejections (other than the specific flags listed).
      for (i = 0; i < len; i++) {
        if (word_res->reject_map[i].rejected() &&
            (!word_res->reject_map[i].perm_rejected() ||
             word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
             word_res->reject_map[i].flag (R_POSTNN_1IL) ||
             word_res->reject_map[i].flag (R_MM_REJECT))) {
          word_res->reject_map[i].setrej_minimal_rej_accept();
        }
      }
    }
  }
}
+
+int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
+ int count = 0;
+ for (int i = 0; i < word.length(); ++i) {
+ if (word.unicharset()->get_isalpha(word.unichar_id(i)))
+ count++;
+ }
+ return count;
+}
+
+
+int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
+ int count = 0;
+ for (int i = 0; i < word.length(); ++i) {
+ if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
+ word.unicharset()->get_isdigit(word.unichar_id(i)))
+ count++;
+ }
+ return count;
+}
+
+
// Returns true if the text looks like an acceptable "number": digits possibly
// separated by single '.', ',' or '-' characters, optionally prefixed by a
// single '(', '$', '.', '+' or '-', and optionally terminated by '%', ')' or
// "%)" directly after a digit.
// s is the text and lengths gives the per-character stride used to advance
// through s (as produced by WERD_CHOICE::unichar_lengths() at the call site).
bool Tesseract::acceptable_number_string(const char* s,
                                         const char* lengths) {
  bool prev_digit = false;

  // Skip an optional opening parenthesis.
  if (*lengths == 1 && *s == '(')
    s++;

  // Skip an optional sign/currency/decimal prefix.
  if (*lengths == 1 &&
      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
    s++;

  // Walk the remaining characters; s advances by each character's length.
  for (; *s != '\0'; s += *(lengths++)) {
    if (unicharset.get_isdigit(s, *lengths))
      prev_digit = true;
    // A single separator is allowed only directly after a digit.
    else if (prev_digit &&
             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
      prev_digit = false;
    // '%' or ')' is accepted only as the very last character.
    else if (prev_digit && *lengths == 1 &&
             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
      return true;
    // "%)" is accepted as the final two characters.
    else if (prev_digit &&
             *lengths == 1 && (*s == '%') &&
             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
             (*(s + *lengths + *(lengths + 1)) == '\0'))
      return true;
    else
      return false;
  }
  return true;
}
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/output.h b/tesseract/src/ccmain/output.h
new file mode 100644
index 00000000..00f59466
--- /dev/null
+++ b/tesseract/src/ccmain/output.h
@@ -0,0 +1,37 @@
+/******************************************************************
+ * File: output.h (Formerly output.h)
+ * Description: Output pass
+ * Author: Phil Cheatle
+ * Created: Thu Aug 4 10:56:08 BST 1994
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef OUTPUT_H
+#define OUTPUT_H
+
+namespace tesseract {
+
+class BLOCK;
+class WERD;
+
+/** test line ends */
+char determine_newline_type(WERD *word, ///< word to do
+ BLOCK *block, ///< current block
+ WERD *next_word, ///< next word
+ BLOCK *next_block ///< block of next word
+ );
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/ccmain/pageiterator.cpp b/tesseract/src/ccmain/pageiterator.cpp
new file mode 100644
index 00000000..75dd9b40
--- /dev/null
+++ b/tesseract/src/ccmain/pageiterator.cpp
@@ -0,0 +1,635 @@
+///////////////////////////////////////////////////////////////////////
+// File: pageiterator.cpp
+// Description: Iterator for tesseract page structure that avoids using
+// tesseract internal data structures.
+// Author: Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/pageiterator.h>
+#include "allheaders.h"
+#include "helpers.h"
+#include "pageres.h"
+#include "tesseractclass.h"
+
+#include <algorithm>
+
+namespace tesseract {
+
// Constructs an iterator over page_res, recording the scale and rectangle
// parameters needed to map tesseract-internal coordinates back to the
// original image. Takes no ownership of page_res or tesseract.
PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
                           int scaled_yres, int rect_left, int rect_top,
                           int rect_width, int rect_height)
    : page_res_(page_res),
      tesseract_(tesseract),
      word_(nullptr),
      word_length_(0),
      blob_index_(0),
      cblob_it_(nullptr),
      include_upper_dots_(false),
      include_lower_dots_(false),
      scale_(scale),
      scaled_yres_(scaled_yres),
      rect_left_(rect_left),
      rect_top_(rect_top),
      rect_width_(rect_width),
      rect_height_(rect_height) {
  it_ = new PAGE_RES_IT(page_res);
  // Explicit qualification: avoid dispatching to a derived-class override
  // while still inside the base-class constructor.
  PageIterator::Begin();
}
+
// Releases the owned low-level iterators (deleting nullptr is a no-op, so
// cblob_it_ may legitimately still be unset).
PageIterator::~PageIterator() {
  delete it_;
  delete cblob_it_;
}
+
/**
 * PageIterators may be copied! This makes it possible to iterate over
 * all the objects at a lower level, while maintaining an iterator to
 * objects at a higher level.
 */
PageIterator::PageIterator(const PageIterator& src)
    : page_res_(src.page_res_),
      tesseract_(src.tesseract_),
      word_(nullptr),
      word_length_(src.word_length_),
      blob_index_(src.blob_index_),
      cblob_it_(nullptr),
      include_upper_dots_(src.include_upper_dots_),
      include_lower_dots_(src.include_lower_dots_),
      scale_(src.scale_),
      scaled_yres_(src.scaled_yres_),
      rect_left_(src.rect_left_),
      rect_top_(src.rect_top_),
      rect_width_(src.rect_width_),
      rect_height_(src.rect_height_) {
  it_ = new PAGE_RES_IT(*src.it_);
  // word_ and cblob_it_ are deliberately not copied: BeginWord re-derives
  // the word-level state for the copied position.
  BeginWord(src.blob_index_);
}
+
+const PageIterator& PageIterator::operator=(const PageIterator& src) {
+ page_res_ = src.page_res_;
+ tesseract_ = src.tesseract_;
+ include_upper_dots_ = src.include_upper_dots_;
+ include_lower_dots_ = src.include_lower_dots_;
+ scale_ = src.scale_;
+ scaled_yres_ = src.scaled_yres_;
+ rect_left_ = src.rect_left_;
+ rect_top_ = src.rect_top_;
+ rect_width_ = src.rect_width_;
+ rect_height_ = src.rect_height_;
+ delete it_;
+ it_ = new PAGE_RES_IT(*src.it_);
+ BeginWord(src.blob_index_);
+ return *this;
+}
+
// Returns true if this iterator refers to the same word position as *other.
// Two null iterators compare equal; otherwise both must be non-null and the
// underlying PAGE_RES_ITs must compare equal.
bool PageIterator::PositionedAtSameWord(const PAGE_RES_IT* other) const {
  return (it_ == nullptr && it_ == other) ||
         ((other != nullptr) && (it_ != nullptr) && (*it_ == *other));
}
+
+// ============= Moving around within the page ============.
+
/** Resets the iterator to point to the start of the page. */
void PageIterator::Begin() {
  // "with_empties" so that non-text (image) blocks are also visited.
  it_->restart_page_with_empties();
  BeginWord(0);
}
+
// Moves the iterator back to the first word of the paragraph containing the
// current position, by scanning paragraph starts from the top of the page.
void PageIterator::RestartParagraph() {
  if (it_->block() == nullptr) return;  // At end of the document.
  PAGE_RES_IT para(page_res_);
  PAGE_RES_IT next_para(para);
  next_para.forward_paragraph();
  // Advance para while the following paragraph still starts at or before the
  // current position.
  while (next_para.cmp(*it_) <= 0) {
    para = next_para;
    next_para.forward_paragraph();
  }
  *it_ = para;
  BeginWord(0);
}
+
// Returns true if the current position lies on the first text line of its
// paragraph, determined by rewinding a copy to the paragraph start and
// comparing rows.
bool PageIterator::IsWithinFirstTextlineOfParagraph() const {
  PageIterator p_start(*this);
  p_start.RestartParagraph();
  return p_start.it_->row() == it_->row();
}
+
// Moves the iterator back to the first word of the current text line.
void PageIterator::RestartRow() {
  it_->restart_row();
  BeginWord(0);
}
+
/**
 * Moves to the start of the next object at the given level in the
 * page hierarchy, and returns false if the end of the page was reached.
 * NOTE (CHANGED!) that ALL PageIteratorLevel level values will visit each
 * non-text block at least once.
 * Think of non text blocks as containing a single para, with at least one
 * line, with a single imaginary word, containing a single symbol.
 * The bounding boxes mark out any polygonal nature of the block, and
 * PTIsTextType(BLockType()) is false for non-text blocks.
 * Calls to Next with different levels may be freely intermixed.
 * This function iterates words in right-to-left scripts correctly, if
 * the appropriate language has been loaded into Tesseract.
 */
bool PageIterator::Next(PageIteratorLevel level) {
  if (it_->block() == nullptr) return false;  // Already at the end!
  if (it_->word() == nullptr)
    level = RIL_BLOCK;  // Non-text block: only whole-block steps make sense.

  switch (level) {
    case RIL_BLOCK:
      it_->forward_block();
      break;
    case RIL_PARA:
      it_->forward_paragraph();
      break;
    case RIL_TEXTLINE:
      // Step word by word until the row changes.
      for (it_->forward_with_empties(); it_->row() == it_->prev_row();
           it_->forward_with_empties());
      break;
    case RIL_WORD:
      it_->forward_with_empties();
      break;
    case RIL_SYMBOL:
      if (cblob_it_ != nullptr)
        cblob_it_->forward();
      ++blob_index_;
      if (blob_index_ >= word_length_)
        it_->forward_with_empties();  // Ran off the end of the word.
      else
        return true;  // Still in the same word: no BeginWord needed.
      break;
  }
  BeginWord(0);
  return it_->block() != nullptr;
}
+
/**
 * Returns true if the iterator is at the start of an object at the given
 * level. Possible uses include determining if a call to Next(RIL_WORD)
 * moved to the start of a RIL_PARA.
 * A position is at the beginning of a level iff it sits on the first symbol
 * (blob_index_ == 0) of a word whose parent at that level differs from the
 * previous word's parent.
 */
bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
  if (it_->block() == nullptr) return false;  // Already at the end!
  if (it_->word() == nullptr) return true;    // In an image block.
  switch (level) {
    case RIL_BLOCK:
      return blob_index_ == 0 && it_->block() != it_->prev_block();
    case RIL_PARA:
      return blob_index_ == 0 &&
             (it_->block() != it_->prev_block() ||
              it_->row()->row->para() != it_->prev_row()->row->para());
    case RIL_TEXTLINE:
      return blob_index_ == 0 && it_->row() != it_->prev_row();
    case RIL_WORD:
      return blob_index_ == 0;
    case RIL_SYMBOL:
      return true;
  }
  return false;
}
+
/**
 * Returns whether the iterator is positioned at the last element in a
 * given level. (e.g. the last word in a line, the last line in a block)
 */
bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
                                    PageIteratorLevel element) const {
  if (Empty(element)) return true;  // Already at the end!
  // The result is true if we step forward by element and find we are
  // at the the end of the page or at beginning of *all* levels in:
  // [level, element).
  // When there is more than one level difference between element and level,
  // we could for instance move forward one symbol and still be at the first
  // word on a line, so we also have to be at the first symbol in a word.
  PageIterator next(*this);
  next.Next(element);
  if (next.Empty(element)) return true;  // Reached the end of the page.
  // Walk down from element towards level, requiring "at beginning" at every
  // intermediate granularity.
  while (element > level) {
    element = static_cast<PageIteratorLevel>(element - 1);
    if (!next.IsAtBeginningOf(element))
      return false;
  }
  return true;
}
+
+/**
+ * Returns whether this iterator is positioned
+ * before other: -1
+ * equal to other: 0
+ * after other: 1
+ */
+int PageIterator::Cmp(const PageIterator &other) const {
+ int word_cmp = it_->cmp(*other.it_);
+ if (word_cmp != 0)
+ return word_cmp;
+ if (blob_index_ < other.blob_index_)
+ return -1;
+ if (blob_index_ == other.blob_index_)
+ return 0;
+ return 1;
+}
+
+// ============= Accessing data ==============.
+// Coordinate system:
+// Integer coordinates are at the cracks between the pixels.
+// The top-left corner of the top-left pixel in the image is at (0,0).
+// The bottom-right corner of the bottom-right pixel in the image is at
+// (width, height).
+// Every bounding box goes from the top-left of the top-left contained
+// pixel to the bottom-right of the bottom-right contained pixel, so
+// the bounding box of the single top-left pixel in the image is:
+// (0,0)->(1,1).
+// If an image rectangle has been set in the API, then returned coordinates
+// relate to the original (full) image, rather than the rectangle.
+
+/**
+ * Returns the bounding rectangle of the current object at the given level in
+ * the coordinates of the working image that is pix_binary().
+ * See comment on coordinate system above.
+ * Returns false if there is no such object at the current position.
+ */
+bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
+                                       int* left, int* top,
+                                       int* right, int* bottom) const {
+  if (Empty(level))
+    return false;
+  TBOX box;
+  PARA *para = nullptr;
+  switch (level) {
+    case RIL_BLOCK:
+      box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
+                                                         include_lower_dots_);
+      break;
+    case RIL_PARA:
+      // A paragraph has no box of its own: remember which PARA we are in and
+      // start from the current line's box; the union over all of the
+      // paragraph's lines is accumulated below.
+      para = it_->row()->row->para();
+      // Fall through.
+    case RIL_TEXTLINE:
+      box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
+                                                     include_lower_dots_);
+      break;
+    case RIL_WORD:
+      box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
+                                                       include_lower_dots_);
+      break;
+    case RIL_SYMBOL:
+      // After recognition symbols come from box_word; before recognition a
+      // "symbol" is a cblob (see BeginWord).
+      if (cblob_it_ == nullptr)
+        box = it_->word()->box_word->BlobBox(blob_index_);
+      else
+        box = cblob_it_->data()->bounding_box();
+  }
+  if (level == RIL_PARA) {
+    // Scan the whole page for other textlines in the same block that belong
+    // to the same PARA, and union their boxes in.
+    PageIterator other = *this;
+    other.Begin();
+    do {
+      if (other.it_->block() &&
+          other.it_->block()->block == it_->block()->block &&
+          other.it_->row() && other.it_->row()->row &&
+          other.it_->row()->row->para() == para) {
+        box = box.bounding_union(other.it_->row()->row->bounding_box());
+      }
+    } while (other.Next(RIL_TEXTLINE));
+  }
+  // box_word symbol boxes skip the rotation here (presumably already in
+  // image coordinates -- TODO confirm); everything else still needs the
+  // block's re_rotation applied.
+  if (level != RIL_SYMBOL || cblob_it_ != nullptr)
+    box.rotate(it_->block()->block->re_rotation());
+  // Now we have a box in tesseract coordinates relative to the image rectangle,
+  // we have to convert the coords to a top-down system.
+  const int pix_height = pixGetHeight(tesseract_->pix_binary());
+  const int pix_width = pixGetWidth(tesseract_->pix_binary());
+  *left = ClipToRange(static_cast<int>(box.left()), 0, pix_width);
+  *top = ClipToRange(pix_height - box.top(), 0, pix_height);
+  *right = ClipToRange(static_cast<int>(box.right()), *left, pix_width);
+  *bottom = ClipToRange(pix_height - box.bottom(), *top, pix_height);
+  return true;
+}
+
+/**
+ * Returns the bounding rectangle of the current object at the given level in
+ * coordinates of the original image.
+ * See comment on coordinate system above.
+ * Returns false if there is no such object at the current position.
+ */
+bool PageIterator::BoundingBox(PageIteratorLevel level,
+                               int* left, int* top,
+                               int* right, int* bottom) const {
+  // Delegate to the padded overload with zero padding.
+  const int kNoPadding = 0;
+  return BoundingBox(level, kNoPadding, left, top, right, bottom);
+}
+
+// As BoundingBox above, but expands the result by the given padding on all
+// sides, clipped to the image rectangle, in original-image coordinates.
+bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,
+                               int* left, int* top,
+                               int* right, int* bottom) const {
+  if (!BoundingBoxInternal(level, left, top, right, bottom))
+    return false;
+  // Convert to the coordinate system of the original image.
+  // left/top round down; right/bottom round up ((x + scale_ - 1) / scale_)
+  // so the downscaled box still covers the whole object.
+  *left = ClipToRange(*left / scale_ + rect_left_ - padding,
+                      rect_left_, rect_left_ + rect_width_);
+  *top = ClipToRange(*top / scale_ + rect_top_ - padding,
+                     rect_top_, rect_top_ + rect_height_);
+  *right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,
+                       *left, rect_left_ + rect_width_);
+  *bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,
+                        *top, rect_top_ + rect_height_);
+  return true;
+}
+
+/** Return that there is no such object at a given level. */
+bool PageIterator::Empty(PageIteratorLevel level) const {
+  if (it_->block() == nullptr) return true;  // Past the last block.
+  // An image block has no words, so every sub-block level is empty there.
+  if (level != RIL_BLOCK && it_->word() == nullptr) return true;
+  // A zero-length word, or a blob index past the end, has no symbol.
+  return level == RIL_SYMBOL && blob_index_ >= word_length_;
+}
+
+/** Returns the type of the current block.
+ * See tesseract/publictypes.h for PolyBlockType. */
+PolyBlockType PageIterator::BlockType() const {
+  if (it_->block() == nullptr || it_->block()->block == nullptr) {
+    return PT_UNKNOWN;  // Iterator has run off the end of the page.
+  }
+  auto* poly = it_->block()->block->pdblk.poly_block();
+  // Without layout analysis there is no polygon; treat as flowing text.
+  return poly == nullptr ? PT_FLOWING_TEXT : poly->isA();
+}
+
+/** Returns the polygon outline of the current block. The returned Pta must
+ * be ptaDestroy-ed after use. Returns nullptr at the end of the page or when
+ * no layout analysis was performed (no polygon available). */
+Pta* PageIterator::BlockPolygon() const {
+  if (it_->block() == nullptr || it_->block()->block == nullptr)
+    return nullptr;  // Already at the end!
+  if (it_->block()->block->pdblk.poly_block() == nullptr)
+    return nullptr;  // No layout analysis used - no polygon.
+  // Copy polygon, so we can unrotate it to image coordinates.
+  POLY_BLOCK* internal_poly = it_->block()->block->pdblk.poly_block();
+  ICOORDELT_LIST vertices;
+  vertices.deep_copy(internal_poly->points(), ICOORDELT::deep_copy);
+  POLY_BLOCK poly(&vertices, internal_poly->isA());
+  poly.rotate(it_->block()->block->re_rotation());
+  ICOORDELT_IT it(poly.points());
+  Pta* pta = ptaCreate(it.length());
+  // (Removed dead local num_pts: it was incremented every iteration but
+  // never read.)
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    ICOORD* pt = it.data();
+    // Convert to top-down coords within the input image.
+    int x = static_cast<float>(pt->x()) / scale_ + rect_left_;
+    int y = rect_top_ + rect_height_ - static_cast<float>(pt->y()) / scale_;
+    x = ClipToRange(x, rect_left_, rect_left_ + rect_width_);
+    y = ClipToRange(y, rect_top_, rect_top_ + rect_height_);
+    ptaAddPt(pta, x, y);
+  }
+  return pta;
+}
+
+/**
+ * Returns a binary image of the current object at the given level.
+ * The position and size match the return from BoundingBoxInternal, and so this
+ * could be upscaled with respect to the original input image.
+ * Use pixDestroy to delete the image after use.
+ * The following methods are used to generate the images:
+ * RIL_BLOCK: mask the page image with the block polygon.
+ * RIL_TEXTLINE: Clip the rectangle of the line box from the page image.
+ * TODO(rays) fix this to generate and use a line polygon.
+ * RIL_WORD: Clip the rectangle of the word box from the page image.
+ * RIL_SYMBOL: Render the symbol outline to an image for cblobs (prior
+ * to recognition) or the bounding box otherwise.
+ * A reconstruction of the original image (using xor to check for double
+ * representation) should be reasonably accurate,
+ * apart from removed noise, at the block level. Below the block level, the
+ * reconstruction will be missing images and line separators.
+ * At the symbol level, kerned characters will invade the bounding box
+ * if rendered after recognition, making an xor reconstruction inaccurate, but
+ * an or-reconstruction better. Before recognition, symbol-level
+ * reconstruction should be good, even with xor, since the images come from
+ * the connected components.
+ */
+Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
+  int left, top, right, bottom;
+  if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
+    return nullptr;
+  // Pre-recognition symbols with a real outline are rendered directly from
+  // the cblob instead of being clipped from the page.
+  if (level == RIL_SYMBOL && cblob_it_ != nullptr &&
+      cblob_it_->data()->area() != 0)
+    return cblob_it_->data()->render();
+  Box* box = boxCreate(left, top, right - left, bottom - top);
+  Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, nullptr);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    // Registration offset of the mask relative to the clipped rectangle:
+    // mask_box.top() is bottom-up, so flip via the image height first.
+    int mask_x = left - mask_box.left();
+    int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
+    // AND the mask and pix, putting the result in pix.
+    pixRasterop(pix, std::max(0, -mask_x), std::max(0, -mask_y), pixGetWidth(pix),
+                pixGetHeight(pix), PIX_SRC & PIX_DST, mask, std::max(0, mask_x),
+                std::max(0, mask_y));
+    pixDestroy(&mask);
+  }
+  return pix;
+}
+
+/**
+ * Returns an image of the current object at the given level in greyscale
+ * if available in the input. To guarantee a binary image use BinaryImage.
+ * NOTE that in order to give the best possible image, the bounds are
+ * expanded slightly over the binary connected component, by the supplied
+ * padding, so the top-left position of the returned image is returned
+ * in (left,top). These will most likely not match the coordinates
+ * returned by BoundingBox.
+ * If you do not supply an original image, you will get a binary one.
+ * Use pixDestroy to delete the image after use.
+ */
+Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
+                            Pix* original_img,
+                            int* left, int* top) const {
+  int right, bottom;
+  if (!BoundingBox(level, left, top, &right, &bottom))
+    return nullptr;
+  if (original_img == nullptr)
+    return GetBinaryImage(level);
+
+  // Expand the box.
+  *left = std::max(*left - padding, 0);
+  *top = std::max(*top - padding, 0);
+  right = std::min(right + padding, rect_width_);
+  bottom = std::min(bottom + padding, rect_height_);
+  Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
+  Pix* grey_pix = pixClipRectangle(original_img, box, nullptr);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    // Copy the mask registered correctly into an image the size of grey_pix.
+    int mask_x = *left - mask_box.left();
+    int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
+    int width = pixGetWidth(grey_pix);
+    int height = pixGetHeight(grey_pix);
+    Pix* resized_mask = pixCreate(width, height, 1);
+    pixRasterop(resized_mask, std::max(0, -mask_x), std::max(0, -mask_y), width, height,
+                PIX_SRC, mask, std::max(0, mask_x), std::max(0, mask_y));
+    pixDestroy(&mask);
+    // Grow the mask by the padding so the padded border survives, then
+    // invert it and white out everything outside the block region.
+    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
+                   2 * padding + 1);
+    pixInvert(resized_mask, resized_mask);
+    pixSetMasked(grey_pix, resized_mask, UINT32_MAX);
+    pixDestroy(&resized_mask);
+  }
+  return grey_pix;
+}
+
+/**
+ * Returns the baseline of the current object at the given level.
+ * The baseline is the line that passes through (x1, y1) and (x2, y2).
+ * WARNING: with vertical text, baselines may be vertical!
+ */
+bool PageIterator::Baseline(PageIteratorLevel level,
+                            int* x1, int* y1, int* x2, int* y2) const {
+  if (it_->word() == nullptr) return false;  // Already at the end!
+  ROW* row = it_->row()->row;
+  WERD* word = it_->word()->word;
+  // Span the word box for word/symbol levels, otherwise the whole row box.
+  TBOX box = (level == RIL_WORD || level == RIL_SYMBOL)
+                 ? word->bounding_box()
+                 : row->bounding_box();
+  int left = box.left();
+  // base_line(x) gives the baseline y at x; +0.5 rounds to nearest int.
+  ICOORD startpt(left, static_cast<int16_t>(row->base_line(left) + 0.5));
+  int right = box.right();
+  ICOORD endpt(right, static_cast<int16_t>(row->base_line(right) + 0.5));
+  // Rotate to image coordinates and convert to global image coords.
+  startpt.rotate(it_->block()->block->re_rotation());
+  endpt.rotate(it_->block()->block->re_rotation());
+  // Scale into the original image and flip y to the top-down system.
+  *x1 = startpt.x() / scale_ + rect_left_;
+  *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
+  *x2 = endpt.x() / scale_ + rect_left_;
+  *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
+  return true;
+}
+
+// Reports the orientation, writing direction, textline order and deskew
+// angle of the current block.
+void PageIterator::Orientation(tesseract::Orientation *orientation,
+                               tesseract::WritingDirection *writing_direction,
+                               tesseract::TextlineOrder *textline_order,
+                               float *deskew_angle) const {
+  BLOCK* block = it_->block()->block;
+
+  // Orientation: track where the "up" unit vector ends up after undoing the
+  // classify rotation and applying the block's re-rotation.
+  FCOORD up_in_image(0.0, 1.0);
+  up_in_image.unrotate(block->classify_rotation());
+  up_in_image.rotate(block->re_rotation());
+
+  if (up_in_image.x() == 0.0F) {
+    *orientation = up_in_image.y() > 0.0F ? ORIENTATION_PAGE_UP
+                                          : ORIENTATION_PAGE_DOWN;
+  } else if (up_in_image.x() > 0.0F) {
+    *orientation = ORIENTATION_PAGE_RIGHT;
+  } else {
+    *orientation = ORIENTATION_PAGE_LEFT;
+  }
+
+  // Writing direction: vertical script implies top-to-bottom; otherwise the
+  // block's right_to_left flag decides.
+  const bool is_vertical_text = (block->classify_rotation().x() == 0.0);
+  if (is_vertical_text) {
+    *writing_direction = WRITING_DIRECTION_TOP_TO_BOTTOM;
+  } else if (block->right_to_left()) {
+    *writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
+  } else {
+    *writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
+  }
+
+  // Textline Order.
+  const bool is_mongolian = false;  // TODO(eger): fix me
+  if (!is_vertical_text) {
+    *textline_order = TEXTLINE_ORDER_TOP_TO_BOTTOM;
+  } else if (is_mongolian) {
+    *textline_order = TEXTLINE_ORDER_LEFT_TO_RIGHT;
+  } else {
+    *textline_order = TEXTLINE_ORDER_RIGHT_TO_LEFT;
+  }
+
+  // Deskew angle: negated angle of the block's true-horizontal vector.
+  *deskew_angle = -block->skew().angle();
+}
+
+// Fills in metadata about the current row's paragraph.
+// *just is set to JUSTIFICATION_UNKNOWN when no paragraph model is
+// available, in which case the other outputs are left untouched.
+void PageIterator::ParagraphInfo(tesseract::ParagraphJustification *just,
+                                 bool *is_list_item,
+                                 bool *is_crown,
+                                 int *first_line_indent) const {
+  *just = tesseract::JUSTIFICATION_UNKNOWN;
+  if (!it_->row() || !it_->row()->row)
+    return;
+  PARA *para = it_->row()->row->para();
+  if (!para || !para->model)
+    return;
+  *is_list_item = para->is_list_item;
+  *is_crown = para->is_very_first_or_continuation;
+  *first_line_indent =
+      para->model->first_indent() - para->model->body_indent();
+  *just = para->model->justification();
+}
+
+/**
+ * Sets up the internal data for iterating the blobs of a new word, then
+ * moves the iterator to the given offset.
+ */
+void PageIterator::BeginWord(int offset) {
+  WERD_RES* word_res = it_->word();
+  if (word_res == nullptr) {
+    // This is a non-text block, so there is no word.
+    word_length_ = 0;
+    blob_index_ = 0;
+    word_ = nullptr;
+    return;
+  }
+  if (word_res->best_choice != nullptr) {
+    // Recognition has been done, so we are using the box_word, which
+    // is already baseline denormalized.
+    word_length_ = word_res->best_choice->length();
+    if (word_res->box_word != nullptr) {
+      if (word_res->box_word->length() != word_length_) {
+        // Print diagnostics before the assert below aborts on the
+        // box_word/best_choice length mismatch.
+        tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
+                word_length_, word_res->best_choice->unichar_string().c_str(),
+                word_res->box_word->length());
+        word_res->box_word->bounding_box().print();
+      }
+      ASSERT_HOST(word_res->box_word->length() == word_length_);
+    }
+    word_ = nullptr;
+    // We will be iterating the box_word.
+    delete cblob_it_;
+    cblob_it_ = nullptr;
+  } else {
+    // No recognition yet, so a "symbol" is a cblob.
+    word_ = word_res->word;
+    ASSERT_HOST(word_->cblob_list() != nullptr);
+    word_length_ = word_->cblob_list()->length();
+    if (cblob_it_ == nullptr) cblob_it_ = new C_BLOB_IT;
+    cblob_it_->set_to_list(word_->cblob_list());
+  }
+  // Advance to the requested blob, keeping the cblob iterator (when in the
+  // pre-recognition mode) in step with blob_index_.
+  for (blob_index_ = 0; blob_index_ < offset; ++blob_index_) {
+    if (cblob_it_ != nullptr)
+      cblob_it_->forward();
+  }
+}
+
+// Attaches the given blamer bundle to the current word.
+// Returns false when the iterator is not positioned on a word.
+bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
+  WERD_RES* word_res = it_->word();
+  if (word_res == nullptr)
+    return false;
+  word_res->blamer_bundle = blamer_bundle;
+  return true;
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/pagesegmain.cpp b/tesseract/src/ccmain/pagesegmain.cpp
new file mode 100644
index 00000000..d3a32fab
--- /dev/null
+++ b/tesseract/src/ccmain/pagesegmain.cpp
@@ -0,0 +1,420 @@
+/**********************************************************************
+ * File: pagesegmain.cpp
+ * Description: Top-level page segmenter for Tesseract.
+ * Author: Ray Smith
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifdef _WIN32
+#ifndef unlink
+#include <io.h>
+#endif
+#else
+#include <unistd.h>
+#endif // _WIN32
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "allheaders.h"
+#include "blobbox.h"
+#include "blread.h"
+#include "colfind.h"
+#include "debugpixa.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "equationdetect.h"
+#endif
+#include "imagefind.h"
+#include "linefind.h"
+#include "makerow.h"
+#include <tesseract/osdetect.h>
+#include "tabvector.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#include "textord.h"
+#include "tordmain.h"
+#include "wordseg.h"
+
+namespace tesseract {
+
+// Max erosions to perform in removing an enclosing circle.
+const int kMaxCircleErosions = 8;
+
+// Helper to remove an enclosing circle from an image.
+// If there isn't one, then the image will most likely get badly mangled.
+// The returned pix must be pixDestroyed after use. nullptr may be returned
+// if the image doesn't meet the trivial conditions that it uses to determine
+// success.
+static Pix* RemoveEnclosingCircle(Pix* pixs) {
+  // Seed-fill from the border of the inverted image to build a mask of the
+  // region connected to the border (i.e. outside the circle), then invert it
+  // so pixc covers the circle and everything inside it.
+  Pix* pixsi = pixInvert(nullptr, pixs);
+  Pix* pixc = pixCreateTemplate(pixs);
+  pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
+  pixSeedfillBinary(pixc, pixc, pixsi, 4);
+  pixInvert(pixc, pixc);
+  pixDestroy(&pixsi);
+  Pix* pixt = pixAnd(nullptr, pixs, pixc);
+  l_int32 max_count;
+  pixCountConnComp(pixt, 8, &max_count);
+  // The count has to go up before we start looking for the minimum.
+  l_int32 min_count = INT32_MAX;
+  Pix* pixout = nullptr;
+  // Repeatedly erode the mask and AND it with the source. The connected
+  // component count is expected to rise while the circle is being broken up,
+  // then fall; the erosion with the lowest count is kept as the best result,
+  // and the search stops once the count stops improving.
+  for (int i = 1; i < kMaxCircleErosions; i++) {
+    pixDestroy(&pixt);
+    pixErodeBrick(pixc, pixc, 3, 3);
+    pixt = pixAnd(nullptr, pixs, pixc);
+    l_int32 count;
+    pixCountConnComp(pixt, 8, &count);
+    if (i == 1 || count > max_count) {
+      max_count = count;
+      min_count = count;
+    } else if (count < min_count) {
+      min_count = count;
+      pixDestroy(&pixout);
+      pixout = pixCopy(nullptr, pixt);  // Save the best.
+    } else if (count >= min_count) {
+      break;  // We have passed by the best.
+    }
+  }
+  pixDestroy(&pixt);
+  pixDestroy(&pixc);
+  return pixout;
+}
+
+/**
+ * Segment the page according to the current value of tessedit_pageseg_mode.
+ * pix_binary_ is used as the source image and should not be nullptr.
+ * On return the blocks list owns all the constructed page layout.
+ * Returns a negative value on failure, otherwise the result of AutoPageSeg
+ * (or 0 when automatic segmentation was not used / the page is empty).
+ */
+int Tesseract::SegmentPage(const char* input_file, BLOCK_LIST* blocks,
+                           Tesseract* osd_tess, OSResults* osr) {
+  ASSERT_HOST(pix_binary_ != nullptr);
+  int width = pixGetWidth(pix_binary_);
+  int height = pixGetHeight(pix_binary_);
+  // Get page segmentation mode.
+  auto pageseg_mode = static_cast<PageSegMode>(
+      static_cast<int>(tessedit_pageseg_mode));
+  // If a UNLV zone file can be found, use that instead of segmentation.
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
+      input_file != nullptr && input_file[0] != '\0') {
+    std::string name = input_file;
+    const char* lastdot = strrchr(name.c_str(), '.');
+    if (lastdot != nullptr)
+      // Strip the extension by overwriting the dot with NUL.
+      // NOTE(review): writing '\0' into a std::string does not shorten it;
+      // this relies on downstream code reading the name via c_str().
+      // Confirm read_unlv_file tolerates the embedded NUL.
+      name[lastdot - name.c_str()] = '\0';
+    read_unlv_file(name, width, height, blocks);
+  }
+  if (blocks->empty()) {
+    // No UNLV file present. Work according to the PageSegMode.
+    // First make a single block covering the whole image.
+    BLOCK_IT block_it(blocks);
+    auto* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
+    block->set_right_to_left(right_to_left());
+    block_it.add_to_end(block);
+  } else {
+    // UNLV file present. Use PSM_SINGLE_BLOCK.
+    pageseg_mode = PSM_SINGLE_BLOCK;
+  }
+  // The diacritic_blobs holds noise blobs that may be diacritics. They
+  // are separated out on areas of the image that seem noisy and short-circuit
+  // the layout process, going straight from the initial partition creation
+  // right through to after word segmentation, where they are added to the
+  // rej_cblobs list of the most appropriate word. From there classification
+  // will determine whether they are used.
+  BLOBNBOX_LIST diacritic_blobs;
+  int auto_page_seg_ret_val = 0;
+  TO_BLOCK_LIST to_blocks;
+  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
+      PSM_SPARSE(pageseg_mode)) {
+    auto_page_seg_ret_val = AutoPageSeg(
+        pageseg_mode, blocks, &to_blocks,
+        enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
+    if (pageseg_mode == PSM_OSD_ONLY)
+      return auto_page_seg_ret_val;
+    // To create blobs from the image region bounds uncomment this line:
+    // to_blocks.clear();  // Uncomment to go back to the old mode.
+  } else {
+    // No automatic segmentation: identity deskew/reskew.
+    deskew_ = FCOORD(1.0f, 0.0f);
+    reskew_ = FCOORD(1.0f, 0.0f);
+    if (pageseg_mode == PSM_CIRCLE_WORD) {
+      // Replace the working image with the circle-stripped version if the
+      // removal succeeded.
+      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
+      if (pixcleaned != nullptr) {
+        pixDestroy(&pix_binary_);
+        pix_binary_ = pixcleaned;
+      }
+    }
+  }
+
+  if (auto_page_seg_ret_val < 0) {
+    return -1;
+  }
+
+  if (blocks->empty()) {
+    if (textord_debug_tabfind)
+      tprintf("Empty page\n");
+    return 0;  // AutoPageSeg found an empty page.
+  }
+  bool splitting =
+      pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
+  bool cjk_mode = textord_use_cjk_fp_model;
+
+  // Hand the blocks over to textord for textline/word finding.
+  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
+                       pix_thresholds_, pix_grey_, splitting || cjk_mode,
+                       &diacritic_blobs, blocks, &to_blocks);
+  return auto_page_seg_ret_val;
+}
+
+/**
+ * Auto page segmentation. Divide the page image into blocks of uniform
+ * text linespacing and images.
+ *
+ * Resolution (in ppi) is derived from the input image.
+ *
+ * The output goes in the blocks list with corresponding TO_BLOCKs in the
+ * to_blocks list.
+ *
+ * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
+ * the image into columns, but multiple blocks are still made if the text is
+ * of non-uniform linespacing.
+ *
+ * If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+ * confuse layout analysis by causing textline overlap, are placed there,
+ * with the expectation that they will be reassigned to words later and
+ * noise/diacriticness determined via classification.
+ *
+ * If osd (orientation and script detection) is true then that is performed
+ * as well. If only_osd is true, then only orientation and script detection is
+ * performed. If osd is desired, (osd or only_osd) then osr_tess must be
+ * another Tesseract that was initialized especially for osd, and the results
+ * will be output into osr (orientation and script result).
+ *
+ * Returns a negative value on failure, otherwise the FindBlocks result.
+ */
+int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+                           TO_BLOCK_LIST* to_blocks,
+                           BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
+                           OSResults* osr) {
+  Pix* photomask_pix = nullptr;
+  Pix* musicmask_pix = nullptr;
+  // The blocks made by the ColumnFinder. Moved to blocks before return.
+  BLOCK_LIST found_blocks;
+  TO_BLOCK_LIST temp_blocks;
+
+  // finder is nullptr when PSM_OSD_ONLY completed, or the page had no
+  // usable text (see SetupPageSegAndDetectOrientation).
+  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
+      pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
+      pageseg_apply_music_mask ? &musicmask_pix : nullptr);
+  int result = 0;
+  if (finder != nullptr) {
+    TO_BLOCK_IT to_block_it(&temp_blocks);
+    TO_BLOCK* to_block = to_block_it.data();
+    if (musicmask_pix != nullptr) {
+      // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
+      // blocks separately. For now combine with photomask_pix.
+      pixOr(photomask_pix, photomask_pix, musicmask_pix);
+    }
+    #ifndef DISABLED_LEGACY_ENGINE
+    if (equ_detect_) {
+      finder->SetEquationDetect(equ_detect_);
+    }
+    #endif  // ndef DISABLED_LEGACY_ENGINE
+    result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
+                                to_block, photomask_pix, pix_thresholds_,
+                                pix_grey_, &pixa_debug_, &found_blocks,
+                                diacritic_blobs, to_blocks);
+    if (result >= 0)
+      finder->GetDeskewVectors(&deskew_, &reskew_);
+    delete finder;
+  }
+  pixDestroy(&photomask_pix);
+  pixDestroy(&musicmask_pix);
+  if (result < 0) return result;
+
+  blocks->clear();
+  BLOCK_IT block_it(blocks);
+  // Move the found blocks to the input/output blocks.
+  block_it.add_list_after(&found_blocks);
+  return result;
+}
+
+// Helper adds all the scripts from sid_set, converted to the corresponding
+// script ids of osd_set, to allowed_ids. The null script is skipped.
+static void AddAllScriptsConverted(const UNICHARSET& sid_set,
+                                   const UNICHARSET& osd_set,
+                                   std::vector<int>* allowed_ids) {
+  const int num_scripts = sid_set.get_script_table_size();
+  for (int sid = 0; sid < num_scripts; ++sid) {
+    if (sid == sid_set.null_sid()) continue;
+    const char* script_name = sid_set.get_script_from_script_id(sid);
+    allowed_ids->push_back(osd_set.get_script_id_from_name(script_name));
+  }
+}
+
+/**
+ * Sets up auto page segmentation, determines the orientation, and corrects it.
+ * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
+ * facilitate testing.
+ * photo_mask_pix is a pointer to a nullptr pointer that will be filled on return
+ * with the leptonica photo mask, which must be pixDestroyed by the caller.
+ * to_blocks is an empty list that will be filled with (usually a single)
+ * block that is used during layout analysis. This ugly API is required
+ * because of the possibility of a unlv zone file.
+ * TODO(rays) clean this up.
+ * See AutoPageSeg for other arguments.
+ * The returned ColumnFinder must be deleted after use.
+ * Returns nullptr when pageseg_mode is PSM_OSD_ONLY (OSD finished), or when
+ * the estimated line size is too small for a ColumnFinder to be built.
+ */
+ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
+    PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
+    OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
+    Pix** music_mask_pix) {
+  int vertical_x = 0;
+  int vertical_y = 1;
+  TabVector_LIST v_lines;
+  TabVector_LIST h_lines;
+  ICOORD bleft(0, 0);
+
+  ASSERT_HOST(pix_binary_ != nullptr);
+  if (tessedit_dump_pageseg_images) {
+    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
+  }
+  // Leptonica is used to find the rule/separator lines in the input.
+  LineFinder::FindAndRemoveLines(source_resolution_,
+                                 textord_tabfind_show_vlines, pix_binary_,
+                                 &vertical_x, &vertical_y, music_mask_pix,
+                                 &v_lines, &h_lines);
+  if (tessedit_dump_pageseg_images) {
+    pixa_debug_.AddPix(pix_binary_, "NoLines");
+  }
+  // Leptonica is used to find a mask of the photo regions in the input.
+  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
+  if (tessedit_dump_pageseg_images) {
+    // Debug-only: dump the page with photo regions subtracted.
+    Pix* pix_no_image_ = nullptr;
+    if (*photo_mask_pix != nullptr) {
+      pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
+    } else {
+      pix_no_image_ = pixClone(pix_binary_);
+    }
+    pixa_debug_.AddPix(pix_no_image_, "NoImages");
+    pixDestroy(&pix_no_image_);
+  }
+  // Vertical tab vectors are only useful when column finding is on.
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
+
+  // The rest of the algorithm uses the usual connected components.
+  textord_.find_components(pix_binary_, blocks, to_blocks);
+
+  TO_BLOCK_IT to_block_it(to_blocks);
+  // There must be exactly one input block.
+  // TODO(rays) handle new textline finding with a UNLV zone file.
+  ASSERT_HOST(to_blocks->singleton());
+  TO_BLOCK* to_block = to_block_it.data();
+  TBOX blkbox = to_block->block->pdblk.bounding_box();
+  ColumnFinder* finder = nullptr;
+  int estimated_resolution = source_resolution_;
+  if (source_resolution_ == kMinCredibleResolution) {
+    // Try to estimate resolution from typical body text size.
+    int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
+    if (res > estimated_resolution && res < kMaxCredibleResolution) {
+      estimated_resolution = res;
+      tprintf("Estimating resolution as %d\n", estimated_resolution);
+    }
+  }
+
+  // A line size below 2 means there is no usable text; leave finder nullptr.
+  if (to_block->line_size >= 2) {
+    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
+                              blkbox.botleft(), blkbox.topright(),
+                              estimated_resolution, textord_use_cjk_fp_model,
+                              textord_tabfind_aligned_gap_fraction, &v_lines,
+                              &h_lines, vertical_x, vertical_y);
+
+    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+    if (equ_detect_) {
+      equ_detect_->LabelSpecialText(to_block);
+    }
+
+    BLOBNBOX_CLIST osd_blobs;
+    // osd_orientation is the number of 90 degree rotations to make the
+    // characters upright. (See tesseract/osdetect.h for precise definition.)
+    // We want the text lines horizontal, (vertical text indicates vertical
+    // textlines) which may conflict (eg vertically written CJK).
+    int osd_orientation = 0;
+    bool vertical_text = textord_tabfind_force_vertical_text ||
+                         pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
+    if (!vertical_text && textord_tabfind_vertical_text &&
+        PSM_ORIENTATION_ENABLED(pageseg_mode)) {
+      vertical_text =
+          finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
+                                          to_block, &osd_blobs);
+    }
+    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
+      std::vector<int> osd_scripts;
+      if (osd_tess != this) {
+        // We are running osd as part of layout analysis, so constrain the
+        // scripts to those allowed by *this.
+        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
+        for (int s = 0; s < sub_langs_.size(); ++s) {
+          AddAllScriptsConverted(sub_langs_[s]->unicharset,
+                                 osd_tess->unicharset, &osd_scripts);
+        }
+      }
+      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
+      if (pageseg_mode == PSM_OSD_ONLY) {
+        // OSD was all that was asked for; no ColumnFinder is returned.
+        delete finder;
+        return nullptr;
+      }
+      osd_orientation = osr->best_result.orientation_id;
+      double osd_score = osr->orientations[osd_orientation];
+      // osd_margin is the gap between the best orientation score and the
+      // closest competitor; a small margin means a weak decision.
+      double osd_margin = min_orientation_margin * 2;
+      for (int i = 0; i < 4; ++i) {
+        if (i != osd_orientation &&
+            osd_score - osr->orientations[i] < osd_margin) {
+          osd_margin = osd_score - osr->orientations[i];
+        }
+      }
+      int best_script_id = osr->best_result.script_id;
+      const char* best_script_str =
+          osd_tess->unicharset.get_script_from_script_id(best_script_id);
+      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
+                 best_script_id == osd_tess->unicharset.hiragana_sid() ||
+                 best_script_id == osd_tess->unicharset.katakana_sid() ||
+                 strcmp("Japanese", best_script_str) == 0 ||
+                 strcmp("Korean", best_script_str) == 0 ||
+                 strcmp("Hangul", best_script_str) == 0;
+      if (cjk) {
+        finder->set_cjk_script(true);
+      }
+      if (osd_margin < min_orientation_margin) {
+        // The margin is weak.
+        if (!cjk && !vertical_text && osd_orientation == 2) {
+          // upside down latin text is improbable with such a weak margin.
+          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
+                  "Don't rotate.\n", osd_margin);
+          osd_orientation = 0;
+        } else {
+          tprintf(
+              "OSD: Weak margin (%.2f) for %d blob text block, "
+              "but using orientation anyway: %d\n",
+              osd_margin, osd_blobs.length(), osd_orientation);
+        }
+      }
+    }
+    osd_blobs.shallow_clear();
+    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
+
+#endif  // ndef DISABLED_LEGACY_ENGINE
+  }
+
+  return finder;
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/pagewalk.cpp b/tesseract/src/ccmain/pagewalk.cpp
new file mode 100644
index 00000000..a02fe5f4
--- /dev/null
+++ b/tesseract/src/ccmain/pagewalk.cpp
@@ -0,0 +1,43 @@
+/**********************************************************************
+ * File: pagewalk.cpp (Formerly walkers.c)
+ * Description: Block list processors
+ * Author: Phil Cheatle
+ * Created: Thu Oct 10 16:25:24 BST 1991
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "pageres.h"
+#include "tesseractclass.h"
+
+namespace tesseract {
+/**
+ * @name process_selected_words()
+ *
+ * Walk the current block list applying the specified word processor function
+ * to each word that overlaps the selection_box. Stops early if the processor
+ * returns false.
+ */
+void Tesseract::process_selected_words(
+    PAGE_RES* page_res,  // blocks to check
+    TBOX& selection_box,
+    bool (tesseract::Tesseract::* word_processor)(PAGE_RES_IT* pr_it)) {
+  PAGE_RES_IT pr_it(page_res);
+  while (pr_it.word() != nullptr) {
+    WERD* word = pr_it.word()->word;
+    // A false return from the processor aborts the walk.
+    if (word->bounding_box().overlap(selection_box) &&
+        !(this->*word_processor)(&pr_it)) {
+      return;
+    }
+    pr_it.forward();
+  }
+}
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/par_control.cpp b/tesseract/src/ccmain/par_control.cpp
new file mode 100644
index 00000000..c1c17298
--- /dev/null
+++ b/tesseract/src/ccmain/par_control.cpp
@@ -0,0 +1,73 @@
+///////////////////////////////////////////////////////////////////////
+// File: par_control.cpp
+// Description: Control code for parallel implementation.
+// Author: Ray Smith
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif // _OPENMP
+
+namespace tesseract {
+
+// Bundles one blob with the classifier instance that should recognize it
+// and the ratings-matrix cell that will receive the resulting choice list.
+struct BlobData {
+  BlobData() = default;
+  // index: blob position within word.chopped_word; tess: classifier to use;
+  // word: source word whose diagonal ratings cell (index, index) will be
+  // filled in later by PrerecAllWordsPar.
+  BlobData(int index, Tesseract* tess, const WERD_RES& word)
+      : blob(word.chopped_word->blobs[index]),
+        tesseract(tess),
+        choices(&(*word.ratings)(index, index)) {}
+
+  TBLOB* blob = nullptr;  // Points into word.chopped_word; not owned here.
+  Tesseract* tesseract = nullptr;  // Classifier to use; not owned here.
+  BLOB_CHOICE_LIST** choices = nullptr;  // Output slot in the ratings matrix.
+};
+
+// Pre-classifies every blob of every word in words, storing the resulting
+// choice lists on the diagonal of each word's ratings matrix. When
+// tessedit_parallelize > 1 and OpenMP is available, classification is
+// spread across threads; the two loop bodies below are intentionally
+// identical apart from the pragma.
+void Tesseract::PrerecAllWordsPar(const std::vector<WordData>& words) {
+  // Prepare all the blobs.
+  std::vector<BlobData> blobs;
+  for (size_t w = 0; w < words.size(); ++w) {
+    // Only consider words whose ratings matrix exists but is still empty.
+    if (words[w].word->ratings != nullptr &&
+        words[w].word->ratings->get(0, 0) == nullptr) {
+      for (int s = 0; s < words[w].lang_words.size(); ++s) {
+        // Entries beyond sub_langs_ belong to this top-level instance.
+        Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
+        const WERD_RES& word = *words[w].lang_words[s];
+        for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
+          blobs.push_back(BlobData(b, sub, word));
+        }
+      }
+    }
+  }
+  // Pre-classify all the blobs.
+  if (tessedit_parallelize > 1) {
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(10)
+#endif // _OPENMP
+    for (size_t b = 0; b < blobs.size(); ++b) {
+      *blobs[b].choices =
+          blobs[b].tesseract->classify_blob(blobs[b].blob, "par",
+                                            ScrollView::WHITE, nullptr);
+    }
+  } else {
+    // TODO(AMD) parallelize this.
+    for (size_t b = 0; b < blobs.size(); ++b) {
+      *blobs[b].choices =
+          blobs[b].tesseract->classify_blob(blobs[b].blob, "par",
+                                            ScrollView::WHITE, nullptr);
+    }
+  }
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/paragraphs.cpp b/tesseract/src/ccmain/paragraphs.cpp
new file mode 100644
index 00000000..28576579
--- /dev/null
+++ b/tesseract/src/ccmain/paragraphs.cpp
@@ -0,0 +1,2590 @@
+/**********************************************************************
+ * File: paragraphs.cpp
+ * Description: Paragraph detection for tesseract.
+ * Author: David Eger
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "paragraphs.h"
+
+#include "genericvector.h" // for GenericVector, GenericVectorEqEq
+#include "helpers.h" // for UpdateRange, ClipToRange
+#include "host.h" // for NearlyEqual
+#include "mutableiterator.h" // for MutableIterator
+#include "ocrblock.h" // for BLOCK
+#include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA...
+#include "ocrrow.h" // for ROW
+#include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
+#include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
+#include "pdblock.h" // for PDBLK
+#include "polyblk.h" // for POLY_BLOCK
+#include "ratngs.h" // for WERD_CHOICE
+#include "rect.h" // for TBOX
+#include "statistc.h" // for STATS
+#include "strngs.h" // for STRING
+#include "tprintf.h" // for tprintf
+#include "unicharset.h" // for UNICHARSET
+#include "werd.h" // for WERD, W_REP_CHAR
+
+#include <tesseract/pageiterator.h> // for PageIterator
+#include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
+#include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID
+
+#include <cctype> // for isspace
+#include <cmath> // for abs
+#include <cstdio> // for snprintf
+#include <cstdlib> // for abs
+#include <cstring> // for strchr, strlen
+#include <algorithm> // for max
+#include <memory> // for unique_ptr
+
+// Unicode directional-formatting marks used to wrap RTL text for debug
+// output (see RtlEmbed below).
+// NOTE(review): U+202A is the LEFT-to-right embedding codepoint; the
+// right-to-left embedding is U+202B. Verify the literal matches the intent
+// implied by the name before relying on it for display.
+static const char * const kRLE = "\u202A"; // Right-to-Left Embedding
+static const char * const kPDF = "\u202C"; // Pop Directional Formatting
+
+namespace tesseract {
+
+// Special "weak" ParagraphModels.
+// Sentinel pointer values (poison addresses 0xDEAD111F / 0xDEAD888F) used
+// to tag rows that look like the start of a left- or right-aligned
+// paragraph before a concrete model exists; they are compared by identity
+// (e.g. in AppendDebugInfo) and must not be dereferenced.
+const ParagraphModel *kCrownLeft
+    = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
+const ParagraphModel *kCrownRight
+    = reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
+
+// Do the text and geometry of two rows support a paragraph break between them?
+static bool LikelyParagraphStart(const RowScratchRegisters &before,
+ const RowScratchRegisters &after,
+ tesseract::ParagraphJustification j);
+
+// Given the width of a typical space between words, what is the threshold
+// by which we think left and right alignments for paragraphs
+// can vary and still be aligned.
+static int Epsilon(int space_pix) {
+  // Four fifths of an interword space, rounded down.
+  const int tolerance = (4 * space_pix) / 5;
+  return tolerance;
+}
+
+// Sanity-check that [row_start, row_end) is a valid slice of rows holding
+// at least min_num_rows rows. Bad bounds always print a complaint; a
+// too-small range complains only when debug_level > 1. Returns false on
+// either failure so callers can bail out early.
+static bool AcceptableRowArgs(
+    int debug_level, int min_num_rows, const char *function_name,
+    const GenericVector<RowScratchRegisters> *rows,
+    int row_start, int row_end) {
+  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
+    tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
+            row_start, row_end, rows->size());
+    return false;
+  }
+  if (row_end - row_start < min_num_rows) {
+    if (debug_level > 1) {
+      tprintf("# Too few rows[%d, %d) for %s.\n",
+              row_start, row_end, function_name);
+    }
+    return false;
+  }
+  return true;
+}
+
+// =============================== Debug Code ================================
+
+// Render an integer as its decimal STRING representation.
+static STRING StrOf(int num) {
+  // 30 chars comfortably holds any 64-bit decimal value plus sign.
+  char text[30];
+  snprintf(text, sizeof(text), "%d", num);
+  return STRING(text);
+}
+
+// Given a row-major matrix of UTF-8 text and a column separator, print a
+// formatted table where each column is padded to the codepoint-width of its
+// widest entry. Padding is applied bytewise by printf, so alignment is
+// exact only for ASCII content.
+static void PrintTable(const std::vector<std::vector<STRING> > &rows,
+                       const STRING &colsep) {
+  // First pass: find each column's maximum width in unicode codepoints.
+  std::vector<int> max_col_widths;
+  for (const auto& row : rows) {
+    int num_columns = row.size();
+    for (int c = 0; c < num_columns; c++) {
+      int num_unicodes = 0;
+      for (int i = 0; i < row[c].size(); i++) {
+        // Count only UTF-8 lead bytes; continuation bytes are 10xxxxxx.
+        if ((row[c][i] & 0xC0) != 0x80) num_unicodes++;
+      }
+      if (c >= max_col_widths.size()) {
+        max_col_widths.push_back(num_unicodes);
+      } else {
+        if (num_unicodes > max_col_widths[c])
+          max_col_widths[c] = num_unicodes;
+      }
+    }
+  }
+
+  // Build a left-justified "%-<width>s" printf pattern per column.
+  std::vector<STRING> col_width_patterns;
+  for (int c = 0; c < max_col_widths.size(); c++) {
+    col_width_patterns.push_back(
+        STRING("%-") + StrOf(max_col_widths[c]) + "s");
+  }
+
+  // Second pass: emit each cell through its column's pattern.
+  for (int r = 0; r < rows.size(); r++) {
+    for (int c = 0; c < rows[r].size(); c++) {
+      if (c > 0)
+        tprintf("%s", colsep.c_str());
+      tprintf(col_width_patterns[c].c_str(), rows[r][c].c_str());
+    }
+    tprintf("\n");
+  }
+}
+
+// Wrap word in directional embedding marks (kRLE ... kPDF) when rtlify is
+// set, so right-to-left text displays sensibly in debug dumps.
+static STRING RtlEmbed(const STRING &word, bool rtlify) {
+  if (!rtlify)
+    return word;
+  return STRING(kRLE) + word + STRING(kPDF);
+}
+
+// Print the current thoughts of the paragraph detector: one table row per
+// text row showing its interword space, leader presence, the leftmost and
+// rightmost words (with width and flags: upper case letter = flag set,
+// S/s = starts idea, E/e = ends idea, L/l = list item), the scratch
+// register debug fields, and the row text; then the list of active models.
+static void PrintDetectorState(const ParagraphTheory &theory,
+                               const GenericVector<RowScratchRegisters> &rows) {
+  std::vector<std::vector<STRING> > output;
+  // Header row for the table.
+  output.push_back(std::vector<STRING>());
+  output.back().push_back("#row");
+  output.back().push_back("space");
+  output.back().push_back("..");
+  output.back().push_back("lword[widthSEL]");
+  output.back().push_back("rword[widthSEL]");
+  RowScratchRegisters::AppendDebugHeaderFields(&output.back());
+  output.back().push_back("text");
+
+  for (int i = 0; i < rows.size(); i++) {
+    output.push_back(std::vector<STRING>());
+    std::vector<STRING> &row = output.back();
+    const RowInfo& ri = *rows[i].ri_;
+    row.push_back(StrOf(i));
+    row.push_back(StrOf(ri.average_interword_space));
+    row.push_back(ri.has_leaders ? ".." : " ");
+    row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
+                  "[" + StrOf(ri.lword_box.width()) +
+                  (ri.lword_likely_starts_idea ? "S" : "s") +
+                  (ri.lword_likely_ends_idea ? "E" : "e") +
+                  (ri.lword_indicates_list_item ? "L" : "l") +
+                  "]");
+    row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
+                  "[" + StrOf(ri.rword_box.width()) +
+                  (ri.rword_likely_starts_idea ? "S" : "s") +
+                  (ri.rword_likely_ends_idea ? "E" : "e") +
+                  (ri.rword_indicates_list_item ? "L" : "l") +
+                  "]");
+    rows[i].AppendDebugInfo(theory, &row);
+    row.push_back(RtlEmbed(ri.text, !ri.ltr));
+  }
+  PrintTable(output, " ");
+
+  tprintf("Active Paragraph Models:\n");
+  // Models are reported 1-based to match AppendDebugInfo's numbering.
+  unsigned m = 0;
+  for (const auto& model : theory.models()) {
+    tprintf(" %d: %s\n", ++m, model->ToString().c_str());
+  }
+}
+
+// Emit a phase banner followed by the full detector state, but only when
+// the caller asked for debug output.
+static void DebugDump(
+    bool should_print,
+    const STRING &phase,
+    const ParagraphTheory &theory,
+    const GenericVector<RowScratchRegisters> &rows) {
+  if (should_print) {
+    tprintf("# %s\n", phase.c_str());
+    PrintDetectorState(theory, rows);
+  }
+}
+
+// Print out the text for rows[row_start, row_end), framed by rule lines.
+static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
+                          int row_start, int row_end) {
+  static const char *kRule = "======================================";
+  tprintf("%s\n", kRule);
+  for (int i = row_start; i < row_end; i++)
+    tprintf("%s\n", rows[i].ri_->text.c_str());
+  tprintf("%s\n", kRule);
+}
+
+// ============= Brain Dead Language Model (ASCII Version) ===================
+
+// True for a basic ASCII letter; no locale or Unicode awareness.
+static bool IsLatinLetter(int ch) {
+  const bool lower = 'a' <= ch && ch <= 'z';
+  const bool upper = 'A' <= ch && ch <= 'Z';
+  return lower || upper;
+}
+
+// Letters OCR commonly mistakes for the digits 0 and 1.
+static bool IsDigitLike(int ch) {
+  switch (ch) {
+    case 'o': case 'O': case 'l': case 'I':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Quotes and opening brackets that may precede the start of a sentence.
+static bool IsOpeningPunct(int ch) {
+  static const char kOpeners[] = "'\"({[";
+  return strchr(kOpeners, ch) != nullptr;
+}
+
+// Punctuation that plausibly ends a sentence or clause.
+static bool IsTerminalPunct(int ch) {
+  static const char kTerminators[] = ":'\".?!]})";
+  return strchr(kTerminators, ch) != nullptr;
+}
+
+// Pointer-advancing helpers: each returns a pointer just past the matching
+// prefix of str (used below to parse list numerals such as roman numbers).
+// Advance past every leading character of str that appears in toskip.
+static const char *SkipChars(const char *str, const char *toskip) {
+  const char *p = str;
+  while (*p != '\0' && strchr(toskip, *p) != nullptr) {
+    ++p;
+  }
+  return p;
+}
+
+// Advance past every leading character of str accepted by the predicate.
+static const char *SkipChars(const char *str, bool (*skip)(int)) {
+  const char *p = str;
+  for (; *p != '\0' && skip(*p); ++p) {
+  }
+  return p;
+}
+
+// Consume at most one leading character of str drawn from toskip.
+static const char *SkipOne(const char *str, const char *toskip) {
+  const bool matched = (*str != '\0') && strchr(toskip, *str) != nullptr;
+  return matched ? str + 1 : str;
+}
+
+// Return whether it is very likely that this is a numeral marker that could
+// start a list item. Some examples include:
+// A I iii. VI (2) 3.5. [C-4]
+// A word qualifies if it is entirely made of up to three "segments", each a
+// numeral (roman, decimal digits, or one latin letter) optionally preceded
+// by up to two opening brackets and followed by closers/separators.
+static bool LikelyListNumeral(const STRING &word) {
+  const char *kRomans = "ivxlmdIVXLMD";
+  // All ten decimal digits. (This previously read "012345789", silently
+  // rejecting every numeral containing a 6.)
+  const char *kDigits = "0123456789";
+  const char *kOpen = "[{(";
+  const char *kSep = ":;-.,";
+  const char *kClose = "]})";
+
+  int num_segments = 0;
+  const char *pos = word.c_str();
+  while (*pos != '\0' && num_segments < 3) {
+    // skip up to two open parens.
+    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
+    const char *numeral_end = SkipChars(numeral_start, kRomans);
+    if (numeral_end != numeral_start) {
+      // Got Roman Numeral. Great.
+    } else {
+      numeral_end = SkipChars(numeral_start, kDigits);
+      if (numeral_end == numeral_start) {
+        // If there's a single latin letter, we can use that.
+        numeral_end = SkipChars(numeral_start, IsLatinLetter);
+        if (numeral_end - numeral_start != 1)
+          break;
+      }
+    }
+    // We got some sort of numeral.
+    num_segments++;
+    // Skip any trailing parens or punctuation.
+    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
+    // Another segment requires a separator after the previous numeral.
+    if (pos == numeral_end)
+      break;
+  }
+  // Success only if the whole word was consumed.
+  return *pos == '\0';
+}
+
+// Return whether the single-character word is a plausible list bullet.
+// NOTE(review): kListMarks lists '.' twice; the duplicate is harmless to
+// strchr but looks like one entry may have been meant to be another mark.
+static bool LikelyListMark(const STRING &word) {
+  const char *kListMarks = "0Oo*.,+.";
+  return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
+}
+
+// A word looks like a list item if it is either a bullet mark or a numeral.
+bool AsciiLikelyListItem(const STRING &word) {
+  if (LikelyListMark(word))
+    return true;
+  return LikelyListNumeral(word);
+}
+
+// ========== Brain Dead Language Model (Tesseract Version) ================
+
+// Return the first Unicode Codepoint from werd[pos], or 0 when any argument
+// is null or pos is out of bounds.
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
+  // Use >= (and reject negatives): werd->unichar_id(pos) is only valid for
+  // pos in [0, length()); the previous "pos > length()" check let
+  // pos == length() fall through to an out-of-range access.
+  if (!u || !werd || pos < 0 || pos >= werd->length())
+    return 0;
+  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
+}
+
+// A useful helper class for finding the first j >= i so that word[j]
+// does not have given character type.
+class UnicodeSpanSkipper {
+ public:
+  // Neither pointer is owned; both must outlive this object.
+  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
+      : u_(unicharset), word_(word) { wordlen_ = word->length(); }
+
+  // Given an input position, return the first position >= pos not punc.
+  int SkipPunc(int pos);
+  // Given an input position, return the first position >= pos not digit.
+  int SkipDigits(int pos);
+  // Given an input position, return the first position >= pos not roman.
+  int SkipRomans(int pos);
+  // Given an input position, return the first position >= pos not alpha.
+  int SkipAlpha(int pos);
+
+ private:
+  const UNICHARSET *u_;      // Character-class oracle (not owned).
+  const WERD_CHOICE *word_;  // Word being scanned (not owned).
+  int wordlen_;              // Cached word_->length().
+};
+
+int UnicodeSpanSkipper::SkipPunc(int pos) {
+  // Advance while the current unichar is punctuation.
+  while (pos < wordlen_) {
+    if (!u_->get_ispunctuation(word_->unichar_id(pos)))
+      break;
+    pos++;
+  }
+  return pos;
+}
+
+int UnicodeSpanSkipper::SkipDigits(int pos) {
+  // Advance while the unichar is a digit or a digit lookalike (o O l I).
+  while (pos < wordlen_) {
+    const bool digitish = u_->get_isdigit(word_->unichar_id(pos)) ||
+                          IsDigitLike(UnicodeFor(u_, word_, pos));
+    if (!digitish)
+      break;
+    pos++;
+  }
+  return pos;
+}
+
+int UnicodeSpanSkipper::SkipRomans(int pos) {
+  // Advance while the codepoint is an ASCII roman-numeral letter.
+  const char *kRomans = "ivxlmdIVXLMD";
+  for (; pos < wordlen_; pos++) {
+    const int ch = UnicodeFor(u_, word_, pos);
+    const bool is_roman = ch < 0xF0 && strchr(kRomans, ch) != nullptr;
+    if (!is_roman)
+      break;
+  }
+  return pos;
+}
+
+int UnicodeSpanSkipper::SkipAlpha(int pos) {
+  // Advance while the current unichar is alphabetic.
+  while (pos < wordlen_) {
+    if (!u_->get_isalpha(word_->unichar_id(pos)))
+      break;
+    pos++;
+  }
+  return pos;
+}
+
+// Return whether the codepoint is likely a standalone list bullet: ASCII
+// marks are delegated to LikelyListMark(); a fixed set of typographic
+// bullet codepoints is recognized directly.
+static bool LikelyListMarkUnicode(int ch) {
+  if (ch < 0x80) {
+    // ASCII: reuse the byte-oriented check.
+    STRING single_ch;
+    single_ch += ch;
+    return LikelyListMark(single_ch);
+  }
+  switch (ch) {
+    // TODO(eger) expand this list of unicodes as needed.
+    case 0x00B0:  // degree sign
+    case 0x2022:  // bullet
+    case 0x25E6:  // white bullet
+    case 0x00B7:  // middle dot
+    case 0x25A1:  // white square
+    case 0x25A0:  // black square
+    case 0x25AA:  // black small square
+    case 0x2B1D:  // black very small square
+    case 0x25BA:  // black right-pointing pointer
+    case 0x25CF:  // black circle
+    case 0x25CB:  // white circle
+      return true;
+    default:
+      break;  // fall through
+  }
+  return false;
+}
+
+// Return whether it is very likely that this is a numeral marker that could
+// start a list item. Some examples include:
+// A I iii. VI (2) 3.5. [C-4]
+// Unicode analogue of LikelyListNumeral: up to three segments, each at
+// most one leading punctuation char, then a roman/digit/single-letter
+// numeral, then trailing punctuation.
+static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
+  // A lone bullet character qualifies immediately.
+  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
+    return true;
+
+  UnicodeSpanSkipper m(u, werd);
+  int num_segments = 0;
+  int pos = 0;
+  while (pos < werd->length() && num_segments < 3) {
+    int numeral_start = m.SkipPunc(pos);
+    // Allow at most one leading punctuation character per segment.
+    if (numeral_start > pos + 1) break;
+    int numeral_end = m.SkipRomans(numeral_start);
+    if (numeral_end == numeral_start) {
+      numeral_end = m.SkipDigits(numeral_start);
+      if (numeral_end == numeral_start) {
+        // If there's a single latin letter, we can use that.
+        numeral_end = m.SkipAlpha(numeral_start);
+        if (numeral_end - numeral_start != 1)
+          break;
+      }
+    }
+    // We got some sort of numeral.
+    num_segments++;
+    // Skip any trailing punctuation.
+    pos = m.SkipPunc(numeral_end);
+    if (pos == numeral_end)
+      break;
+  }
+  // Success only if the whole word was consumed.
+  return pos == werd->length();
+}
+
+// ========= Brain Dead Language Model (combined entry points) ================
+
+// Given the leftmost word of a line either as a Tesseract unicharset + werd
+// or a utf8 string, set the following attributes for it:
+//   is_list - this word might be a list number or bullet.
+//   starts_idea - this word is likely to start a sentence.
+//   ends_idea - this word is likely to end a sentence.
+// The unicharset/werd pair takes precedence when both are provided; the
+// utf8 fallback inspects only the first byte, so it is ASCII-oriented.
+void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+                        const STRING &utf8,
+                        bool *is_list, bool *starts_idea, bool *ends_idea) {
+  *is_list = false;
+  *starts_idea = false;
+  *ends_idea = false;
+  // An empty word trivially "ends" whatever came before it.
+  if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
+    *ends_idea = true;
+    return;
+  }
+
+  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
+    if (UniLikelyListItem(unicharset, werd)) {
+      *is_list = true;
+      *starts_idea = true;
+      *ends_idea = true;
+    }
+    // A leading capital suggests a sentence start.
+    if (unicharset->get_isupper(werd->unichar_id(0))) {
+      *starts_idea = true;
+    }
+    // Leading punctuation can both open and close an idea.
+    if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
+      *starts_idea = true;
+      *ends_idea = true;
+    }
+  } else { // Assume utf8 is mostly ASCII
+    if (AsciiLikelyListItem(utf8)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    int start_letter = utf8[0];
+    if (IsOpeningPunct(start_letter)) {
+      *starts_idea = true;
+    }
+    if (IsTerminalPunct(start_letter)) {
+      *ends_idea = true;
+    }
+    if (start_letter >= 'A' && start_letter <= 'Z') {
+      *starts_idea = true;
+    }
+  }
+}
+
+// Given the rightmost word of a line either as a Tesseract unicharset + werd
+// or a utf8 string, set the following attributes for it:
+//   is_list - this word might be a list number or bullet.
+//   starts_idea - this word is likely to start a sentence.
+//   ends_idea - this word is likely to end a sentence.
+// Mirror of LeftWordAttributes, but only the final character decides
+// ends_idea; the utf8 fallback examines the last byte only.
+void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+                         const STRING &utf8,
+                         bool *is_list, bool *starts_idea, bool *ends_idea) {
+  *is_list = false;
+  *starts_idea = false;
+  *ends_idea = false;
+  // An empty word trivially "ends" whatever came before it.
+  if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty
+    *ends_idea = true;
+    return;
+  }
+
+  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
+    if (UniLikelyListItem(unicharset, werd)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    // Trailing punctuation marks a likely sentence end.
+    UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
+    if (unicharset->get_ispunctuation(last_letter)) {
+      *ends_idea = true;
+    }
+  } else { // Assume utf8 is mostly ASCII
+    if (AsciiLikelyListItem(utf8)) {
+      *is_list = true;
+      *starts_idea = true;
+    }
+    int last_letter = utf8[utf8.size() - 1];
+    if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
+      *ends_idea = true;
+    }
+  }
+}
+
+// =============== Implementation of RowScratchRegisters =====================
+/* static */
+void RowScratchRegisters::AppendDebugHeaderFields(
+    std::vector<STRING> *header) {
+  // Column titles matching the two fields emitted by AppendDebugInfo().
+  const char *kTitles[] = {"[lmarg,lind;rind,rmarg]", "model"};
+  for (const char *title : kTitles) {
+    header->push_back(title);
+  }
+}
+
+// Append two debug fields for this row: the margin/indent quadruple and a
+// compact "<linetype>:<models>" summary where strong models show as their
+// 1-based index in theory, crowns as CrL/CrR, and no models as "0".
+void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
+                                          std::vector<STRING> *dbg) const {
+  char s[30];
+  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
+           lmargin_, lindent_, rindent_, rmargin_);
+  dbg->push_back(s);
+  STRING model_string;
+  model_string += static_cast<char>(GetLineType());
+  model_string += ":";
+
+  int model_numbers = 0;
+  for (int h = 0; h < hypotheses_.size(); h++) {
+    // Model-less hypotheses are summarized by the line type alone.
+    if (hypotheses_[h].model == nullptr)
+      continue;
+    if (model_numbers > 0)
+      model_string += ",";
+    if (StrongModel(hypotheses_[h].model)) {
+      model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
+    } else if (hypotheses_[h].model == kCrownLeft) {
+      model_string += "CrL";
+    } else if (hypotheses_[h].model == kCrownRight) {
+      model_string += "CrR";
+    }
+    model_numbers++;
+  }
+  if (model_numbers == 0)
+    model_string += "0";
+
+  dbg->push_back(model_string);
+}
+
+// Reset the scratch registers for a fresh row: indents start at the row's
+// measured pixel distances and margins start at zero.
+void RowScratchRegisters::Init(const RowInfo &row) {
+  ri_ = &row;
+  lmargin_ = 0;
+  rmargin_ = 0;
+  lindent_ = row.pix_ldistance;
+  rindent_ = row.pix_rdistance;
+}
+
+// Summarize all hypotheses for this row: LT_UNKNOWN when none exist,
+// LT_MULTIPLE when both START and BODY are present, else the single kind.
+LineType RowScratchRegisters::GetLineType() const {
+  if (hypotheses_.empty())
+    return LT_UNKNOWN;
+  bool has_start = false;
+  bool has_body = false;
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    switch (hypotheses_[i].ty) {
+      case LT_START: has_start = true; break;
+      case LT_BODY: has_body = true; break;
+      default:
+        tprintf("Encountered bad value in hypothesis list: %c\n",
+                hypotheses_[i].ty);
+        break;
+    }
+  }
+  if (has_start && has_body)
+    return LT_MULTIPLE;
+  return has_start ? LT_START : LT_BODY;
+}
+
+// As GetLineType() above, but considering only hypotheses attached to the
+// given model.
+LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
+  if (hypotheses_.empty())
+    return LT_UNKNOWN;
+  bool has_start = false;
+  bool has_body = false;
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    // Ignore hypotheses belonging to other models.
+    if (hypotheses_[i].model != model)
+      continue;
+    switch (hypotheses_[i].ty) {
+      case LT_START: has_start = true; break;
+      case LT_BODY: has_body = true; break;
+      default:
+        tprintf("Encountered bad value in hypothesis list: %c\n",
+                hypotheses_[i].ty);
+        break;
+    }
+  }
+  if (has_start && has_body)
+    return LT_MULTIPLE;
+  return has_start ? LT_START : LT_BODY;
+}
+
+// Record a model-less START hypothesis, warning when it contradicts an
+// existing BODY classification.
+void RowScratchRegisters::SetStartLine() {
+  const LineType lt = GetLineType();
+  if (lt != LT_UNKNOWN && lt != LT_START) {
+    tprintf("Trying to set a line to be START when it's already BODY.\n");
+  }
+  if (lt == LT_UNKNOWN || lt == LT_BODY) {
+    hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr));
+  }
+}
+
+// Record a model-less BODY hypothesis, warning when it contradicts an
+// existing START classification.
+void RowScratchRegisters::SetBodyLine() {
+  const LineType lt = GetLineType();
+  if (lt != LT_UNKNOWN && lt != LT_BODY) {
+    tprintf("Trying to set a line to be BODY when it's already START.\n");
+  }
+  if (lt == LT_UNKNOWN || lt == LT_START) {
+    hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr));
+  }
+}
+
+// Attach a model-backed START hypothesis; it supersedes (and removes) any
+// model-less START hypothesis previously recorded.
+void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
+  hypotheses_.push_back_new(LineHypothesis(LT_START, model));
+  const int naked = hypotheses_.get_index(LineHypothesis(LT_START, nullptr));
+  if (naked >= 0)
+    hypotheses_.remove(naked);
+}
+
+// Attach a model-backed BODY hypothesis; it supersedes (and removes) any
+// model-less BODY hypothesis previously recorded.
+void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
+  hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
+  const int naked = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr));
+  if (naked >= 0)
+    hypotheses_.remove(naked);
+}
+
+// Collect the strong models under which this row could start a paragraph.
+void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    const LineHypothesis &hyp = hypotheses_[i];
+    if (hyp.ty == LT_START && StrongModel(hyp.model))
+      models->push_back_new(hyp.model);
+  }
+}
+
+// Collect every hypothesis backed by a strong (real) paragraph model.
+void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    const ParagraphModel *model = hypotheses_[i].model;
+    if (StrongModel(model))
+      models->push_back_new(model);
+  }
+}
+
+// Collect every hypothesis with any model at all (crowns included).
+void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
+  for (int i = 0; i < hypotheses_.size(); i++) {
+    const ParagraphModel *model = hypotheses_[i].model;
+    if (model != nullptr)
+      models->push_back_new(model);
+  }
+}
+
+// Return the model (possibly nullptr) of the sole hypothesis, but only when
+// exactly one exists and it is a START; otherwise nullptr.
+const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
+  if (hypotheses_.size() == 1 && hypotheses_[0].ty == LT_START)
+    return hypotheses_[0].model;
+  return nullptr;
+}
+
+// Return the model (possibly nullptr) of the sole hypothesis, but only when
+// exactly one exists and it is a BODY; otherwise nullptr.
+const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
+  if (hypotheses_.size() == 1 && hypotheses_[0].ty == LT_BODY)
+    return hypotheses_[0].model;
+  return nullptr;
+}
+
+// Discard any hypotheses whose model is not in the given list.
+// An empty filter list means "keep everything".
+void RowScratchRegisters::DiscardNonMatchingHypotheses(
+    const SetOfModels &models) {
+  if (models.empty())
+    return;
+  // Walk backwards so removals do not disturb the unvisited indices.
+  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
+    if (models.contains(hypotheses_[h].model))
+      continue;
+    hypotheses_.remove(h);
+  }
+}
+
+// ============ Geometry based Paragraph Detection Algorithm =================
+
+// A cluster of indent samples: its representative center value and how
+// many samples landed in it.
+struct Cluster {
+  Cluster() : center(0), count(0) {}
+  Cluster(int cen, int num) : center(cen), count(num) {}
+
+  int center;  // The center of the cluster.
+  int count;  // The number of entries within the cluster.
+};
+
+// Accumulates integer samples and greedily groups them into clusters no
+// wider than max_cluster_width (see GetClusters below).
+class SimpleClusterer {
+ public:
+  explicit SimpleClusterer(int max_cluster_width)
+      : max_cluster_width_(max_cluster_width) {}
+  void Add(int value) { values_.push_back(value); }
+  int size() const { return values_.size(); }
+  void GetClusters(GenericVector<Cluster> *clusters);
+
+ private:
+  int max_cluster_width_;  // Maximum span (hi - lo) of a single cluster.
+  GenericVector<int> values_;  // Raw samples; sorted in place by GetClusters.
+};
+
+// Return the index of the cluster whose center is nearest to value
+// (earliest index wins ties).
+static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
+  int best = 0;
+  for (int i = 1; i < clusters.size(); i++) {
+    const int delta = abs(value - clusters[i].center);
+    if (delta < abs(value - clusters[best].center))
+      best = i;
+  }
+  return best;
+}
+
+// Greedy single-pass clustering: sort the samples, then sweep left to
+// right, starting a new cluster whenever the next value exceeds the
+// current cluster's lowest value by more than max_cluster_width_. Each
+// cluster is reported as (midpoint of lo/hi, number of samples).
+void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
+  clusters->clear();
+  values_.sort();
+  for (int i = 0; i < values_.size();) {
+    int orig_i = i;
+    int lo = values_[i];
+    int hi = lo;
+    while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
+      hi = values_[i];
+    }
+    clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
+  }
+}
+
+// Calculate left- and right-indent tab stop values seen in
+// rows[row_start, row_end) given a tolerance of tolerance.
+// Strategy: cluster all indents, drop rows whose indents are rare on both
+// sides ("stray" lines like page numbers), recluster, then optionally
+// re-add the strays or prune a least-frequent stop (details inline).
+static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
+                              int row_start, int row_end, int tolerance,
+                              GenericVector<Cluster> *left_tabs,
+                              GenericVector<Cluster> *right_tabs) {
+  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
+    return;
+  // First pass: toss all left and right indents into clusterers.
+  SimpleClusterer initial_lefts(tolerance);
+  SimpleClusterer initial_rights(tolerance);
+  GenericVector<Cluster> initial_left_tabs;
+  GenericVector<Cluster> initial_right_tabs;
+  for (int i = row_start; i < row_end; i++) {
+    initial_lefts.Add((*rows)[i].lindent_);
+    initial_rights.Add((*rows)[i].rindent_);
+  }
+  initial_lefts.GetClusters(&initial_left_tabs);
+  initial_rights.GetClusters(&initial_right_tabs);
+
+  // Second pass: cluster only lines that are not "stray"
+  // An example of a stray line is a page number -- a line whose start
+  // and end tab-stops are far outside the typical start and end tab-stops
+  // for the block.
+  // Put another way, we only cluster data from lines whose start or end
+  // tab stop is frequent.
+  SimpleClusterer lefts(tolerance);
+  SimpleClusterer rights(tolerance);
+
+  // Outlier elimination. We might want to switch this to test outlier-ness
+  // based on how strange a position an outlier is in instead of or in addition
+  // to how rare it is. These outliers get re-added if we end up having too
+  // few tab stops to work with, however.
+  int infrequent_enough_to_ignore = 0;
+  if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
+  if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
+
+  for (int i = row_start; i < row_end; i++) {
+    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
+    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
+    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
+        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
+      lefts.Add((*rows)[i].lindent_);
+      rights.Add((*rows)[i].rindent_);
+    }
+  }
+  lefts.GetClusters(left_tabs);
+  rights.GetClusters(right_tabs);
+
+  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
+      (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
+    // One side is really ragged, and the other only has one tab stop,
+    // so those "insignificant outliers" are probably important, actually.
+    // This often happens on a page of an index. Add back in the ones
+    // we omitted in the first pass.
+    for (int i = row_start; i < row_end; i++) {
+      int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
+      int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
+      if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
+            initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
+        lefts.Add((*rows)[i].lindent_);
+        rights.Add((*rows)[i].rindent_);
+      }
+    }
+  }
+  // Recompute the clusters (a no-op unless outliers were re-added above).
+  lefts.GetClusters(left_tabs);
+  rights.GetClusters(right_tabs);
+
+  // If one side is almost a two-indent aligned side, and the other clearly
+  // isn't, try to prune out the least frequent tab stop from that side.
+  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
+    int to_prune = -1;
+    for (int i = left_tabs->size() - 1; i >= 0; i--) {
+      if (to_prune < 0 ||
+          (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
+        to_prune = i;
+      }
+    }
+    if (to_prune >= 0 &&
+        (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
+      left_tabs->remove(to_prune);
+    }
+  }
+  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
+    int to_prune = -1;
+    for (int i = right_tabs->size() - 1; i >= 0; i--) {
+      if (to_prune < 0 ||
+          (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
+        to_prune = i;
+      }
+    }
+    if (to_prune >= 0 &&
+        (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
+      right_tabs->remove(to_prune);
+    }
+  }
+}
+
+// Given a paragraph model mark rows[row_start, row_end) as said model
+// start or body lines.
+//
+// Case 1: model->first_indent_ != model->body_indent_
+// Differentiating the paragraph start lines from the paragraph body lines in
+// this case is easy, we just see how far each line is indented.
+//
+// Case 2: model->first_indent_ == model->body_indent_
+// Here, we find end-of-paragraph lines by looking for "short lines."
+// What constitutes a "short line" changes depending on whether the text
+// ragged-right[left] or fully justified (aligned left and right).
+//
+// Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
+// We have a new paragraph if the first word of this line would
+// have fit at the end of the previous line.
+//
+// Case 2b: Fully Justified. (eop_threshold > 0)
+// We mark a line as short (end of paragraph) if the offside indent
+// is greater than eop_threshold.
+// See the case analysis in the comment block above for the meaning of
+// eop_threshold. Note: the ltr parameter is currently unused here.
+static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
+                              int row_start, int row_end,
+                              const ParagraphModel *model,
+                              bool ltr, int eop_threshold) {
+  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
+    return;
+  for (int row = row_start; row < row_end; row++) {
+    bool valid_first = ValidFirstLine(rows, row, model);
+    bool valid_body = ValidBodyLine(rows, row, model);
+    if (valid_first && !valid_body) {
+      // Case 1: the indent only fits a paragraph-start line.
+      (*rows)[row].AddStartLine(model);
+    } else if (valid_body && !valid_first) {
+      (*rows)[row].AddBodyLine(model);
+    } else if (valid_body && valid_first) {
+      // Case 2: ambiguous indent; decide from whether the previous line
+      // ended its paragraph.
+      bool after_eop = (row == row_start);
+      if (row > row_start) {
+        if (eop_threshold > 0) {
+          // Case 2b: fully justified; a short previous line (offside
+          // indent beyond eop_threshold) marks an end of paragraph.
+          if (model->justification() == JUSTIFICATION_LEFT) {
+            after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
+          } else {
+            after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
+          }
+        } else {
+          // Case 2a: ragged text; the previous line ended a paragraph if
+          // this line's first word would still have fit on it.
+          after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
+                                            model->justification());
+        }
+      }
+      if (after_eop) {
+        (*rows)[row].AddStartLine(model);
+      } else {
+        (*rows)[row].AddBodyLine(model);
+      }
+    } else {
+      // Do nothing. Stray row.
+    }
+  }
+}
+
// GeometricClassifierState holds all of the information we'll use while
// trying to determine a paragraph model for the text lines in a block of
// text:
//   + the rows under consideration [row_start, row_end)
//   + the common left- and right-indent tab stops
//   + does the block start out left-to-right or right-to-left
// Further, this struct holds the data we amass for the (single) ParagraphModel
// we'll assign to the text lines (assuming we get that far).
struct GeometricClassifierState {
  GeometricClassifierState(int dbg_level,
                           GenericVector<RowScratchRegisters> *r,
                           int r_start, int r_end)
      : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
    // The median inter-word space doubles as the alignment tolerance used
    // both for tab-stop clustering and for the eventual ParagraphModel.
    tolerance = InterwordSpace(*r, r_start, r_end);
    CalculateTabStops(r, r_start, r_end, tolerance,
                      &left_tabs, &right_tabs);
    if (debug_level >= 3) {
      tprintf("Geometry: TabStop cluster tolerance = %d; "
              "%d left tabs; %d right tabs\n",
              tolerance, left_tabs.size(), right_tabs.size());
    }
    // Reading direction is taken from the first row only.
    ltr = (*r)[r_start].ri_->ltr;
  }

  // Commit to left justification; the block margin is taken from the
  // first row's left margin.
  void AssumeLeftJustification() {
    just = tesseract::JUSTIFICATION_LEFT;
    margin = (*rows)[row_start].lmargin_;
  }

  // Commit to right justification; the block margin is taken from the
  // first row's right margin.
  void AssumeRightJustification() {
    just = tesseract::JUSTIFICATION_RIGHT;
    margin = (*rows)[row_start].rmargin_;
  }

  // Align tabs are the tab stops the text is aligned to.
  const GenericVector<Cluster> &AlignTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs;
    return left_tabs;
  }

  // Offside tabs are the tab stops opposite the tabs used to align the text.
  //
  // Note that for a left-to-right text which is aligned to the right such as
  // this function comment, the offside tabs are the horizontal tab stops
  // marking the beginning of ("Note", "this" and "marking").
  const GenericVector<Cluster> &OffsideTabs() const {
    if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs;
    return right_tabs;
  }

  // Return whether the i'th row extends from the leftmost left tab stop
  // to the right most right tab stop.
  bool IsFullRow(int i) const {
    return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
        ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
  }

  // Index (within AlignTabs()) of the tab stop closest to row_idx's
  // aligned-side indent.
  int AlignsideTabIndex(int row_idx) const {
    return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
  }

  // Given what we know about the paragraph justification (just), would the
  // first word of row_b have fit at the end of row_a?
  bool FirstWordWouldHaveFit(int row_a, int row_b) {
    return ::tesseract::FirstWordWouldHaveFit(
        (*rows)[row_a], (*rows)[row_b], just);
  }

  void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }

  // Debug helper: report why classification was abandoned (when debugging
  // at min_debug_level or above) and dump the rows under consideration.
  void Fail(int min_debug_level, const char *why) const {
    if (debug_level < min_debug_level) return;
    tprintf("# %s\n", why);
    PrintRows();
  }

  // Package the parameters accumulated so far into a ParagraphModel.
  ParagraphModel Model() const {
    return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
  }

  // We print out messages with a debug level at least as great as debug_level.
  int debug_level = 0;

  // The Geometric Classifier was asked to find a single paragraph model
  // to fit the text rows (*rows)[row_start, row_end)
  GenericVector<RowScratchRegisters> *rows;
  int row_start = 0;
  int row_end = 0;

  // The amount by which we expect the text edge can vary and still be aligned.
  int tolerance = 0;

  // Is the script in this text block left-to-right?
  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
  bool ltr = false;

  // These left and right tab stops were determined to be the common tab
  // stops for the given text.
  GenericVector<Cluster> left_tabs;
  GenericVector<Cluster> right_tabs;

  // These are parameters we must determine to create a ParagraphModel.
  tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
  int margin = 0;
  int first_indent = 0;
  int body_indent = 0;

  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
  int eop_threshold = 0;
};
+
// Given a section of text where strong textual clues did not help identifying
// paragraph breaks, and for which the left and right indents have exactly
// three tab stops between them, attempt to find the paragraph breaks based
// solely on the outline of the text and whether the script is left-to-right.
//
// Algorithm Detail:
//   The selected rows are in the form of a rectangle except
//   for some number of "short lines" of the same length:
//
//   (A1)  xxxxxxxxxxxxx   (B1)  xxxxxxxxxxxx
//         xxxxxxxxxxx           xxxxxxxxxx     # A "short" line.
//         xxxxxxxxxxxxx         xxxxxxxxxxxx
//         xxxxxxxxxxxxx         xxxxxxxxxxxx
//
//   We have a slightly different situation if the only short
//   line is at the end of the excerpt.
//
//   (A2)  xxxxxxxxxxxxx   (B2)  xxxxxxxxxxxx
//         xxxxxxxxxxxxx         xxxxxxxxxxxx
//         xxxxxxxxxxxxx         xxxxxxxxxxxx
//         xxxxxxxxxxx           xxxxxxxxxx    # A "short" line.
//
// We'll interpret these as follows based on the reasoning in the comment for
// GeometricClassify():
//   [script direction: first indent, body indent]
//   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0
//   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0
static void GeometricClassifyThreeTabStopTextBlock(
    int debug_level,
    GeometricClassifierState &s,
    ParagraphTheory *theory) {
  // Count the rows that span from the leftmost left tab stop to the
  // rightmost right tab stop ("full" rows), and note whether the last
  // row of the block is one of them.
  int num_rows = s.row_end - s.row_start;
  int num_full_rows = 0;
  int last_row_full = 0;
  for (int i = s.row_start; i < s.row_end; i++) {
    if (s.IsFullRow(i)) {
      num_full_rows++;
      if (i == s.row_end - 1) last_row_full++;
    }
  }

  // If fewer than 70% of the rows are full, short lines dominate and we
  // cannot use them as reliable end-of-paragraph markers.
  if (num_full_rows < 0.7 * num_rows) {
    s.Fail(1, "Not enough full lines to know which lines start paras.");
    return;
  }

  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
  s.eop_threshold = 0;

  // With only three tab stops, there is not enough outline variety to
  // deduce the alignment, so guess it from the script direction.
  if (s.ltr) {
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (debug_level > 0) {
    tprintf("# Not enough variety for clear outline classification. "
            "Guessing these are %s aligned based on script.\n",
            s.ltr ? "left" : "right");
    s.PrintRows();
  }

  if (s.AlignTabs().size() == 2) {  // case A1 or A2
    // Two tab stops on the aligned side: the outer one is the first-line
    // indent, the inner one is the body indent.
    s.first_indent = s.AlignTabs()[1].center;
    s.body_indent = s.AlignTabs()[0].center;
  } else {  // case B1 or B2
    if (num_rows - 1 == num_full_rows - last_row_full) {
      // case B2
      // The only short line is the last one, so this looks like the tail
      // of a flush paragraph: mark it with a weak "crown" pseudo-model
      // rather than inventing a strong flush model.
      const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
      (*s.rows)[s.row_start].AddStartLine(model);
      for (int i = s.row_start + 1; i < s.row_end; i++) {
        (*s.rows)[i].AddBodyLine(model);
      }
      return;
    } else {
      // case B1
      // Flush on the aligned side with short offside lines scattered
      // through the block: treat as fully justified, with the midpoint of
      // the two offside tab stops as the end-of-paragraph threshold.
      s.first_indent = s.body_indent = s.AlignTabs()[0].center;
      s.eop_threshold = (s.OffsideTabs()[0].center +
                         s.OffsideTabs()[1].center) / 2;
    }
  }
  const ParagraphModel *model = theory->AddModel(s.Model());
  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
                    s.ltr, s.eop_threshold);
  return;
}
+
// This function is called if strong textual clues were not available, but
// the caller hopes that the paragraph breaks will be super obvious just
// by the outline of the text.
//
// The particularly difficult case is figuring out what's going on if you
// don't have enough short paragraph end lines to tell us what's going on.
//
// For instance, let's say you have the following outline:
//
//   (A1)  xxxxxxxxxxxxxxxxxxxxxx
//         xxxxxxxxxxxxxxxxxxxx
//         xxxxxxxxxxxxxxxxxxxxxx
//         xxxxxxxxxxxxxxxxxxxxxx
//
// Even if we know that the text is left-to-right and so will probably be
// left-aligned, both of the following are possible texts:
//
//   (A1a)  1. Here our list item
//             with two full lines.
//          2. Here a second item.
//          3. Here our third one.
//
//   (A1b)  so ends paragraph one.
//          Here starts another
//          paragraph we want to
//          read. This continues
//
// These examples are obvious from the text and should have been caught
// by the StrongEvidenceClassify pass. However, for languages where we don't
// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
// it's worth guessing that (A1b) is the correct interpretation if there are
// far more "full" lines than "short" lines.
static void GeometricClassify(int debug_level,
                              GenericVector<RowScratchRegisters> *rows,
                              int row_start, int row_end,
                              ParagraphTheory *theory) {
  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
    return;
  if (debug_level > 1) {
    tprintf("###############################################\n");
    tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
            row_start, row_end);
    tprintf("###############################################\n");
  }
  // Percentile 10 lets a stray character poking into the margin be
  // treated as an outlier rather than widening the whole block.
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);

  GeometricClassifierState s(debug_level, rows, row_start, row_end);
  // Too many tab stops on both sides: the outline alone is ambiguous.
  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
    s.Fail(2, "Too much variety for simple outline classification.");
    return;
  }
  // One tab stop per side: a plain rectangle tells us nothing.
  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
    s.Fail(1, "Not enough variety for simple outline classification.");
    return;
  }
  // Exactly three tab stops total is a special case; see the A1/A2/B1/B2
  // discussion on GeometricClassifyThreeTabStopTextBlock().
  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
    GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
    return;
  }

  // At this point, we know that one side has at least two tab stops, and the
  // other side has one or two tab stops.
  // Left to determine:
  //   (1) Which is the body indent and which is the first line indent?
  //   (2) Is the text fully justified?

  // If one side happens to have three or more tab stops, assume that side
  // is opposite of the aligned side.
  if (s.right_tabs.size() > 2) {
    s.AssumeLeftJustification();
  } else if (s.left_tabs.size() > 2) {
    s.AssumeRightJustification();
  } else if (s.ltr) {  // guess based on script direction
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (s.AlignTabs().size() == 2) {
    // For each tab stop on the aligned side, how many of them appear
    // to be paragraph start lines? [first lines]
    int firsts[2] = {0, 0};
    // Count the first line as a likely paragraph start line.
    firsts[s.AlignsideTabIndex(s.row_start)]++;
    // For each line, if the first word would have fit on the previous
    // line count it as a likely paragraph start line.
    bool jam_packed = true;
    for (int i = s.row_start + 1; i < s.row_end; i++) {
      if (s.FirstWordWouldHaveFit(i - 1, i)) {
        firsts[s.AlignsideTabIndex(i)]++;
        jam_packed = false;
      }
    }
    // Make an extra accounting for the last line of the paragraph just
    // in case it's the only short line in the block. That is, take its
    // first word as typical and see if this looks like the *last* line
    // of a paragraph. If so, mark the *other* indent as probably a first.
    if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
      firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
    }

    // What fraction of the lines at each tab stop look like starts?
    int percent0firsts, percent1firsts;
    percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
    percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;

    // TODO(eger): Tune these constants if necessary.
    if ((percent0firsts < 20 && 30 < percent1firsts) ||
        percent0firsts + 30 < percent1firsts) {
      // Tab stop [1] starts paragraphs much more often: it's the first
      // line indent; [0] is the body indent.
      s.first_indent = s.AlignTabs()[1].center;
      s.body_indent = s.AlignTabs()[0].center;
    } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
               percent1firsts + 30 < percent0firsts) {
      // The mirror case: [0] is the first line indent.
      s.first_indent = s.AlignTabs()[0].center;
      s.body_indent = s.AlignTabs()[1].center;
    } else {
      // Ambiguous! Probably lineated (poetry)
      if (debug_level > 1) {
        tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
                s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[0].center, percent0firsts);
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[1].center, percent1firsts);
        s.PrintRows();
      }
      return;
    }
  } else {
    // There's only one tab stop for the "aligned to" side.
    s.first_indent = s.body_indent = s.AlignTabs()[0].center;
  }

  // At this point, we have our model.
  const ParagraphModel *model = theory->AddModel(s.Model());

  // Now all we have to do is figure out if the text is fully justified or not.
  // eop_threshold: default to fully justified unless we see evidence below.
  // See description on MarkRowsWithModel()
  s.eop_threshold =
      (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
  // If the text is not fully justified, re-set the eop_threshold to 0.
  if (s.AlignTabs().size() == 2) {
    // Paragraphs with a paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (ValidFirstLine(s.rows, i + 1, model) &&
          !NearlyEqual(s.OffsideTabs()[0].center,
                       (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  } else {
    // Paragraphs with no paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (!s.FirstWordWouldHaveFit(i, i + 1) &&
          !NearlyEqual(s.OffsideTabs()[0].center,
                       (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  }
  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
}
+
+// =============== Implementation of ParagraphTheory =====================
+
+const ParagraphModel* ParagraphTheory::AddModel(const ParagraphModel &model) {
+ for (const auto& m : *models_) {
+ if (m->Comparable(model)) {
+ return m;
+ }
+ }
+ auto *m = new ParagraphModel(model);
+ models_->push_back(m);
+ models_we_added_.push_back_new(m);
+ return m;
+}
+
+void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
+ size_t w = 0;
+ for (size_t r = 0; r < models_->size(); r++) {
+ ParagraphModel* m = (*models_)[r];
+ if (!used_models.contains(m) && models_we_added_.contains(m)) {
+ delete m;
+ } else {
+ if (r > w) {
+ (*models_)[w] = m;
+ }
+ w++;
+ }
+ }
+ models_->resize(w);
+}
+
+// Examine rows[start, end) and try to determine if an existing non-centered
+// paragraph model would fit them perfectly. If so, return a pointer to it.
+// If not, return nullptr.
+const ParagraphModel *ParagraphTheory::Fits(
+ const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
+ for (const auto* model : *models_) {
+ if (model->justification() != JUSTIFICATION_CENTER &&
+ RowsFitModel(rows, start, end, model))
+ return model;
+ }
+ return nullptr;
+}
+
+void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
+ for (const auto* model : *models_) {
+ if (model->justification() != JUSTIFICATION_CENTER)
+ models->push_back_new(model);
+ }
+}
+
+int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
+ int i = 0;
+ for (const auto* m : *models_) {
+ if (m == model)
+ return i;
+ i++;
+ }
+ return -1;
+}
+
+bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
+ int row, const ParagraphModel *model) {
+ if (!StrongModel(model)) {
+ tprintf("ValidFirstLine() should only be called with strong models!\n");
+ }
+ return StrongModel(model) &&
+ model->ValidFirstLine(
+ (*rows)[row].lmargin_, (*rows)[row].lindent_,
+ (*rows)[row].rindent_, (*rows)[row].rmargin_);
+}
+
+bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
+ int row, const ParagraphModel *model) {
+ if (!StrongModel(model)) {
+ tprintf("ValidBodyLine() should only be called with strong models!\n");
+ }
+ return StrongModel(model) &&
+ model->ValidBodyLine(
+ (*rows)[row].lmargin_, (*rows)[row].lindent_,
+ (*rows)[row].rindent_, (*rows)[row].rmargin_);
+}
+
+bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
+ int a, int b, const ParagraphModel *model) {
+ if (model != kCrownRight && model != kCrownLeft) {
+ tprintf("CrownCompatible() should only be called with crown models!\n");
+ return false;
+ }
+ auto &row_a = (*rows)[a];
+ auto &row_b = (*rows)[b];
+ if (model == kCrownRight) {
+ return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
+ row_b.rindent_ + row_b.rmargin_,
+ Epsilon(row_a.ri_->average_interword_space));
+ }
+ return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
+ row_b.lindent_ + row_b.lmargin_,
+ Epsilon(row_a.ri_->average_interword_space));
+}
+
+
+// =============== Implementation of ParagraphModelSmearer ====================
+
+ParagraphModelSmearer::ParagraphModelSmearer(
+ GenericVector<RowScratchRegisters> *rows,
+ int row_start, int row_end, ParagraphTheory *theory)
+ : theory_(theory), rows_(rows), row_start_(row_start),
+ row_end_(row_end) {
+ if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
+ row_start_ = 0;
+ row_end_ = 0;
+ return;
+ }
+ open_models_.resize(open_models_.size() + row_end - row_start + 2);
+}
+
+// see paragraphs_internal.h
+void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
+ SetOfModels no_models;
+ if (row_start < row_start_) row_start = row_start_;
+ if (row_end > row_end_) row_end = row_end_;
+
+ for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
+ row++) {
+ if ((*rows_)[row].ri_->num_words == 0) {
+ OpenModels(row + 1) = no_models;
+ } else {
+ SetOfModels &opened = OpenModels(row);
+ (*rows_)[row].StartHypotheses(&opened);
+
+ // Which models survive the transition from row to row + 1?
+ SetOfModels still_open;
+ for (int m = 0; m < opened.size(); m++) {
+ if (ValidFirstLine(rows_, row, opened[m]) ||
+ ValidBodyLine(rows_, row, opened[m])) {
+ // This is basic filtering; we check likely paragraph starty-ness down
+ // below in Smear() -- you know, whether the first word would have fit
+ // and such.
+ still_open.push_back_new(opened[m]);
+ }
+ }
+ OpenModels(row + 1) = still_open;
+ }
+ }
+}
+
// see paragraphs_internal.h
//
// Propagate ("smear") recently used paragraph models onto rows whose line
// type is still uncertain, re-deriving the set of open models whenever a
// row's hypotheses change.
void ParagraphModelSmearer::Smear() {
  CalculateOpenModels(row_start_, row_end_);

  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
  // we have multiple LT_START hypotheses), see if there's a model that
  // was recently used (an "open" model) which might model it well.
  for (int i = row_start_; i < row_end_; i++) {
    RowScratchRegisters &row = (*rows_)[i];
    if (row.ri_->num_words == 0)
      continue;

    // Step One:
    //   Figure out if there are "open" models which are left-aligned or
    //   right-aligned.  This is important for determining whether the
    //   "first" word in a row would fit at the "end" of the previous row.
    bool left_align_open = false;
    bool right_align_open = false;
    for (int m = 0; m < OpenModels(i).size(); m++) {
      switch (OpenModels(i)[m]->justification()) {
        case JUSTIFICATION_LEFT: left_align_open = true; break;
        case JUSTIFICATION_RIGHT: right_align_open = true; break;
        default: left_align_open = right_align_open = true;
      }
    }
    // Step Two:
    //   Use that knowledge to figure out if this row is likely to
    //   start a paragraph.
    bool likely_start;
    if (i == 0) {
      likely_start = true;
    } else {
      // When the alignment is unknown (both or neither side open), accept
      // evidence for a start under either justification.
      if ((left_align_open && right_align_open) ||
          (!left_align_open && !right_align_open)) {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
                                            JUSTIFICATION_LEFT) ||
                       LikelyParagraphStart((*rows_)[i - 1], row,
                                            JUSTIFICATION_RIGHT);
      } else if (left_align_open) {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
                                            JUSTIFICATION_LEFT);
      } else {
        likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
                                            JUSTIFICATION_RIGHT);
      }
    }

    // Step Three:
    //   If this text line seems like an obvious first line of an
    //   open model, or an obvious continuation of an existing
    //   modelled paragraph, mark it up.
    if (likely_start) {
      // Add Start Hypotheses for all Open models that fit.
      for (int m = 0; m < OpenModels(i).size(); m++) {
        if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
          row.AddStartLine(OpenModels(i)[m]);
        }
      }
    } else {
      // Add relevant body line hypotheses.
      // Candidate models come from the previous row's strong hypotheses,
      // or — for the very first row — from any non-centered model.
      SetOfModels last_line_models;
      if (i > 0) {
        (*rows_)[i - 1].StrongHypotheses(&last_line_models);
      } else {
        theory_->NonCenteredModels(&last_line_models);
      }
      for (int m = 0; m < last_line_models.size(); m++) {
        const ParagraphModel *model = last_line_models[m];
        if (ValidBodyLine(rows_, i, model))
          row.AddBodyLine(model);
      }
    }

    // Step Four:
    //   If we're still quite unsure about this line, go through all
    //   models in our theory and see if this row could be the start
    //   of any of our models.
    if (row.GetLineType() == LT_UNKNOWN ||
        (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
      SetOfModels all_models;
      theory_->NonCenteredModels(&all_models);
      for (int m = 0; m < all_models.size(); m++) {
        if (ValidFirstLine(rows_, i, all_models[m])) {
          row.AddStartLine(all_models[m]);
        }
      }
    }
    // Step Five:
    //   Since we may have updated the hypotheses about this row, we need
    //   to recalculate the Open models for the rest of rows[i + 1, row_end)
    if (row.GetLineType() != LT_UNKNOWN) {
      CalculateOpenModels(i + 1, row_end_);
    }
  }
}
+
+// ================ Main Paragraph Detection Algorithm =======================
+
+// Find out what ParagraphModels are actually used, and discard any
+// that are not.
+static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
+ ParagraphTheory *theory) {
+ SetOfModels used_models;
+ for (int i = 0; i < rows.size(); i++) {
+ rows[i].StrongHypotheses(&used_models);
+ }
+ theory->DiscardUnusedModels(used_models);
+}
+
// DowngradeWeakestToCrowns:
//   Forget any flush-{left, right} models unless we see two or more
//   of them in sequence.
//
// In pass 3, we start to classify even flush-left paragraphs (paragraphs
// where the first line and body indent are the same) as having proper Models.
// This is generally dangerous, since if you start imagining that flush-left
// is a typical paragraph model when it is not, it will lead you to chop normal
// indented paragraphs in the middle whenever a sentence happens to start on a
// new line (see "This" above). What to do?
// What we do is to take any paragraph which is flush left and is not
// preceded by another paragraph of the same model and convert it to a "Crown"
// paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
// for later. It means that the paragraph is flush, but it would be desirable
// to mark it as the same model as following text if it fits. This downgrade
// FlushLeft -> CrownLeft -> Model of following paragraph. Means that we
// avoid making flush left Paragraph Models whenever we see a top-of-the-page
// half-of-a-paragraph. and instead we mark it the same as normal body text.
//
// Implementation:
//
//   Comb backwards through the row scratch registers, and turn any
//   sequences of body lines of equivalent type abutted against the beginning
//   or a body or start line of a different type into a crown paragraph.
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
                                     GenericVector<RowScratchRegisters> *rows) {
  int start;
  for (int end = rows->size(); end > 0; end = start) {
    // Search back for a body line of a unique type.
    const ParagraphModel *model = nullptr;
    while (end > 0 &&
           (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
      end--;
    }
    if (end == 0) break;
    // Walk back over the run of body lines sharing that unique model.
    start = end - 1;
    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
      start--;  // walk back to the first line that is not the same body type.
    }
    // If the line just before the run is this model's start line and the
    // model is flush (first and body indent essentially equal), absorb it
    // into the run as well.
    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
        StrongModel(model) &&
        NearlyEqual(model->first_indent(), model->body_indent(),
                    model->tolerance())) {
      start--;
    }
    start++;
    // Now rows[start, end) is a sequence of unique body hypotheses of model.
    // Centered paragraphs are never downgraded.
    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
      continue;
    if (!StrongModel(model)) {
      // Already a crown model: extend the run upward over any rows whose
      // flush edge matches.
      while (start > 0 &&
             CrownCompatible(rows, start - 1, start, model))
        start--;
    }
    // Downgrade only runs that reach the top of the block or are not
    // preceded by a valid first line of the same model.
    if (start == 0 ||
        (!StrongModel(model)) ||
        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
      // crownify rows[start, end)
      const ParagraphModel *crown_model = model;
      if (StrongModel(model)) {
        if (model->justification() == JUSTIFICATION_LEFT)
          crown_model = kCrownLeft;
        else
          crown_model = kCrownRight;
      }
      (*rows)[start].SetUnknown();
      (*rows)[start].AddStartLine(crown_model);
      for (int row = start + 1; row < end; row++) {
        (*rows)[row].SetUnknown();
        (*rows)[row].AddBodyLine(crown_model);
      }
    }
  }
  // Any model that ended up with no strong hypotheses is now garbage.
  DiscardUnusedModels(*rows, theory);
}
+
+
+// Clear all hypotheses about lines [start, end) and reset margins.
+//
+// The empty space between the left of a row and the block boundary (and
+// similarly for the right) is split into two pieces: margin and indent.
+// In initial processing, we assume the block is tight and the margin for
+// all lines is set to zero. However, if our first pass does not yield
+// models for everything, it may be due to an inset paragraph like a
+// block-quote. In that case, we make a second pass over that unmarked
+// section of the page and reset the "margin" portion of the empty space
+// to the common amount of space at the ends of the lines under consid-
+// eration. This would be equivalent to percentile set to 0. However,
+// sometimes we have a single character sticking out in the right margin
+// of a text block (like the 'r' in 'for' on line 3 above), and we can
+// really just ignore it as an outlier. To express this, we allow the
+// user to specify the percentile (0..100) of indent values to use as
+// the common margin for each row in the run of rows[start, end).
+void RecomputeMarginsAndClearHypotheses(
+ GenericVector<RowScratchRegisters> *rows, int start, int end,
+ int percentile) {
+ if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
+ return;
+
+ int lmin, lmax, rmin, rmax;
+ lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
+ rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
+ for (int i = start; i < end; i++) {
+ RowScratchRegisters &sr = (*rows)[i];
+ sr.SetUnknown();
+ if (sr.ri_->num_words == 0)
+ continue;
+ UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
+ UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
+ }
+ STATS lefts(lmin, lmax + 1);
+ STATS rights(rmin, rmax + 1);
+ for (int i = start; i < end; i++) {
+ RowScratchRegisters &sr = (*rows)[i];
+ if (sr.ri_->num_words == 0)
+ continue;
+ lefts.add(sr.lmargin_ + sr.lindent_, 1);
+ rights.add(sr.rmargin_ + sr.rindent_, 1);
+ }
+ int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
+ int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
+ for (int i = start; i < end; i++) {
+ RowScratchRegisters &sr = (*rows)[i];
+ int ldelta = ignorable_left - sr.lmargin_;
+ sr.lmargin_ += ldelta;
+ sr.lindent_ -= ldelta;
+ int rdelta = ignorable_right - sr.rmargin_;
+ sr.rmargin_ += rdelta;
+ sr.rindent_ -= rdelta;
+ }
+}
+
+// Return the median inter-word space in rows[row_start, row_end).
+int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
+ int row_start, int row_end) {
+ if (row_end < row_start + 1) return 1;
+ int word_height = (rows[row_start].ri_->lword_box.height() +
+ rows[row_end - 1].ri_->lword_box.height()) / 2;
+ int word_width = (rows[row_start].ri_->lword_box.width() +
+ rows[row_end - 1].ri_->lword_box.width()) / 2;
+ STATS spacing_widths(0, 5 + word_width);
+ for (int i = row_start; i < row_end; i++) {
+ if (rows[i].ri_->num_words > 1) {
+ spacing_widths.add(rows[i].ri_->average_interword_space, 1);
+ }
+ }
+ int minimum_reasonable_space = word_height / 3;
+ if (minimum_reasonable_space < 2)
+ minimum_reasonable_space = 2;
+ int median = spacing_widths.median();
+ return (median > minimum_reasonable_space)
+ ? median : minimum_reasonable_space;
+}
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+ const RowScratchRegisters &after,
+ tesseract::ParagraphJustification justification) {
+ if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
+ return true;
+
+ if (justification == JUSTIFICATION_UNKNOWN) {
+ tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
+ }
+ int available_space;
+ if (justification == JUSTIFICATION_CENTER) {
+ available_space = before.lindent_ + before.rindent_;
+ } else {
+ available_space = before.OffsideIndent(justification);
+ }
+ available_space -= before.ri_->average_interword_space;
+
+ if (before.ri_->ltr)
+ return after.ri_->lword_box.width() < available_space;
+ return after.ri_->rword_box.width() < available_space;
+}
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing which way the text goes) in a left
+// or right alignment.
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+ const RowScratchRegisters &after) {
+ if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
+ return true;
+
+ int available_space = before.lindent_;
+ if (before.rindent_ > available_space)
+ available_space = before.rindent_;
+ available_space -= before.ri_->average_interword_space;
+
+ if (before.ri_->ltr)
+ return after.ri_->lword_box.width() < available_space;
+ return after.ri_->rword_box.width() < available_space;
+}
+
+static bool TextSupportsBreak(const RowScratchRegisters &before,
+ const RowScratchRegisters &after) {
+ if (before.ri_->ltr) {
+ return before.ri_->rword_likely_ends_idea &&
+ after.ri_->lword_likely_starts_idea;
+ } else {
+ return before.ri_->lword_likely_ends_idea &&
+ after.ri_->rword_likely_starts_idea;
+ }
+}
+
+static bool LikelyParagraphStart(const RowScratchRegisters &before,
+ const RowScratchRegisters &after,
+ tesseract::ParagraphJustification j) {
+ return before.ri_->num_words == 0 ||
+ (FirstWordWouldHaveFit(before, after, j) &&
+ TextSupportsBreak(before, after));
+}
+
+// Examine rows[start, end) and try to determine what sort of ParagraphModel
+// would fit them as a single paragraph.
+// If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
+// If the rows given could be a consistent start to a paragraph, set *consistent
+// true.
+static ParagraphModel InternalParagraphModelByOutline(
+ const GenericVector<RowScratchRegisters> *rows,
+ int start, int end, int tolerance, bool *consistent) {
+ int ltr_line_count = 0;
+ for (int i = start; i < end; i++) {
+ ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
+ }
+ bool ltr = (ltr_line_count >= (end - start) / 2);
+
+ *consistent = true;
+ if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
+ return ParagraphModel();
+
+ // Ensure the caller only passed us a region with a common rmargin and
+ // lmargin.
+ int lmargin = (*rows)[start].lmargin_;
+ int rmargin = (*rows)[start].rmargin_;
+ int lmin, lmax, rmin, rmax, cmin, cmax;
+ lmin = lmax = (*rows)[start + 1].lindent_;
+ rmin = rmax = (*rows)[start + 1].rindent_;
+ cmin = cmax = 0;
+ for (int i = start + 1; i < end; i++) {
+ if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
+ tprintf("Margins don't match! Software error.\n");
+ *consistent = false;
+ return ParagraphModel();
+ }
+ UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
+ UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
+ UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
+ }
+ int ldiff = lmax - lmin;
+ int rdiff = rmax - rmin;
+ int cdiff = cmax - cmin;
+ if (rdiff > tolerance && ldiff > tolerance) {
+ if (cdiff < tolerance * 2) {
+ if (end - start < 3)
+ return ParagraphModel();
+ return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
+ }
+ *consistent = false;
+ return ParagraphModel();
+ }
+ if (end - start < 3) // Don't return a model for two line paras.
+ return ParagraphModel();
+
+ // These booleans keep us from saying something is aligned left when the body
+ // left variance is too large.
+ bool body_admits_left_alignment = ldiff < tolerance;
+ bool body_admits_right_alignment = rdiff < tolerance;
+
+ ParagraphModel left_model =
+ ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
+ (lmin + lmax) / 2, tolerance);
+ ParagraphModel right_model =
+ ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
+ (rmin + rmax) / 2, tolerance);
+
+ // These booleans keep us from having an indent on the "wrong side" for the
+ // first line.
+ bool text_admits_left_alignment = ltr || left_model.is_flush();
+ bool text_admits_right_alignment = !ltr || right_model.is_flush();
+
+ // At least one of the edges is less than tolerance in variance.
+ // If the other is obviously ragged, it can't be the one aligned to.
+ // [Note the last line is included in this raggedness.]
+ if (tolerance < rdiff) {
+ if (body_admits_left_alignment && text_admits_left_alignment)
+ return left_model;
+ *consistent = false;
+ return ParagraphModel();
+ }
+ if (tolerance < ldiff) {
+ if (body_admits_right_alignment && text_admits_right_alignment)
+ return right_model;
+ *consistent = false;
+ return ParagraphModel();
+ }
+
+ // At this point, we know the body text doesn't vary much on either side.
+
+ // If the first line juts out oddly in one direction or the other,
+ // that likely indicates the side aligned to.
+ int first_left = (*rows)[start].lindent_;
+ int first_right = (*rows)[start].rindent_;
+
+ if (ltr && body_admits_left_alignment &&
+ (first_left < lmin || first_left > lmax))
+ return left_model;
+ if (!ltr && body_admits_right_alignment &&
+ (first_right < rmin || first_right > rmax))
+ return right_model;
+
+ *consistent = false;
+ return ParagraphModel();
+}
+
+// Examine rows[start, end) and try to determine what sort of ParagraphModel
+// would fit them as a single paragraph. If nothing fits,
+// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
+// output if we're debugging.
+static ParagraphModel ParagraphModelByOutline(
+ int debug_level,
+ const GenericVector<RowScratchRegisters> *rows,
+ int start, int end, int tolerance) {
+ bool unused_consistent;
+ ParagraphModel retval = InternalParagraphModelByOutline(
+ rows, start, end, tolerance, &unused_consistent);
+ if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
+ tprintf("Could not determine a model for this paragraph:\n");
+ PrintRowRange(*rows, start, end);
+ }
+ return retval;
+}
+
+// Do rows[start, end) form a single instance of the given paragraph model?
+bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
+ int start, int end, const ParagraphModel *model) {
+ if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
+ return false;
+ if (!ValidFirstLine(rows, start, model)) return false;
+ for (int i = start + 1 ; i < end; i++) {
+ if (!ValidBodyLine(rows, i, model)) return false;
+ }
+ return true;
+}
+
// Examine rows[row_start, row_end) as an independent section of text,
// and mark rows that are exceptionally clear as start-of-paragraph
// and paragraph-body lines.
//
// We presume that any lines surrounding rows[row_start, row_end) may
// have wildly different paragraph models, so we don't key any data off
// of those lines.
//
// We only take the very strongest signals, as we don't want to get
// confused and mark up centered text, poetry, or source code as
// clearly part of a typical paragraph.
static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
                               int row_start, int row_end) {
  // Record patently obvious body text: the line's first word neither looks
  // like the start of an idea nor could have fit at the end of the previous
  // line, so it must be a continuation of the previous line's sentence.
  for (int i = row_start + 1; i < row_end; i++) {
    const RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    // Judge "would the word have fit" against the script's natural margin.
    tesseract::ParagraphJustification typical_justification =
        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (!curr.ri_->rword_likely_starts_idea &&
        !curr.ri_->lword_likely_starts_idea &&
        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
      curr.SetBodyLine();
    }
  }

  // Record patently obvious start paragraph lines.
  //
  // It's an extremely good signal of the start of a paragraph that
  // the first word would have fit on the end of the previous line.
  // However, applying just that signal would have us mark random
  // start lines of lineated text (poetry and source code) and some
  // centered headings as paragraph start lines. Therefore, we use
  // a second qualification for a paragraph start: Not only should
  // the first word of this line have fit on the previous line,
  // but also, this line should go full to the right of the block,
  // disallowing a subsequent word from having fit on this line.

  // First row: no previous line to compare against, so rely purely on the
  // line being full-width and its edge word looking like an idea start.
  {
    RowScratchRegisters &curr = (*rows)[row_start];
    RowScratchRegisters &next = (*rows)[row_start + 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, next, j) &&
        (curr.ri_->lword_likely_starts_idea ||
         curr.ri_->rword_likely_starts_idea)) {
      curr.SetStartLine();
    }
  }
  // Middle rows: require both the full-width test (against the next line)
  // and the textual paragraph-start test (against the previous line).
  for (int i = row_start + 1; i < row_end - 1; i++) {
    RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    RowScratchRegisters &next = (*rows)[i + 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, next, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
  // Last row: there is no following line, so the full-width test compares
  // the line against itself.
  { // the short circuit at the top means we have at least two lines.
    RowScratchRegisters &prev = (*rows)[row_end - 2];
    RowScratchRegisters &curr = (*rows)[row_end - 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, curr, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
}
+
// Look for sequences of a start line followed by some body lines in
// rows[row_start, row_end) and create ParagraphModels for them if
// they seem coherent.
static void ModelStrongEvidence(int debug_level,
                                GenericVector<RowScratchRegisters> *rows,
                                int row_start, int row_end,
                                bool allow_flush_models,
                                ParagraphTheory *theory) {
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
    return;

  int start = row_start;
  while (start < row_end) {
    // Skip forward to the next line marked as a paragraph start.
    while (start < row_end && (*rows)[start].GetLineType() != LT_START)
      start++;
    if (start >= row_end - 1)
      break;

    // Tolerance for indent comparisons, derived from the interword space of
    // the first body line of the candidate paragraph.
    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
    int end = start;
    ParagraphModel last_model;
    bool next_consistent;
    // Greedily grow [start, end) one row at a time while each extension
    // still fits a single coherent paragraph outline.
    do {
      ++end;
      // rows[row, end) was consistent.
      // If rows[row, end + 1) is not consistent,
      // just model rows[row, end)
      if (end < row_end - 1) {
        RowScratchRegisters &next = (*rows)[end];
        LineType lt = next.GetLineType();
        // Accept a known body line, or an unknown line whose first word
        // could not have fit on the previous line (i.e. a continuation).
        next_consistent = lt == LT_BODY ||
            (lt == LT_UNKNOWN &&
             !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
      } else {
        next_consistent = false;
      }
      if (next_consistent) {
        ParagraphModel next_model = InternalParagraphModelByOutline(
            rows, start, end + 1, tolerance, &next_consistent);
        // Refuse to let the justification drift away from the script's
        // natural side once it has been established for this run.
        if (((*rows)[start].ri_->ltr &&
             last_model.justification() == JUSTIFICATION_LEFT &&
             next_model.justification() != JUSTIFICATION_LEFT) ||
            (!(*rows)[start].ri_->ltr &&
             last_model.justification() == JUSTIFICATION_RIGHT &&
             next_model.justification() != JUSTIFICATION_RIGHT)) {
          next_consistent = false;
        }
        last_model = next_model;
      } else {
        next_consistent = false;
      }
    } while (next_consistent && end < row_end);
    // At this point, rows[start, end) looked like it could have been a
    // single paragraph. If we can make a good ParagraphModel for it,
    // do so and mark this sequence with that model.
    if (end > start + 1) {
      // emit a new paragraph if we have more than one line.
      const ParagraphModel *model = nullptr;
      ParagraphModel new_model = ParagraphModelByOutline(
          debug_level, rows, start, end,
          Epsilon(InterwordSpace(*rows, start, end)));
      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
        // couldn't create a good model, oh well.
      } else if (new_model.is_flush()) {
        // A flush model (no first-line indent) is ambiguous evidence.
        if (end == start + 2) {
          // It's very likely we just got two paragraph starts in a row.
          end = start + 1;
        } else if (start == row_start) {
          // Mark this as a Crown.
          if (new_model.justification() == JUSTIFICATION_LEFT) {
            model = kCrownLeft;
          } else {
            model = kCrownRight;
          }
        } else if (allow_flush_models) {
          model = theory->AddModel(new_model);
        }
      } else {
        model = theory->AddModel(new_model);
      }
      if (model) {
        // Commit: mark the run as start + body lines under this model.
        (*rows)[start].AddStartLine(model);
        for (int i = start + 1; i < end; i++) {
          (*rows)[i].AddBodyLine(model);
        }
      }
    }
    start = end;
  }
}
+
+// We examine rows[row_start, row_end) and do the following:
+// (1) Clear all existing hypotheses for the rows being considered.
+// (2) Mark up any rows as exceptionally likely to be paragraph starts
+// or paragraph body lines as such using both geometric and textual
+// clues.
+// (3) Form models for any sequence of start + continuation lines.
+// (4) Smear the paragraph models to cover surrounding text.
+static void StrongEvidenceClassify(int debug_level,
+ GenericVector<RowScratchRegisters> *rows,
+ int row_start, int row_end,
+ ParagraphTheory *theory) {
+ if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
+ return;
+
+ if (debug_level > 1) {
+ tprintf("#############################################\n");
+ tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
+ tprintf("#############################################\n");
+ }
+
+ RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
+ MarkStrongEvidence(rows, row_start, row_end);
+
+ DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
+
+ // Create paragraph models.
+ ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
+
+ DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
+
+ // At this point, some rows are marked up as paragraphs with model numbers,
+ // and some rows are marked up as either LT_START or LT_BODY. Now let's
+ // smear any good paragraph hypotheses forward and backward.
+ ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
+ smearer.Smear();
+}
+
+static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
+ int row_start, int row_end,
+ ParagraphTheory *theory) {
+ for (int i = row_start + 1; i < row_end - 1; i++) {
+ if ((*rows)[i - 1].ri_->has_leaders &&
+ (*rows)[i].ri_->has_leaders &&
+ (*rows)[i + 1].ri_->has_leaders) {
+ const ParagraphModel *model = theory->AddModel(
+ ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
+ (*rows)[i].AddStartLine(model);
+ }
+ }
+}
+
+// Collect sequences of unique hypotheses in row registers and create proper
+// paragraphs for them, referencing the paragraphs in row_owners.
+static void ConvertHypothesizedModelRunsToParagraphs(
+ int debug_level,
+ GenericVector<RowScratchRegisters> &rows,
+ GenericVector<PARA *> *row_owners,
+ ParagraphTheory *theory) {
+ int end = rows.size();
+ int start;
+ for (; end > 0; end = start) {
+ start = end - 1;
+ const ParagraphModel *model = nullptr;
+ // TODO(eger): Be smarter about dealing with multiple hypotheses.
+ bool single_line_paragraph = false;
+ SetOfModels models;
+ rows[start].NonNullHypotheses(&models);
+ if (!models.empty()) {
+ model = models[0];
+ if (rows[start].GetLineType(model) != LT_BODY)
+ single_line_paragraph = true;
+ }
+ if (model && !single_line_paragraph) {
+ // walk back looking for more body lines and then a start line.
+ while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
+ // do nothing
+ }
+ if (start < 0 || rows[start].GetLineType(model) != LT_START) {
+ model = nullptr;
+ }
+ }
+ if (model == nullptr) {
+ continue;
+ }
+ // rows[start, end) should be a paragraph.
+ PARA *p = new PARA();
+ if (model == kCrownLeft || model == kCrownRight) {
+ p->is_very_first_or_continuation = true;
+ // Crown paragraph.
+ // If we can find an existing ParagraphModel that fits, use it,
+ // else create a new one.
+ for (int row = end; row < rows.size(); row++) {
+ if ((*row_owners)[row] &&
+ (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
+ (start == 0 ||
+ ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
+ model = (*row_owners)[row]->model;
+ break;
+ }
+ }
+ if (model == kCrownLeft) {
+ // No subsequent model fits, so cons one up.
+ model = theory->AddModel(ParagraphModel(
+ JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
+ 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
+ } else if (model == kCrownRight) {
+ // No subsequent model fits, so cons one up.
+ model = theory->AddModel(ParagraphModel(
+ JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_,
+ 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
+ }
+ }
+ rows[start].SetUnknown();
+ rows[start].AddStartLine(model);
+ for (int i = start + 1; i < end; i++) {
+ rows[i].SetUnknown();
+ rows[i].AddBodyLine(model);
+ }
+ p->model = model;
+ p->has_drop_cap = rows[start].ri_->has_drop_cap;
+ p->is_list_item =
+ model->justification() == JUSTIFICATION_RIGHT
+ ? rows[start].ri_->rword_indicates_list_item
+ : rows[start].ri_->lword_indicates_list_item;
+ for (int row = start; row < end; row++) {
+ if ((*row_owners)[row] != nullptr) {
+ tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
+ "more than once!\n");
+ delete (*row_owners)[row];
+ }
+ (*row_owners)[row] = p;
+ }
+ }
+}
+
// A half-open interval of row indices [begin, end).
struct Interval {
  Interval() = default;
  Interval(int b, int e) : begin(b), end(e) {}

  int begin = 0;  // first row in the interval
  int end = 0;    // one past the last row
};
+
+// Return whether rows[row] appears to be stranded, meaning that the evidence
+// for this row is very weak due to context. For instance, two lines of source
+// code may happen to be indented at the same tab vector as body text starts,
+// leading us to think they are two start-of-paragraph lines. This is not
+// optimal. However, we also don't want to mark a sequence of short dialog
+// as "weak," so our heuristic is:
+// (1) If a line is surrounded by lines of unknown type, it's weak.
+// (2) If two lines in a row are start lines for a given paragraph type, but
+// after that the same paragraph type does not continue, they're weak.
+static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows,
+ int row) {
+ SetOfModels row_models;
+ rows[row].StrongHypotheses(&row_models);
+
+ for (int m = 0; m < row_models.size(); m++) {
+ bool all_starts = rows[row].GetLineType();
+ int run_length = 1;
+ bool continues = true;
+ for (int i = row - 1; i >= 0 && continues; i--) {
+ SetOfModels models;
+ rows[i].NonNullHypotheses(&models);
+ switch (rows[i].GetLineType(row_models[m])) {
+ case LT_START: run_length++; break;
+ case LT_MULTIPLE: // explicit fall-through
+ case LT_BODY: run_length++; all_starts = false; break;
+ case LT_UNKNOWN: // explicit fall-through
+ default: continues = false;
+ }
+ }
+ continues = true;
+ for (int i = row + 1; i < rows.size() && continues; i++) {
+ SetOfModels models;
+ rows[i].NonNullHypotheses(&models);
+ switch (rows[i].GetLineType(row_models[m])) {
+ case LT_START: run_length++; break;
+ case LT_MULTIPLE: // explicit fall-through
+ case LT_BODY: run_length++; all_starts = false; break;
+ case LT_UNKNOWN: // explicit fall-through
+ default: continues = false;
+ }
+ }
+ if (run_length > 2 || (!all_starts && run_length > 1)) return false;
+ }
+ return true;
+}
+
// Go through rows[row_start, row_end) and gather up sequences that need better
// classification.
// + Sequences of non-empty rows without hypotheses.
// + Crown paragraphs not immediately followed by a strongly modeled line.
// + Single line paragraphs surrounded by text that doesn't match the
//   model.
static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
                             GenericVector<Interval> *to_fix,
                             int row_start, int row_end) {
  to_fix->clear();
  for (int i = row_start; i < row_end; i++) {
    bool needs_fixing = false;

    SetOfModels models;
    SetOfModels models_w_crowns;
    rows[i].StrongHypotheses(&models);
    rows[i].NonNullHypotheses(&models_w_crowns);
    if (models.empty() && !models_w_crowns.empty()) {
      // Crown paragraph. Is it followed by a modeled line?
      // Scan forward until we hit either a row with no hypotheses at all
      // (crown is dangling -> fix it) or a strongly modeled row (crown is
      // anchored -> leave it). If neither appears, needs_fixing stays false.
      for (int end = i + 1; end < rows.size(); end++) {
        SetOfModels end_models;
        SetOfModels strong_end_models;
        rows[end].NonNullHypotheses(&end_models);
        rows[end].StrongHypotheses(&strong_end_models);
        if (end_models.empty()) {
          needs_fixing = true;
          break;
        } else if (!strong_end_models.empty()) {
          needs_fixing = false;
          break;
        }
      }
    } else if (models.empty() && rows[i].ri_->num_words > 0) {
      // No models at all.
      needs_fixing = true;
    }

    // Even a strongly modeled row needs a second look if its evidence is
    // stranded (see RowIsStranded above).
    if (!needs_fixing && !models.empty()) {
      needs_fixing = RowIsStranded(rows, i);
    }

    if (needs_fixing) {
      // Extend the previous interval when this row is adjacent to it;
      // intervals are stored inclusive here and widened to half-open below.
      if (!to_fix->empty() && to_fix->back().end == i - 1)
        to_fix->back().end = i;
      else
        to_fix->push_back(Interval(i, i));
    }
  }
  // Convert inclusive intervals to half-open intervals.
  for (int i = 0; i < to_fix->size(); i++) {
    (*to_fix)[i].end = (*to_fix)[i].end + 1;
  }
}
+
+// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(
+ GenericVector<PARA *> *row_owners,
+ PARA_LIST *paragraphs) {
+ GenericVector<PARA *> &rows = *row_owners;
+ paragraphs->clear();
+ PARA_IT out(paragraphs);
+ PARA *formerly_null = nullptr;
+ for (int i = 0; i < rows.size(); i++) {
+ if (rows[i] == nullptr) {
+ if (i == 0 || rows[i - 1] != formerly_null) {
+ rows[i] = formerly_null = new PARA();
+ } else {
+ rows[i] = formerly_null;
+ continue;
+ }
+ } else if (i > 0 && rows[i - 1] == rows[i]) {
+ continue;
+ }
+ out.add_after_then_move(rows[i]);
+ }
+}
+
// Main entry point for Paragraph Detection Algorithm.
//
// Given a set of equally spaced textlines (described by row_infos),
// Split them into paragraphs.
//
// Output:
//   row_owners - one pointer for each row, to the paragraph it belongs to.
//   paragraphs - this is the actual list of PARA objects.
//   models - the list of paragraph models referenced by the PARA objects.
//            caller is responsible for deleting the models.
void DetectParagraphs(int debug_level,
                      std::vector<RowInfo> *row_infos,
                      GenericVector<PARA *> *row_owners,
                      PARA_LIST *paragraphs,
                      std::vector<ParagraphModel *> *models) {
  GenericVector<RowScratchRegisters> rows;
  ParagraphTheory theory(models);

  // Initialize row_owners to be a bunch of nullptr pointers.
  row_owners->init_to_size(row_infos->size(), nullptr);

  // Set up row scratch registers for the main algorithm.
  rows.init_to_size(row_infos->size(), RowScratchRegisters());
  for (int i = 0; i < row_infos->size(); i++) {
    rows[i].Init((*row_infos)[i]);
  }

  // Pass 1:
  //   Detect sequences of lines that all contain leader dots (.....)
  //   These are likely Tables of Contents.  If there are three text lines in
  //   a row with leader dots, it's pretty safe to say the middle one should
  //   be a paragraph of its own.
  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);

  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);

  GenericVector<Interval> leftovers;
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    // Pass 2a:
    //   Find any strongly evidenced start-of-paragraph lines.  If they're
    //   followed by two lines that look like body lines, make a paragraph
    //   model for that and see if that model applies throughout the text
    //   (that is, "smear" it).
    StrongEvidenceClassify(debug_level, &rows,
                           leftovers[i].begin, leftovers[i].end, &theory);

    // Pass 2b:
    //   If we had any luck in pass 2a, we got part of the page and didn't
    //   know how to classify a few runs of rows. Take the segments that
    //   didn't find a model and reprocess them individually.
    GenericVector<Interval> leftovers2;
    LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
    // Pass 2a was useful if it shrank the unclassified region at all, i.e.
    // the leftovers no longer cover the entire original interval.
    bool pass2a_was_useful = leftovers2.size() > 1 ||
        (leftovers2.size() == 1 &&
         (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
    if (pass2a_was_useful) {
      for (int j = 0; j < leftovers2.size(); j++) {
        StrongEvidenceClassify(debug_level, &rows,
                               leftovers2[j].begin, leftovers2[j].end,
                               &theory);
      }
    }
  }

  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);

  // Pass 3:
  //   These are the dregs for which we didn't have enough strong textual
  //   and geometric clues to form matching models for.  Let's see if
  //   the geometric clues are simple enough that we could just use those.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    GeometricClassify(debug_level, &rows,
                      leftovers[i].begin, leftovers[i].end, &theory);
  }

  // Undo any flush models for which there's little evidence.
  DowngradeWeakestToCrowns(debug_level, &theory, &rows);

  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);

  // Pass 4:
  //   Take everything that's still not marked up well and clear all markings.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
      rows[j].SetUnknown();
    }
  }

  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);

  // Convert all of the unique hypothesis runs to PARAs.
  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
                                           &theory);

  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);

  // Finally, clean up any dangling nullptr row paragraph parents.
  CanonicalizeDetectionResults(row_owners, paragraphs);
}
+
+// ============ Code interfacing with the rest of Tesseract ==================
+
// Fill in the text and bounding-box fields of *info from an iterator
// positioned on a text line, before any word recognition has run.
// Since no recognized text exists yet, each word is faked as "x"s so the
// downstream heuristics have something of roughly the right shape to chew on.
static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
                                                 RowInfo *info) {
  // Set up text, lword_text, and rword_text (mostly for debug printing).
  STRING fake_text;
  PageIterator pit(static_cast<const PageIterator&>(it));
  bool first_word = true;
  if (!pit.Empty(RIL_WORD)) {
    do {
      // One "x" per symbol; rword_text is reset at each word boundary so it
      // ends up holding only the last word's fake text.
      fake_text += "x";
      if (first_word) info->lword_text += "x";
      info->rword_text += "x";
      if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
          !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
        fake_text += " ";
        info->rword_text = "";
        first_word = false;
      }
    } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
             pit.Next(RIL_SYMBOL));
  }
  if (fake_text.size() == 0) return;

  // Prefix the fake text with spaces approximating the left indent.
  // NOTE(review): assumes average_interword_space >= 1 (it is set that way
  // in InitializeRowInfo) -- confirm no other caller passes 0.
  int lspaces = info->pix_ldistance / info->average_interword_space;
  for (int i = 0; i < lspaces; i++) {
    info->text += ' ';
  }
  info->text += fake_text;

  // Set up lword_box, rword_box, and num_words.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();

  // Walk the words of this row, remembering the first and last.
  WERD_RES *lword = nullptr;
  WERD_RES *rword = nullptr;
  info->num_words = 0;
  do {
    if (word_res) {
      if (!lword) lword = word_res;
      if (rword != word_res) info->num_words++;
      rword = word_res;
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);

  if (lword) info->lword_box = lword->word->bounding_box();
  if (rword) info->rword_box = rword->word->bounding_box();
}
+
+
// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
// detector RowInfo with all relevant information from the row.
static void InitializeRowInfo(bool after_recognition,
                              const MutableIterator &it, RowInfo *info) {
  // Geometric fields come from the underlying ROW when one exists;
  // otherwise fall back to harmless defaults.
  if (it.PageResIt()->row() != nullptr) {
    ROW *row = it.PageResIt()->row()->row;
    info->pix_ldistance = row->lmargin();
    info->pix_rdistance = row->rmargin();
    // Guarantee a positive interword space (used as a divisor below).
    info->average_interword_space =
        row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
    info->pix_xheight = row->x_height();
    info->has_leaders = false;
    info->has_drop_cap = row->has_drop_cap();
    info->ltr = true;  // set below depending on word scripts
  } else {
    info->pix_ldistance = info->pix_rdistance = 0;
    info->average_interword_space = 1;
    info->pix_xheight = 1.0;
    info->has_leaders = false;
    info->has_drop_cap = false;
    info->ltr = true;
  }

  // Reset the textual signal fields to their defaults.
  info->num_words = 0;
  info->lword_indicates_list_item = false;
  info->lword_likely_starts_idea = false;
  info->lword_likely_ends_idea = false;
  info->rword_indicates_list_item = false;
  info->rword_likely_starts_idea = false;
  info->rword_likely_ends_idea = false;
  info->has_leaders = false;
  info->ltr = true;

  // Before recognition there is no real text; fake it and return.
  if (!after_recognition) {
    InitializeTextAndBoxesPreRecognition(it, info);
    return;
  }
  info->text = "";
  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
  int trailing_ws_idx = strlen(text.get());  // strip trailing space
  while (trailing_ws_idx > 0 &&
         // isspace() only takes ASCII
         isascii(text[trailing_ws_idx - 1]) &&
         isspace(text[trailing_ws_idx - 1]))
    trailing_ws_idx--;
  if (trailing_ws_idx > 0) {
    // Prefix spaces approximating the left indent, then the stripped text.
    int lspaces = info->pix_ldistance / info->average_interword_space;
    for (int i = 0; i < lspaces; i++)
      info->text += ' ';
    for (int i = 0; i < trailing_ws_idx; i++)
      info->text += text[i];
  }

  if (info->text.size() == 0) {
    return;
  }

  // Collect the row's recognized words, counting LTR/RTL characters and
  // leader (repeated-character) words along the way.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  GenericVector<WERD_RES *> werds;
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();
  int num_leaders = 0;
  int ltr = 0;
  int rtl = 0;
  do {
    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
      werds.push_back(word_res);
      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
      if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);
  // Majority vote on direction; ties count as left-to-right.
  info->ltr = ltr >= rtl;
  info->has_leaders = num_leaders > 3;
  info->num_words = werds.size();
  if (!werds.empty()) {
    // Derive the list-item / idea-boundary signals from the edge words.
    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
    info->lword_text = lword->best_choice->unichar_string().c_str();
    info->rword_text = rword->best_choice->unichar_string().c_str();
    info->lword_box = lword->word->bounding_box();
    info->rword_box = rword->word->bounding_box();
    LeftWordAttributes(lword->uch_set, lword->best_choice,
                       info->lword_text,
                       &info->lword_indicates_list_item,
                       &info->lword_likely_starts_idea,
                       &info->lword_likely_ends_idea);
    RightWordAttributes(rword->uch_set, rword->best_choice,
                        info->rword_text,
                        &info->rword_indicates_list_item,
                        &info->rword_likely_starts_idea,
                        &info->rword_likely_ends_idea);
  }
}
+
+// This is called after rows have been identified and words are recognized.
+// Much of this could be implemented before word recognition, but text helps
+// to identify bulleted lists and gives good signals for sentence boundaries.
+void DetectParagraphs(int debug_level,
+ bool after_text_recognition,
+ const MutableIterator *block_start,
+ std::vector<ParagraphModel *> *models) {
+ // Clear out any preconceived notions.
+ if (block_start->Empty(RIL_TEXTLINE)) {
+ return;
+ }
+ BLOCK *block = block_start->PageResIt()->block()->block;
+ block->para_list()->clear();
+ bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
+
+ // Convert the Tesseract structures to RowInfos
+ // for the paragraph detection algorithm.
+ MutableIterator row(*block_start);
+ if (row.Empty(RIL_TEXTLINE))
+ return; // end of input already.
+
+ std::vector<RowInfo> row_infos;
+ do {
+ if (!row.PageResIt()->row())
+ continue; // empty row.
+ row.PageResIt()->row()->row->set_para(nullptr);
+ row_infos.push_back(RowInfo());
+ RowInfo &ri = row_infos.back();
+ InitializeRowInfo(after_text_recognition, row, &ri);
+ } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
+ row.Next(RIL_TEXTLINE));
+
+ // If we're called before text recognition, we might not have
+ // tight block bounding boxes, so trim by the minimum on each side.
+ if (!row_infos.empty()) {
+ int min_lmargin = row_infos[0].pix_ldistance;
+ int min_rmargin = row_infos[0].pix_rdistance;
+ for (int i = 1; i < row_infos.size(); i++) {
+ if (row_infos[i].pix_ldistance < min_lmargin)
+ min_lmargin = row_infos[i].pix_ldistance;
+ if (row_infos[i].pix_rdistance < min_rmargin)
+ min_rmargin = row_infos[i].pix_rdistance;
+ }
+ if (min_lmargin > 0 || min_rmargin > 0) {
+ for (int i = 0; i < row_infos.size(); i++) {
+ row_infos[i].pix_ldistance -= min_lmargin;
+ row_infos[i].pix_rdistance -= min_rmargin;
+ }
+ }
+ }
+
+ // Run the paragraph detection algorithm.
+ GenericVector<PARA *> row_owners;
+ GenericVector<PARA *> the_paragraphs;
+ if (!is_image_block) {
+ DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
+ models);
+ } else {
+ row_owners.init_to_size(row_infos.size(), nullptr);
+ CanonicalizeDetectionResults(&row_owners, block->para_list());
+ }
+
+ // Now stitch in the row_owners into the rows.
+ row = *block_start;
+ for (int i = 0; i < row_owners.size(); i++) {
+ while (!row.PageResIt()->row())
+ row.Next(RIL_TEXTLINE);
+ row.PageResIt()->row()->row->set_para(row_owners[i]);
+ row.Next(RIL_TEXTLINE);
+ }
+}
+
+} // namespace
diff --git a/tesseract/src/ccmain/paragraphs.h b/tesseract/src/ccmain/paragraphs.h
new file mode 100644
index 00000000..edf9b8cc
--- /dev/null
+++ b/tesseract/src/ccmain/paragraphs.h
@@ -0,0 +1,110 @@
+/**********************************************************************
+ * File: paragraphs.h
+ * Description: Paragraph Detection data structures.
+ * Author: David Eger
+ * Created: 25 February 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_H_
+
+#include "rect.h" // for TBOX
+#include "strngs.h" // for STRING
+#include <list>
+
+namespace tesseract {
+
+class MutableIterator;
+class ParagraphModel;
+class PARA_LIST;
+struct PARA;
+
+template <typename T> class GenericVector;
+
// This structure captures all information needed about a text line for the
// purposes of paragraph detection.  It is meant to be exceedingly light-weight
// so that we can easily test paragraph detection independent of the rest of
// Tesseract.
class RowInfo {
 public:
  // Constant data derived from Tesseract output.
  STRING text;       // the full UTF-8 text of the line.
  bool ltr;          // whether the majority of the text is left-to-right
                     // TODO(eger) make this more fine-grained.

  bool has_leaders;  // does the line contain leader dots (.....)?
  bool has_drop_cap; // does the line have a drop cap?
  int pix_ldistance; // distance to the left pblock boundary in pixels
  int pix_rdistance; // distance to the right pblock boundary in pixels
  float pix_xheight; // guessed xheight for the line
  int average_interword_space; // average space between words in pixels.

  int num_words;     // number of recognized (non-empty) words on the line
  TBOX lword_box;    // bounding box of the leftmost word,
                     // in normalized (horiz text rows) space
  TBOX rword_box;    // bounding box of the rightmost word,
                     // in normalized (horiz text rows) space

  STRING lword_text; // the UTF-8 text of the leftmost werd
  STRING rword_text; // the UTF-8 text of the rightmost werd

  // The text of a paragraph typically starts with the start of an idea and
  // ends with the end of an idea.  Here we define paragraph as something that
  // may have a first line indent and a body indent which may be different.
  // Typical words that start an idea are:
  //   1. Words in western scripts that start with
  //      a capital letter, for example "The"
  //   2. Bulleted or numbered list items, for
  //      example "2."
  // Typical words which end an idea are words ending in punctuation marks. In
  // this vocabulary, each list item is represented as a paragraph.
  bool lword_indicates_list_item; // leftmost word looks like a list bullet/number
  bool lword_likely_starts_idea;  // leftmost word looks like an idea start
  bool lword_likely_ends_idea;    // leftmost word looks like an idea end

  bool rword_indicates_list_item; // rightmost word looks like a list bullet/number
  bool rword_likely_starts_idea;  // rightmost word looks like an idea start
  bool rword_likely_ends_idea;    // rightmost word looks like an idea end
};
+
+// Main entry point for Paragraph Detection Algorithm.
+//
+// Given a set of equally spaced textlines (described by row_infos),
+// Split them into paragraphs. See http://goto/paragraphstalk
+//
+// Output:
+// row_owners - one pointer for each row, to the paragraph it belongs to.
+// paragraphs - this is the actual list of PARA objects.
+// models - the list of paragraph models referenced by the PARA objects.
+// caller is responsible for deleting the models.
+TESS_API
+void DetectParagraphs(int debug_level,
+ std::vector<RowInfo> *row_infos,
+ GenericVector<PARA *> *row_owners,
+ PARA_LIST *paragraphs,
+ std::vector<ParagraphModel *> *models);
+
+// Given a MutableIterator to the start of a block, run DetectParagraphs on
+// that block and commit the results to the underlying ROW and BLOCK structs,
+// saving the ParagraphModels in models. Caller owns the models.
+// We use unicharset during the function to answer questions such as "is the
+// first letter of this word upper case?"
+TESS_API
+void DetectParagraphs(int debug_level,
+ bool after_text_recognition,
+ const MutableIterator *block_start,
+ std::vector<ParagraphModel *> *models);
+
+} // namespace
+
+#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
diff --git a/tesseract/src/ccmain/paragraphs_internal.h b/tesseract/src/ccmain/paragraphs_internal.h
new file mode 100644
index 00000000..be1e2c9b
--- /dev/null
+++ b/tesseract/src/ccmain/paragraphs_internal.h
@@ -0,0 +1,314 @@
+/**********************************************************************
+ * File: paragraphs_internal.h
+ * Description: Paragraph Detection internal data structures.
+ * Author: David Eger
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
+
+#include "paragraphs.h"
+#include <tesseract/publictypes.h> // for ParagraphJustification
+
+// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
+// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
+
+namespace tesseract {
+
+class UNICHARSET;
+class WERD_CHOICE;
+
+// Return whether the given word is likely to be a list item start word.
+TESS_API
+bool AsciiLikelyListItem(const STRING &word);
+
+// Return the first Unicode Codepoint from werd[pos].
+int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
+
+// Set right word attributes given either a unicharset and werd or a utf8
+// string.
+TESS_API
+void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+ const STRING &utf8,
+ bool *is_list, bool *starts_idea, bool *ends_idea);
+
+// Set left word attributes given either a unicharset and werd or a utf8 string.
+TESS_API
+void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
+ const STRING &utf8,
+ bool *is_list, bool *starts_idea, bool *ends_idea);
+
+// Role a single text line plays in paragraph structure. The enumerator
+// values are printable characters (presumably so hypotheses read well in
+// debug dumps -- confirm against the debug output code in paragraphs.cpp).
+enum LineType {
+  LT_START = 'S',     // First line of a paragraph.
+  LT_BODY = 'C',      // Continuation line of a paragraph.
+  LT_UNKNOWN = 'U',   // No clues.
+  LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
+};
+
+// The first paragraph in a page of body text is often un-indented.
+// This is a typographic convention which is common to indicate either that:
+// (1) The paragraph is the continuation of a previous paragraph, or
+// (2) The paragraph is the first paragraph in a chapter.
+//
+// I refer to such paragraphs as "crown"s, and the output of the paragraph
+// detection algorithm attempts to give them the same paragraph model as
+// the rest of the body text.
+//
+// Nonetheless, while building hypotheses, it is useful to mark the lines
+// of crown paragraphs temporarily as crowns, either aligned left or right.
+extern const ParagraphModel *kCrownLeft;
+extern const ParagraphModel *kCrownRight;
+
+// A model is "strong" when it is a real paragraph model: non-null and
+// not one of the crown sentinel markers declared above.
+inline bool StrongModel(const ParagraphModel *model) {
+  if (model == nullptr) return false;
+  return model != kCrownLeft && model != kCrownRight;
+}
+
+// A hypothesis about a single text line: which role (LineType) it plays
+// under which paragraph model. model is a non-owning pointer and may be
+// nullptr or one of the kCrown* sentinels.
+struct LineHypothesis {
+  LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
+  LineHypothesis(LineType line_type, const ParagraphModel *m)
+      : ty(line_type), model(m) {}
+
+  // The members are a plain enum and a non-owning pointer, so the
+  // compiler-generated copy operations are correct (rule of zero); the
+  // previous hand-written copy constructor and copy assignment operator
+  // were redundant and are replaced with explicit defaults.
+  LineHypothesis(const LineHypothesis &other) = default;
+  LineHypothesis &operator=(const LineHypothesis &other) = default;
+
+  bool operator==(const LineHypothesis &other) const {
+    return ty == other.ty && model == other.model;
+  }
+
+  LineType ty;
+  const ParagraphModel *model;
+};
+
+class ParagraphTheory; // Forward Declaration
+
+using SetOfModels = GenericVector<const ParagraphModel *>;
+
+// Row Scratch Registers are data generated by the paragraph detection
+// algorithm based on a RowInfo input.
+class RowScratchRegisters {
+ public:
+  // We presume row will outlive us.
+  void Init(const RowInfo &row);
+
+  // Line type under the union of all recorded hypotheses.
+  LineType GetLineType() const;
+
+  // Line type considering only hypotheses involving the given model.
+  LineType GetLineType(const ParagraphModel *model) const;
+
+  // Mark this as a start line type, sans model. This is useful for the
+  // initial marking of probable body lines or paragraph start lines.
+  void SetStartLine();
+
+  // Mark this as a body line type, sans model. This is useful for the
+  // initial marking of probable body lines or paragraph start lines.
+  void SetBodyLine();
+
+  // Record that this row fits as a paragraph start line in the given model.
+  void AddStartLine(const ParagraphModel *model);
+  // Record that this row fits as a paragraph body line in the given model.
+  void AddBodyLine(const ParagraphModel *model);
+
+  // Clear all hypotheses about this line.
+  void SetUnknown() { hypotheses_.truncate(0); }
+
+  // Append all hypotheses of strong models that match this row as a start.
+  void StartHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses of strong models matching this row.
+  void StrongHypotheses(SetOfModels *models) const;
+
+  // Append all hypotheses for this row.
+  void NonNullHypotheses(SetOfModels *models) const;
+
+  // Discard any hypotheses whose model is not in the given list.
+  void DiscardNonMatchingHypotheses(const SetOfModels &models);
+
+  // If we have only one hypothesis and that is that this line is a paragraph
+  // start line of a certain model, return that model. Else return nullptr.
+  const ParagraphModel *UniqueStartHypothesis() const;
+
+  // If we have only one hypothesis and that is that this line is a paragraph
+  // body line of a certain model, return that model. Else return nullptr.
+  const ParagraphModel *UniqueBodyHypothesis() const;
+
+  // Return the indentation for the side opposite of the aligned side.
+  // For unknown/centered justification the larger of the two is returned.
+  int OffsideIndent(tesseract::ParagraphJustification just) const {
+    switch (just) {
+      case tesseract::JUSTIFICATION_RIGHT: return lindent_;
+      case tesseract::JUSTIFICATION_LEFT: return rindent_;
+      default: return lindent_ > rindent_ ? lindent_ : rindent_;
+    }
+  }
+
+  // Return the indentation for the side the text is aligned to.
+  // For unknown/centered justification the larger of the two is returned.
+  int AlignsideIndent(tesseract::ParagraphJustification just) const {
+    switch (just) {
+      case tesseract::JUSTIFICATION_RIGHT: return rindent_;
+      case tesseract::JUSTIFICATION_LEFT: return lindent_;
+      default: return lindent_ > rindent_ ? lindent_ : rindent_;
+    }
+  }
+
+  // Append header fields to a vector of row headings.
+  static void AppendDebugHeaderFields(std::vector<STRING> *header);
+
+  // Append data for this row to a vector of debug strings.
+  void AppendDebugInfo(const ParagraphTheory &theory,
+                       std::vector<STRING> *dbg) const;
+
+  // Non-owning pointer to the input row; set by Init().
+  const RowInfo *ri_;
+
+  // These four constants form a horizontal box model for the white space
+  // on the edges of each line. At each point in the algorithm, the following
+  // shall hold:
+  //   ri_->pix_ldistance = lmargin_ + lindent_
+  //   ri_->pix_rdistance = rindent_ + rmargin_
+  int lmargin_;
+  int lindent_;
+  int rindent_;
+  int rmargin_;
+
+ private:
+  // Hypotheses of either LT_START or LT_BODY
+  GenericVector<LineHypothesis> hypotheses_;
+};
+
+// A collection of convenience functions for wrapping the set of
+// Paragraph Models we believe correctly model the paragraphs in the image.
+class ParagraphTheory {
+ public:
+  // We presume models will outlive us, and that models will take ownership
+  // of any ParagraphModel *'s we add.
+  explicit ParagraphTheory(std::vector<ParagraphModel *> *models)
+      : models_(models) {}
+  std::vector<ParagraphModel *> &models() { return *models_; }
+  const std::vector<ParagraphModel *> &models() const { return *models_; }
+
+  // Return an existing model if one that is Comparable() can be found.
+  // Else, allocate a new copy of model to save and return a pointer to it.
+  const ParagraphModel *AddModel(const ParagraphModel &model);
+
+  // Discard any models we've made that are not in the list of used models.
+  void DiscardUnusedModels(const SetOfModels &used_models);
+
+  // Return the set of all non-centered models.
+  void NonCenteredModels(SetOfModels *models);
+
+  // If any of the non-centered paragraph models we know about fit
+  // rows[start, end), return it. Else nullptr.
+  const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
+                             int start, int end) const;
+
+  // Position of model within models(); presumably a sentinel for
+  // "not found" -- confirm against the implementation in paragraphs.cpp.
+  int IndexOf(const ParagraphModel *model) const;
+
+ private:
+  // All models known to the theory; not owned (see constructor comment).
+  std::vector<ParagraphModel *> *models_;
+  // The subset of models() that this theory allocated itself; these are
+  // the candidates removed by DiscardUnusedModels().
+  GenericVector<ParagraphModel *> models_we_added_;
+};
+
+bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
+ int row, const ParagraphModel *model);
+bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
+ int row, const ParagraphModel *model);
+bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
+ int a, int b, const ParagraphModel *model);
+
+// A class for smearing Paragraph Model hypotheses to surrounding rows.
+// The idea here is that StrongEvidenceClassify first marks only exceedingly
+// obvious start and body rows and constructs models of them. Thereafter,
+// we may have left over unmarked lines (mostly end-of-paragraph lines) which
+// were too short to have much confidence about, but which fit the models we've
+// constructed perfectly and which we ought to mark. This class is used to
+// "smear" our models over the text.
+class ParagraphModelSmearer {
+ public:
+  // Operates on rows[row_start, row_end) using models from theory.
+  // Both pointers are non-owning; rows and theory must outlive us.
+  ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
+                        int row_start, int row_end,
+                        ParagraphTheory *theory);
+
+  // Smear forward paragraph models from existing row markings to subsequent
+  // text lines if they fit, and mark any thereafter still unmodeled rows
+  // with any model in the theory that fits them.
+  void Smear();
+
+ private:
+  // Record in open_models_ for rows [start_row, end_row) the list of models
+  // currently open at each row.
+  // A model is still open in a row if some previous row has said model as a
+  // start hypothesis, and all rows since (including this row) would fit as
+  // either a body or start line in that model.
+  void CalculateOpenModels(int row_start, int row_end);
+
+  // The +1 accounts for open_models_ covering one extra row before
+  // row_start_ (see the comment on open_models_ below).
+  SetOfModels &OpenModels(int row) {
+    return open_models_[row - row_start_ + 1];
+  }
+
+  ParagraphTheory *theory_;
+  GenericVector<RowScratchRegisters> *rows_;
+  int row_start_;
+  int row_end_;
+
+  // open_models_ corresponds to rows[start_row_ - 1, end_row_]
+  //
+  // open_models_:  Contains models which there was an active (open) paragraph
+  //                as of the previous line and for which the left and right
+  //                indents admit the possibility that this text line continues
+  //                to fit the same model.
+  // TODO(eger): Think about whether we can get rid of "Open" models and just
+  //             use the current hypotheses on RowScratchRegisters.
+  std::vector<SetOfModels> open_models_;
+};
+
+// Clear all hypotheses about lines [start, end) and reset the margins to the
+// percentile (0..100) value of the left and right row edges for this run of
+// rows.
+void RecomputeMarginsAndClearHypotheses(
+ GenericVector<RowScratchRegisters> *rows, int start, int end,
+ int percentile);
+
+// Return the median inter-word space in rows[row_start, row_end).
+int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
+ int row_start, int row_end);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (knowing which way the text is aligned and read).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+ const RowScratchRegisters &after,
+ tesseract::ParagraphJustification justification);
+
+// Return whether the first word on the after line can fit in the space at
+// the end of the before line (not knowing the text alignment).
+bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
+ const RowScratchRegisters &after);
+
+// Do rows[start, end) form a single instance of the given paragraph model?
+bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
+ int start, int end, const ParagraphModel *model);
+
+// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
+// normalize each row_owner to point to an actual PARA, and output the
+// paragraphs in order onto paragraphs.
+void CanonicalizeDetectionResults(
+ GenericVector<PARA *> *row_owners,
+ PARA_LIST *paragraphs);
+
+} // namespace
+
+#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
diff --git a/tesseract/src/ccmain/paramsd.cpp b/tesseract/src/ccmain/paramsd.cpp
new file mode 100644
index 00000000..9c8b8990
--- /dev/null
+++ b/tesseract/src/ccmain/paramsd.cpp
@@ -0,0 +1,365 @@
+///////////////////////////////////////////////////////////////////////
+// File: paramsd.cpp
+// Description: Tesseract parameter Editor
+// Author: Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+// The parameters editor is used to edit all the parameters used within
+// tesseract from the ui.
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#ifndef GRAPHICS_DISABLED
+
+#include "paramsd.h"
+#include "params.h" // for ParamsVectors, StringParam, BoolParam
+#include "scrollview.h" // for SVEvent, ScrollView, SVET_POPUP
+#include "svmnode.h" // for SVMenuNode
+#include "tesseractclass.h" // for Tesseract
+
+#include "genericvector.h" // for GenericVector
+
+#include <cstdio>          // for fclose, fopen, fprintf, sprintf, FILE
+#include <cstdlib>         // for atoi
+#include <cstring>         // for strcmp, strcspn, strlen, strncpy
+#include <locale>          // for std::locale::classic
+#include <map>             // for map, _Rb_tree_iterator, map<>::iterator
+#include <memory>          // for unique_ptr
+#include <sstream>         // for std::stringstream
+#include <string>          // for std::string
+#include <utility>         // for pair
+
+namespace tesseract {
+
+#define VARDIR "configs/" /*parameters files */
+#define MAX_ITEMS_IN_SUBMENU 30
+
+// The following variables should remain static globals, since they
+// are used by debug editor, which uses a single Tesseract instance.
+//
+// Contains the mappings from unique VC ids to their actual pointers.
+static std::map<int, ParamContent*> vcMap;
+static int nrParams = 0;
+static int writeCommands[2];
+
+ELISTIZE(ParamContent)
+
+// Constructor for string parameters. Assigns the next sequential unique
+// id and registers this object in the global vcMap so it can later be
+// found by GetParamContentById().
+ParamContent::ParamContent(tesseract::StringParam* it) {
+  my_id_ = nrParams;
+  nrParams++;
+  param_type_ = VT_STRING;
+  sIt = it;
+  vcMap[my_id_] = this;
+}
+// Constructor for integer parameters; see the string overload above for
+// the id/registration scheme.
+ParamContent::ParamContent(tesseract::IntParam* it) {
+  my_id_ = nrParams;
+  nrParams++;
+  param_type_ = VT_INTEGER;
+  iIt = it;
+  vcMap[my_id_] = this;
+}
+// Constructor for boolean parameters.
+ParamContent::ParamContent(tesseract::BoolParam* it) {
+  my_id_ = nrParams;
+  nrParams++;
+  param_type_ = VT_BOOLEAN;
+  bIt = it;
+  vcMap[my_id_] = this;
+}
+// Constructor for double parameters.
+ParamContent::ParamContent(tesseract::DoubleParam* it) {
+  my_id_ = nrParams;
+  nrParams++;
+  param_type_ = VT_DOUBLE;
+  dIt = it;
+  vcMap[my_id_] = this;
+}
+
+// Gets a VC object identified by its ID, or nullptr if no parameter with
+// that ID has been registered. Uses find() so that looking up an unknown
+// ID does not default-insert a spurious null entry into vcMap (as the
+// previous operator[] lookup did).
+ParamContent* ParamContent::GetParamContentById(int id) {
+  auto it = vcMap.find(id);
+  return it == vcMap.end() ? nullptr : it->second;
+}
+
+// Copy the first N words from the source string to the target string.
+// Words are delimited by "_".
+// The target t must have room for at least strlen(s) + 1 characters.
+void ParamsEditor::GetFirstWords(
+                   const char *s,  // source string
+                   int n,          // number of words
+                   char *t         // target string
+                  ) {
+  int full_length = strlen(s);
+  int reqd_len = 0;              // No. of chars required
+  const char *next_word = s;
+
+  while ((n > 0) && reqd_len < full_length) {
+    reqd_len += strcspn(next_word, "_") + 1;
+    // Resume scanning right after the delimiter just consumed. The
+    // previous code did next_word += reqd_len, adding the *cumulative*
+    // prefix length on every iteration and thus skipping too far (and
+    // potentially past the end of s) whenever n >= 2.
+    next_word = s + reqd_len;
+    n--;
+  }
+  strncpy(t, s, reqd_len);
+  t[reqd_len] = '\0';  // ensure null terminal
+}
+
+// Getter for the name of the underlying parameter, dispatched on the
+// active union member. Returns an error marker for an invalid type tag.
+const char* ParamContent::GetName() const {
+  switch (param_type_) {
+    case VT_INTEGER: return iIt->name_str();
+    case VT_BOOLEAN: return bIt->name_str();
+    case VT_DOUBLE:  return dIt->name_str();
+    case VT_STRING:  return sIt->name_str();
+    default:         return "ERROR: ParamContent::GetName()";
+  }
+}
+
+// Getter for the description of the underlying parameter, dispatched on
+// the active union member. Returns nullptr for an invalid type tag.
+const char* ParamContent::GetDescription() const {
+  switch (param_type_) {
+    case VT_INTEGER: return iIt->info_str();
+    case VT_BOOLEAN: return bIt->info_str();
+    case VT_DOUBLE:  return dIt->info_str();
+    case VT_STRING:  return sIt->info_str();
+    default:         return nullptr;
+  }
+}
+
+// Getter for the value, rendered as a STRING.
+// Integers and booleans are rendered as decimal integers (booleans via
+// integer conversion, so 0/1); doubles use add_str_double formatting; a
+// string parameter with a null underlying buffer yields the literal "Null".
+STRING ParamContent::GetValue() const {
+  STRING result;
+  if (param_type_ == VT_INTEGER) {
+    result.add_str_int("", *iIt);
+  } else if (param_type_ == VT_BOOLEAN) {
+    result.add_str_int("", *bIt);
+  } else if (param_type_ == VT_DOUBLE) {
+    result.add_str_double("", *dIt);
+  } else if (param_type_ == VT_STRING) {
+    // Guard against a null char buffer before copying the string value.
+    if (STRING(*(sIt)).c_str() != nullptr) {
+      result = sIt->c_str();
+    } else {
+      result = "Null";
+    }
+  }
+  return result;
+}
+
+// Setter for the value: parses val according to the parameter's type and
+// marks this object changed_ so WriteParams(..., true) will include it.
+// TODO (wanke) Test if the values actually are properly converted.
+// (Quickly visible impacts?)
+void ParamContent::SetValue(const char* val) {
+  changed_ = true;
+  if (param_type_ == VT_INTEGER) {
+    iIt->set_value(atoi(val));
+  } else if (param_type_ == VT_BOOLEAN) {
+    // atoi: any nonzero integer text counts as true.
+    bIt->set_value(atoi(val));
+  } else if (param_type_ == VT_DOUBLE) {
+    std::stringstream stream(val);
+    // Use "C" locale for reading double value.
+    stream.imbue(std::locale::classic());
+    double d = 0;
+    stream >> d;
+    dIt->set_value(d);
+  } else if (param_type_ == VT_STRING) {
+    sIt->set_value(val);
+  }
+}
+
+// Gets the up to the first 3 prefixes from s (split by _).
+// For example, tesseract_foo_bar will be split into tesseract,foo and bar.
+// NOTE(review): the scratch buffer is a fixed 1024 bytes; this assumes no
+// parameter name is longer than 1023 characters -- confirm.
+void ParamsEditor::GetPrefixes(const char* s, STRING* level_one,
+                               STRING* level_two,
+                               STRING* level_three) {
+  std::unique_ptr<char[]> p(new char[1024]);
+  GetFirstWords(s, 1, p.get());
+  *level_one = p.get();
+  GetFirstWords(s, 2, p.get());
+  *level_two = p.get();
+  GetFirstWords(s, 3, p.get());
+  *level_three = p.get();
+}
+
+// Compare two VC objects by their name; qsort-style comparator used to
+// sort the parameter list alphabetically.
+int ParamContent::Compare(const void* v1, const void* v2) {
+  const auto* lhs = *static_cast<const ParamContent* const*>(v1);
+  const auto* rhs = *static_cast<const ParamContent* const*>(v2);
+  return strcmp(lhs->GetName(), rhs->GetName());
+}
+
+// Find all editable parameters used within tesseract and create a
+// SVMenuNode tree from it.
+// Menu layout: parameters whose first '_'-separated prefix is unique go
+// into an "OTHER" submenu; shared prefixes become submenus (and, for
+// crowded prefixes, sub-submenus on the second prefix).
+// TODO (wanke): This is actually sort of hackish.
+SVMenuNode* ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
+  auto* mr = new SVMenuNode();
+  ParamContent_LIST vclist;
+  ParamContent_IT vc_it(&vclist);
+  // Amount counts the number of entries for a specific prefix.
+  // Keyed by std::string: the previous std::map<const char*, int> compared
+  // pointer identity of c_str() results pointing into destroyed local
+  // STRING temporaries, so the counts were unreliable.
+  std::map<std::string, int> amount;
+
+  // Add all parameters (global, plus instance-specific if present) to a
+  // list.
+  int num_iterations = (tess->params() == nullptr) ? 1 : 2;
+  for (int v = 0; v < num_iterations; ++v) {
+    tesseract::ParamsVectors *vec = (v == 0) ? GlobalParams() : tess->params();
+    for (int i = 0; i < vec->int_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->int_params[i]));
+    }
+    for (int i = 0; i < vec->bool_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->bool_params[i]));
+    }
+    for (int i = 0; i < vec->string_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->string_params[i]));
+    }
+    for (int i = 0; i < vec->double_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->double_params[i]));
+    }
+  }
+
+  // Count the # of entries starting with a specific prefix.
+  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
+    ParamContent* vc = vc_it.data();
+    STRING tag;
+    STRING tag2;
+    STRING tag3;
+
+    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
+    amount[tag.c_str()]++;
+    amount[tag2.c_str()]++;
+    amount[tag3.c_str()]++;
+  }
+
+  vclist.sort(ParamContent::Compare);  // Sort the list alphabetically.
+
+  SVMenuNode* other = mr->AddChild("OTHER");
+
+  // Go through the list again and this time create the menu structure.
+  vc_it.move_to_first();
+  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
+    ParamContent* vc = vc_it.data();
+    STRING tag;
+    STRING tag2;
+    STRING tag3;
+    GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
+
+    if (amount[tag.c_str()] == 1) {
+      // Unique first prefix: no submenu of its own.
+      other->AddChild(vc->GetName(), vc->GetId(), vc->GetValue().c_str(),
+                      vc->GetDescription());
+    } else {  // More than one would use this submenu -> create submenu.
+      SVMenuNode* sv = mr->AddChild(tag.c_str());
+      if ((amount[tag.c_str()] <= MAX_ITEMS_IN_SUBMENU) ||
+          (amount[tag2.c_str()] <= 1)) {
+        sv->AddChild(vc->GetName(), vc->GetId(),
+                     vc->GetValue().c_str(), vc->GetDescription());
+      } else {  // Make subsubmenus.
+        SVMenuNode* sv2 = sv->AddChild(tag2.c_str());
+        sv2->AddChild(vc->GetName(), vc->GetId(),
+                      vc->GetValue().c_str(), vc->GetDescription());
+      }
+    }
+  }
+  return mr;
+}
+
+// Event listener. Waits for SVET_POPUP events and processes them.
+// A popup is either one of the two "write config file" commands (matched
+// against writeCommands) or an edit of a single parameter, in which case
+// the command id identifies the ParamContent to update.
+void ParamsEditor::Notify(const SVEvent* sve) {
+  if (sve->type == SVET_POPUP) {  // only catch SVET_POPUP!
+    char* param = sve->parameter;
+    if (sve->command_id == writeCommands[0]) {
+      WriteParams(param, false);  // Write all parameters.
+    } else if (sve->command_id == writeCommands[1]) {
+      WriteParams(param, true);  // Write changed parameters only.
+    } else {
+      ParamContent* vc = ParamContent::GetParamContentById(
+          sve->command_id);
+      vc->SetValue(param);
+      sv_window_->AddMessage("Setting %s to %s",
+                             vc->GetName(), vc->GetValue().c_str());
+    }
+  }
+}
+
+// Integrate the parameters editor as popupmenu into the existing scrollview
+// window (usually the pg editor). If sv == null, create a new empty
+// window and attach the parameters editor to that window (ugly).
+ParamsEditor::ParamsEditor(tesseract::Tesseract* tess,
+                           ScrollView* sv) {
+  if (sv == nullptr) {
+    const char* name = "ParamEditorMAIN";
+    sv = new ScrollView(name, 1, 1, 200, 200, 300, 200);
+  }
+
+  sv_window_ = sv;
+
+  //Only one event handler per window.
+  //sv->AddEventHandler((SVEventHandler*) this);
+  // NOTE(review): the event-handler registration above is commented out;
+  // without it this editor's Notify() is never called directly -- confirm
+  // whether the owning window (pg editor) forwards events instead.
+
+  SVMenuNode* svMenuRoot = BuildListOfAllLeaves(tess);
+
+  STRING paramfile;
+  paramfile = tess->datadir;
+  paramfile += VARDIR;  // parameters dir
+  paramfile += "edited";  // actual name
+
+  SVMenuNode* std_menu = svMenuRoot->AddChild ("Build Config File");
+
+  // Command ids above nrParams cannot collide with parameter ids, so
+  // Notify() can tell the write commands apart from parameter edits.
+  writeCommands[0] = nrParams+1;
+  std_menu->AddChild("All Parameters", writeCommands[0],
+                     paramfile.c_str(), "Config file name?");
+
+  writeCommands[1] = nrParams+2;
+  std_menu->AddChild ("changed_ Parameters Only", writeCommands[1],
+                      paramfile.c_str(), "Config file name?");
+
+  svMenuRoot->BuildMenu(sv, false);
+}
+
+
+// Write all (changed_) parameters to a config file as
+// "name value # description" lines. If the file already exists, the user
+// is asked via a ScrollView dialog whether to overwrite it. With
+// changes_only, only parameters modified through the editor are written.
+void ParamsEditor::WriteParams(char *filename,
+                               bool changes_only) {
+  char msg_str[255];
+  // If the file already exists, ask before clobbering it.
+  FILE* fp = fopen(filename, "rb");
+  if (fp != nullptr) {
+    fclose(fp);
+    // snprintf (not sprintf) so an overlong filename cannot overflow
+    // msg_str.
+    snprintf(msg_str, sizeof(msg_str),
+             "Overwrite file %s? (Y/N)", filename);
+    int a = sv_window_->ShowYesNoDialog(msg_str);
+    if (a == 'n') {
+      return;  // don't write
+    }
+  }
+
+  fp = fopen(filename, "wb");  // can we write to it?
+  if (fp == nullptr) {
+    sv_window_->AddMessage("Can't write to file %s", filename);
+    return;
+  }
+  for (auto& iter : vcMap) {
+    ParamContent* cur = iter.second;
+    if (!changes_only || cur->HasChanged()) {
+      fprintf(fp, "%-25s %-12s # %s\n",
+              cur->GetName(), cur->GetValue().c_str(), cur->GetDescription());
+    }
+  }
+  fclose(fp);
+}
+
+} // namespace tesseract
+
+#endif // !GRAPHICS_DISABLED
diff --git a/tesseract/src/ccmain/paramsd.h b/tesseract/src/ccmain/paramsd.h
new file mode 100644
index 00000000..c8019c1c
--- /dev/null
+++ b/tesseract/src/ccmain/paramsd.h
@@ -0,0 +1,134 @@
+///////////////////////////////////////////////////////////////////////
+// File: paramsd.h
+// Description: Tesseract parameter editor
+// Author: Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+// Tesseract parameter editor is used to edit all the parameters used
+// within tesseract from the ui.
+#ifndef TESSERACT_CCMAIN_PARAMSD_H_
+#define TESSERACT_CCMAIN_PARAMSD_H_
+
+#ifndef GRAPHICS_DISABLED
+
+#include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
+#include "scrollview.h" // for ScrollView (ptr only), SVEvent (ptr only)
+#include "strngs.h" // for STRING
+
+namespace tesseract {
+
+class SVMenuNode;
+
+class BoolParam;
+class DoubleParam;
+class IntParam;
+class StringParam;
+class Tesseract;
+
+// A list of all possible parameter types used. Each value selects the
+// corresponding active member of the union in ParamContent below.
+enum ParamType {
+  VT_INTEGER,   // tesseract::IntParam
+  VT_BOOLEAN,   // tesseract::BoolParam
+  VT_STRING,    // tesseract::StringParam
+  VT_DOUBLE     // tesseract::DoubleParam
+};
+
+// A rather hackish helper structure which can take any kind of parameter input
+// (defined by ParamType) and do a couple of common operations on them, like
+// comparisons or getting its value. It is used in the context of the
+// ParamsEditor as a bridge from the internal tesseract parameters to the
+// ones displayed by the ScrollView server.
+class ParamContent : public ELIST_LINK {
+ public:
+  // Compare two VC objects by their name.
+  static int Compare(const void* v1, const void* v2);
+
+  // Gets a VC object identified by its ID.
+  static ParamContent* GetParamContentById(int id);
+
+  // Constructors for the various ParamTypes.
+  ParamContent() = default;
+  explicit ParamContent(tesseract::StringParam* it);
+  explicit ParamContent(tesseract::IntParam* it);
+  explicit ParamContent(tesseract::BoolParam* it);
+  explicit ParamContent(tesseract::DoubleParam* it);
+
+
+  // Getters and Setters.
+  void SetValue(const char* val);
+  STRING GetValue() const;
+  const char* GetName() const;
+  const char* GetDescription() const;
+
+  int GetId() { return my_id_; }
+  bool HasChanged() { return changed_; }
+
+ private:
+  // The unique ID of this VC object. -1 until one of the typed
+  // constructors registers the object (the default constructor exists
+  // only for list bookkeeping and previously left this uninitialized).
+  int my_id_ = -1;
+  // Whether the parameter was changed_ and thus needs to be rewritten.
+  bool changed_ = false;
+  // The actual ParamType of this VC object; selects the active union
+  // member below.
+  ParamType param_type_ = VT_STRING;
+
+  // Non-owning pointer to the underlying parameter, discriminated by
+  // param_type_. The initializer gives default-constructed objects a
+  // well-defined (null) value.
+  union {
+    tesseract::StringParam* sIt = nullptr;
+    tesseract::IntParam* iIt;
+    tesseract::BoolParam* bIt;
+    tesseract::DoubleParam* dIt;
+  };
+};
+
+ELISTIZEH(ParamContent)
+
+// The parameters editor enables the user to edit all the parameters used within
+// tesseract. It can be invoked on its own, but is supposed to be invoked by
+// the program editor.
+class ParamsEditor : public SVEventHandler {
+ public:
+  // Integrate the parameters editor as popupmenu into the existing scrollview
+  // window (usually the pg editor). If sv == null, create a new empty
+  // window and attach the parameter editor to that window (ugly).
+  explicit ParamsEditor(tesseract::Tesseract*, ScrollView* sv = nullptr);
+
+  // Event listener. Waits for SVET_POPUP events and processes them.
+  void Notify(const SVEvent* sve) override;
+
+ private:
+  // Gets the up to the first 3 prefixes from s (split by _).
+  // For example, tesseract_foo_bar will be split into tesseract,foo and bar.
+  void GetPrefixes(const char* s, STRING* level_one,
+                   STRING* level_two, STRING* level_three);
+
+  // Gets the first n words (split by _) and puts them in t.
+  // For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
+  void GetFirstWords(const char *s,  // source string
+                     int n,          // number of words
+                     char *t);       // target string
+
+  // Find all editable parameters used within tesseract and create a
+  // SVMenuNode tree from it.
+  SVMenuNode *BuildListOfAllLeaves(tesseract::Tesseract *tess);
+
+  // Write all (changed_) parameters to a config file.
+  void WriteParams(char* filename, bool changes_only);
+
+  // The ScrollView window this editor's popup menu is attached to.
+  ScrollView* sv_window_;
+};
+
+} // namespace tesseract
+
+#endif // !GRAPHICS_DISABLED
+#endif // TESSERACT_CCMAIN_PARAMSD_H_
diff --git a/tesseract/src/ccmain/pgedit.cpp b/tesseract/src/ccmain/pgedit.cpp
new file mode 100644
index 00000000..b00b5f64
--- /dev/null
+++ b/tesseract/src/ccmain/pgedit.cpp
@@ -0,0 +1,981 @@
+/**********************************************************************
+ * File: pgedit.cpp (Formerly pgeditor.c)
+ * Description: Page structure file editor
+ * Author: Phil Cheatle
+ *
+ *(C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0(the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http:// www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "pgedit.h"
+
+#include "blread.h"
+#include "control.h"
+#include "paramsd.h"
+#include "pageres.h"
+#include "tordmain.h"
+#include "scrollview.h"
+#include "svmnode.h"
+#include "statistc.h"
+#include "tesseractclass.h"
+#include "werdit.h"
+
#include <cctype>
#include <cmath>
#include <cstdio>
+
+#ifndef GRAPHICS_DISABLED
+namespace tesseract {
+#define ASC_HEIGHT (2 * kBlnBaselineOffset + kBlnXHeight)
+#define X_HEIGHT (kBlnBaselineOffset + kBlnXHeight)
+#define BL_HEIGHT kBlnBaselineOffset
+#define DESC_HEIGHT 0
+
// Identifiers for every entry in the page editor's popup menu.  Dispatched
// by process_cmd_win_event(); the "mode" commands are then consumed by
// process_image_event() when the user clicks/selects in the image window.
enum CMD_EVENTS
{
  NULL_CMD_EVENT,
  CHANGE_DISP_CMD_EVENT,
  DUMP_WERD_CMD_EVENT,
  SHOW_POINT_CMD_EVENT,
  SHOW_BLN_WERD_CMD_EVENT,
  DEBUG_WERD_CMD_EVENT,
  BLAMER_CMD_EVENT,
  BOUNDING_BOX_CMD_EVENT,
  CORRECT_TEXT_CMD_EVENT,
  POLYGONAL_CMD_EVENT,
  BL_NORM_CMD_EVENT,
  BITMAP_CMD_EVENT,
  IMAGE_CMD_EVENT,
  BLOCKS_CMD_EVENT,
  BASELINES_CMD_EVENT,
  UNIFORM_DISP_CMD_EVENT,
  REFRESH_CMD_EVENT,
  QUIT_CMD_EVENT,
  RECOG_WERDS,
  RECOG_PSEUDO,
  SHOW_BLOB_FEATURES,
  SHOW_SUBSCRIPT_CMD_EVENT,
  SHOW_SUPERSCRIPT_CMD_EVENT,
  SHOW_ITALIC_CMD_EVENT,
  SHOW_BOLD_CMD_EVENT,
  SHOW_UNDERLINE_CMD_EVENT,
  SHOW_FIXEDPITCH_CMD_EVENT,
  SHOW_SERIF_CMD_EVENT,
  SHOW_SMALLCAPS_CMD_EVENT,
  SHOW_DROPCAPS_CMD_EVENT,
};

// Word coloration used by word_display(): CM_RAINBOW is the normal
// per-blob rainbow display; every other mode outlines blobs in red when
// they carry the corresponding attribute (subscript, bold, ...), green
// otherwise.
enum ColorationMode {
  CM_RAINBOW,
  CM_SUBSCRIPT,
  CM_SUPERSCRIPT,
  CM_ITALIC,
  CM_BOLD,
  CM_UNDERLINE,
  CM_FIXEDPITCH,
  CM_SERIF,
  CM_SMALLCAPS,
  CM_DROPCAPS
};
+
+/*
+ *
+ * Some global data
+ *
+ */
+
// Main editor window; (re)created by build_image_window().
static ScrollView* image_win;
// Parameters editor attached to image_win; created in pgeditor_main().
static ParamsEditor* pe;
// True while the editor event loop should keep running; cleared on SVET_EXIT.
static bool stillRunning = false;

static ScrollView* bln_word_window = nullptr; // baseline norm words

static CMD_EVENTS mode = CHANGE_DISP_CMD_EVENT; // selected words op

static bool recog_done = false; // recog_all_words was called

// These variables should remain global, since they are only used for the
// debug mode (in which only a single Tesseract thread/instance will exist).
static BITS16 word_display_mode;
static ColorationMode color_mode = CM_RAINBOW;
static bool display_image = false;
static bool display_blocks = false;
static bool display_baselines = false;

// Page currently being edited; set by pgeditor_main().
static PAGE_RES *current_page_res = nullptr;

STRING_VAR(editor_image_win_name, "EditorImage",
           "Editor image window name");
INT_VAR(editor_image_xpos, 590, "Editor image X Pos");
INT_VAR(editor_image_ypos, 10, "Editor image Y Pos");
// File-local: only build_image_window() needs the menu-bar allowance.
static INT_VAR(editor_image_menuheight, 50, "Add to image height for menu bar");
INT_VAR(editor_image_word_bb_color, ScrollView::BLUE,
        "Word bounding box colour");
INT_VAR(editor_image_blob_bb_color, ScrollView::YELLOW,
        "Blob bounding box colour");
INT_VAR(editor_image_text_color, ScrollView::WHITE,
        "Correct text colour");

STRING_VAR(editor_dbwin_name, "EditorDBWin",
           "Editor debug window name");
INT_VAR(editor_dbwin_xpos, 50, "Editor debug window X Pos");
INT_VAR(editor_dbwin_ypos, 500, "Editor debug window Y Pos");
INT_VAR(editor_dbwin_height, 24, "Editor debug window height");
INT_VAR(editor_dbwin_width, 80, "Editor debug window width");

STRING_VAR(editor_word_name, "BlnWords", "BL normalized word window");
INT_VAR(editor_word_xpos, 60, "Word window X Pos");
INT_VAR(editor_word_ypos, 510, "Word window Y Pos");
INT_VAR(editor_word_height, 240, "Word window height");
INT_VAR(editor_word_width, 655, "Word window width");
+
+/**
+ * show_point()
+ *
+ * Show coords of point, blob bounding box, word bounding box and offset from
+ * row baseline
+ */
+
+static void show_point(PAGE_RES* page_res, float x, float y) {
+ FCOORD pt(x, y);
+ PAGE_RES_IT pr_it(page_res);
+
+ const int kBufsize = 512;
+ char msg[kBufsize];
+ char *msg_ptr = msg;
+
+ msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);
+
+ for (WERD_RES* word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
+ if (pr_it.row() != pr_it.prev_row() &&
+ pr_it.row()->row->bounding_box().contains(pt)) {
+ msg_ptr += sprintf(msg_ptr, "BL(x)=%0.3f ",
+ pr_it.row()->row->base_line(x));
+ }
+ if (word->word->bounding_box().contains(pt)) {
+ TBOX box = word->word->bounding_box();
+ msg_ptr += sprintf(msg_ptr, "Wd(%d, %d)/(%d, %d) ",
+ box.left(), box.bottom(),
+ box.right(), box.top());
+ C_BLOB_IT cblob_it(word->word->cblob_list());
+ for (cblob_it.mark_cycle_pt();
+ !cblob_it.cycled_list();
+ cblob_it.forward()) {
+ C_BLOB* cblob = cblob_it.data();
+ box = cblob->bounding_box();
+ if (box.contains(pt)) {
+ msg_ptr += sprintf(msg_ptr,
+ "CBlb(%d, %d)/(%d, %d) ",
+ box.left(), box.bottom(),
+ box.right(), box.top());
+ }
+ }
+ }
+ }
+ image_win->AddMessage(msg);
+}
+
+/**
+ * pgeditor_msg()
+ *
+ * Display a message - in the command window if there is one, or to stdout
+ */
+
+static void pgeditor_msg( // message display
+ const char *msg) {
+ image_win->AddMessage(msg);
+}
+
+class BlnEventHandler : public SVEventHandler {
+ public:
+ void Notify(const SVEvent* sv_event) override {
+ if (sv_event->type == SVET_DESTROY)
+ bln_word_window = nullptr;
+ else if (sv_event->type == SVET_CLICK)
+ show_point(current_page_res, sv_event->x, sv_event->y);
+ }
+};
+
+/**
+ * bln_word_window_handle()
+ *
+ * @return a WINDOW for the word window, creating it if necessary
+ */
+static ScrollView* bln_word_window_handle() { // return handle
+ // not opened yet
+ if (bln_word_window == nullptr) {
+ pgeditor_msg("Creating BLN word window...");
+ bln_word_window = new ScrollView(editor_word_name.c_str(),
+ editor_word_xpos, editor_word_ypos, editor_word_width,
+ editor_word_height, 4000, 4000, true);
+ auto* a = new BlnEventHandler();
+ bln_word_window->AddEventHandler(a);
+ pgeditor_msg("Creating BLN word window...Done");
+ }
+ return bln_word_window;
+}
+
+/**
+ * build_image_window()
+ *
+ * Destroy the existing image window if there is one. Work out how big the
+ * new window needs to be. Create it and re-display.
+ */
+
+static void build_image_window(int width, int height) {
+ delete image_win;
+ image_win = new ScrollView(editor_image_win_name.c_str(),
+ editor_image_xpos, editor_image_ypos,
+ width + 1,
+ height + editor_image_menuheight + 1,
+ width,
+ height,
+ true);
+}
+
+/**
+ * display_bln_lines()
+ *
+ * Display normalized baseline, x-height, ascender limit and descender limit
+ */
+
+static void display_bln_lines(ScrollView* window, ScrollView::Color colour,
+ float scale_factor, float y_offset,
+ float minx, float maxx) {
+ window->Pen(colour);
+ window->Line(minx, y_offset + scale_factor * DESC_HEIGHT,
+ maxx, y_offset + scale_factor * DESC_HEIGHT);
+ window->Line(minx, y_offset + scale_factor * BL_HEIGHT,
+ maxx, y_offset + scale_factor * BL_HEIGHT);
+ window->Line(minx, y_offset + scale_factor * X_HEIGHT,
+ maxx, y_offset + scale_factor * X_HEIGHT);
+ window->Line(minx, y_offset + scale_factor * ASC_HEIGHT,
+ maxx, y_offset + scale_factor * ASC_HEIGHT);
+}
+
+/**
+ * notify()
+ *
+ * Event handler that processes incoming events, either forwarding
+ * them to process_cmd_win_event or process_image_event.
+ *
+ */
+
+void PGEventHandler::Notify(const SVEvent* event) {
+ char myval = '0';
+ if (event->type == SVET_POPUP) {
+ pe->Notify(event);
+ } // These are handled by ParamsEditor
+ else if (event->type == SVET_EXIT) { stillRunning = false; }
+ else if (event->type == SVET_MENU) {
+ if (strcmp(event->parameter, "true") == 0) { myval = 'T'; }
+ else if (strcmp(event->parameter, "false") == 0) { myval = 'F'; }
+ tess_->process_cmd_win_event(event->command_id, &myval);
+ }
+ else {
+ tess_->process_image_event(*event);
+ }
+}
+
+/**
+ * build_menu()
+ *
+ * Construct the menu tree used by the command window
+ */
+SVMenuNode *Tesseract::build_menu_new() {
+ SVMenuNode* parent_menu;
+ auto* root_menu_item = new SVMenuNode();
+
+ SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
+
+ modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
+ modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
+ modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
+ modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
+ modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
+ modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
+ modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
+ modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
+
+ parent_menu = root_menu_item->AddChild("DISPLAY");
+
+ parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, false);
+ parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, false);
+ parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, false);
+ parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, false);
+ parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, false);
+ parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, true);
+ parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
+ parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
+ parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
+ parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
+ parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
+ parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
+ parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
+ parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
+ parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
+
+
+ parent_menu = root_menu_item->AddChild("OTHER");
+
+ parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
+ parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, false);
+ parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, false);
+ parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, false);
+ parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
+ parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
+
+ return root_menu_item;
+}
+
+/**
+ * do_re_display()
+ *
+ * Redisplay page
+ */
+void Tesseract::do_re_display(
+ bool (tesseract::Tesseract::* word_painter)(PAGE_RES_IT* pr_it)) {
+ int block_count = 1;
+
+ image_win->Clear();
+ if (display_image) {
+ image_win->Image(pix_binary_, 0, 0);
+ }
+
+ image_win->Brush(ScrollView::NONE);
+ PAGE_RES_IT pr_it(current_page_res);
+ for (WERD_RES* word = pr_it.word(); word != nullptr; word = pr_it.forward()) {
+ (this->*word_painter)(&pr_it);
+ if (display_baselines && pr_it.row() != pr_it.prev_row())
+ pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
+ if (display_blocks && pr_it.block() != pr_it.prev_block())
+ pr_it.block()->block->pdblk.plot(image_win, block_count++, ScrollView::RED);
+ }
+ image_win->Update();
+}
+
+/**
+ * pgeditor_main()
+ *
+ * Top level editor operation:
+ * Setup a new window and an according event handler
+ *
+ */
+
+void Tesseract::pgeditor_main(int width, int height, PAGE_RES *page_res) {
+ current_page_res = page_res;
+ if (current_page_res->block_res_list.empty())
+ return;
+
+ recog_done = false;
+ stillRunning = true;
+
+ build_image_window(width, height);
+ word_display_mode.set(DF_EDGE_STEP);
+ do_re_display(&tesseract::Tesseract::word_set_display);
+#ifndef GRAPHICS_DISABLED
+ pe = new ParamsEditor(this, image_win);
+#endif
+ PGEventHandler pgEventHandler(this);
+
+ image_win->AddEventHandler(&pgEventHandler);
+ image_win->AddMessageBox();
+
+ SVMenuNode* svMenuRoot = build_menu_new();
+
+ svMenuRoot->BuildMenu(image_win);
+ image_win->SetVisible(true);
+
+ image_win->AwaitEvent(SVET_DESTROY);
+ image_win->AddEventHandler(nullptr);
+}
+
+/**
+ * process_cmd_win_event()
+ *
+ * Process a command returned from the command window
+ * (Just call the appropriate command handler)
+ */
+
+bool Tesseract::process_cmd_win_event( // UI command semantics
+ int32_t cmd_event, // which menu item?
+ char* new_value // any prompt data
+) {
+ char msg[160];
+ bool exit = false;
+
+ color_mode = CM_RAINBOW;
+
+ // Run recognition on the full page if needed.
+ switch (cmd_event) {
+ case BLAMER_CMD_EVENT:
+ case SHOW_SUBSCRIPT_CMD_EVENT:
+ case SHOW_SUPERSCRIPT_CMD_EVENT:
+ case SHOW_ITALIC_CMD_EVENT:
+ case SHOW_BOLD_CMD_EVENT:
+ case SHOW_UNDERLINE_CMD_EVENT:
+ case SHOW_FIXEDPITCH_CMD_EVENT:
+ case SHOW_SERIF_CMD_EVENT:
+ case SHOW_SMALLCAPS_CMD_EVENT:
+ case SHOW_DROPCAPS_CMD_EVENT:
+ if (!recog_done) {
+ recog_all_words(current_page_res, nullptr, nullptr, nullptr, 0);
+ recog_done = true;
+ }
+ break;
+ default:
+ break;
+ }
+
+ char* parameter;
+
+ switch (cmd_event) {
+ case NULL_CMD_EVENT:
+ break;
+
+ case CHANGE_DISP_CMD_EVENT:
+ case DUMP_WERD_CMD_EVENT:
+ case SHOW_POINT_CMD_EVENT:
+ case SHOW_BLN_WERD_CMD_EVENT:
+ case RECOG_WERDS:
+ case RECOG_PSEUDO:
+ case SHOW_BLOB_FEATURES:
+ mode =static_cast<CMD_EVENTS>(cmd_event);
+ break;
+ case DEBUG_WERD_CMD_EVENT:
+ mode = DEBUG_WERD_CMD_EVENT;
+ parameter = image_win->ShowInputDialog("Config File Name");
+ word_config_ = parameter;
+ delete[] parameter;
+ break;
+ case BOUNDING_BOX_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_BOX);
+ else
+ word_display_mode.reset(DF_BOX);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case BLAMER_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_BLAMER);
+ else
+ word_display_mode.reset(DF_BLAMER);
+ do_re_display(&tesseract::Tesseract::word_display);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case CORRECT_TEXT_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_TEXT);
+ else
+ word_display_mode.reset(DF_TEXT);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case POLYGONAL_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_POLYGONAL);
+ else
+ word_display_mode.reset(DF_POLYGONAL);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case BL_NORM_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_BN_POLYGONAL);
+ else
+ word_display_mode.reset(DF_BN_POLYGONAL);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case BITMAP_CMD_EVENT:
+ if (new_value[0] == 'T')
+ word_display_mode.set(DF_EDGE_STEP);
+ else
+ word_display_mode.reset(DF_EDGE_STEP);
+ mode = CHANGE_DISP_CMD_EVENT;
+ break;
+ case UNIFORM_DISP_CMD_EVENT:
+ do_re_display(&tesseract::Tesseract::word_set_display);
+ break;
+ case IMAGE_CMD_EVENT:
+ display_image =(new_value[0] == 'T');
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case BLOCKS_CMD_EVENT:
+ display_blocks =(new_value[0] == 'T');
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case BASELINES_CMD_EVENT:
+ display_baselines =(new_value[0] == 'T');
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_SUBSCRIPT_CMD_EVENT:
+ color_mode = CM_SUBSCRIPT;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_SUPERSCRIPT_CMD_EVENT:
+ color_mode = CM_SUPERSCRIPT;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_ITALIC_CMD_EVENT:
+ color_mode = CM_ITALIC;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_BOLD_CMD_EVENT:
+ color_mode = CM_BOLD;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_UNDERLINE_CMD_EVENT:
+ color_mode = CM_UNDERLINE;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_FIXEDPITCH_CMD_EVENT:
+ color_mode = CM_FIXEDPITCH;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_SERIF_CMD_EVENT:
+ color_mode = CM_SERIF;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_SMALLCAPS_CMD_EVENT:
+ color_mode = CM_SMALLCAPS;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case SHOW_DROPCAPS_CMD_EVENT:
+ color_mode = CM_DROPCAPS;
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case REFRESH_CMD_EVENT:
+ do_re_display(&tesseract::Tesseract::word_display);
+ break;
+ case QUIT_CMD_EVENT:
+ exit = true;
+ ScrollView::Exit();
+ break;
+
+ default:
+ snprintf(msg, sizeof(msg), "Unrecognised event %" PRId32 "(%s)",
+ cmd_event, new_value);
+ image_win->AddMessage(msg);
+ break;
+ }
+ return exit;
+}
+
+
+/**
+ * process_image_event()
+ *
+ * User has done something in the image window - mouse down or up. Work out
+ * what it is and do something with it.
+ * If DOWN - just remember where it was.
+ * If UP - for each word in the selected area do the operation defined by
+ * the current mode.
+ */
+void Tesseract::process_image_event( // action in image win
+ const SVEvent &event) {
+ // The following variable should remain static, since it is used by
+ // debug editor, which uses a single Tesseract instance.
+ static ICOORD down;
+ ICOORD up;
+ TBOX selection_box;
+ char msg[80];
+
+ switch(event.type) {
+
+ case SVET_SELECTION:
+ if (event.type == SVET_SELECTION) {
+ down.set_x(event.x + event.x_size);
+ down.set_y(event.y + event.y_size);
+ if (mode == SHOW_POINT_CMD_EVENT)
+ show_point(current_page_res, event.x, event.y);
+ }
+
+ up.set_x(event.x);
+ up.set_y(event.y);
+
+ selection_box = TBOX(down, up);
+
+ switch(mode) {
+ case CHANGE_DISP_CMD_EVENT:
+ process_selected_words(
+ current_page_res,
+ selection_box,
+ &tesseract::Tesseract::word_blank_and_set_display);
+ break;
+ case DUMP_WERD_CMD_EVENT:
+ process_selected_words(current_page_res,
+ selection_box,
+ &tesseract::Tesseract::word_dumper);
+ break;
+ case SHOW_BLN_WERD_CMD_EVENT:
+ process_selected_words(current_page_res,
+ selection_box,
+ &tesseract::Tesseract::word_bln_display);
+ break;
+ case DEBUG_WERD_CMD_EVENT:
+ debug_word(current_page_res, selection_box);
+ break;
+ case SHOW_POINT_CMD_EVENT:
+ break; // ignore up event
+
+ case RECOG_WERDS:
+ #ifndef DISABLED_LEGACY_ENGINE
+ image_win->AddMessage("Recogging selected words");
+ this->process_selected_words(current_page_res,
+ selection_box,
+ &Tesseract::recog_interactive);
+ #endif // ndef DISABLED_LEGACY_ENGINE
+ break;
+ case RECOG_PSEUDO:
+ image_win->AddMessage("Recogging selected blobs");
+ recog_pseudo_word(current_page_res, selection_box);
+ break;
+ case SHOW_BLOB_FEATURES:
+ blob_feature_display(current_page_res, selection_box);
+ break;
+
+ default:
+ sprintf(msg, "Mode %d not yet implemented", mode);
+ image_win->AddMessage(msg);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+/**
+ * debug_word
+ *
+ * Process the whole image, but load word_config_ for the selected word(s).
+ */
void Tesseract::debug_word(PAGE_RES* page_res, const TBOX &selection_box) {
#ifndef DISABLED_LEGACY_ENGINE
  // Drop adapted templates so the debug run starts from a clean,
  // reproducible classifier state.
  ResetAdaptiveClassifier();
#endif
  // Re-recognize the whole page; words inside selection_box presumably get
  // the word_config_ config applied (see recog_all_words) -- TODO confirm.
  recog_all_words(page_res, nullptr, &selection_box, word_config_.c_str(), 0);
}
+
+
+/**********************************************************************
+ * WERD PROCESSOR FUNCTIONS
+ * ========================
+ *
+ * These routines are invoked by one or more of:
+ * process_all_words()
+ * process_selected_words()
+ * or
+ * process_all_words_it()
+ * process_selected_words_it()
+ * for each word to be processed
+ **********************************************************************/
+
+/**
+ * word_blank_and_set_display() Word processor
+ *
+ * Blank display of word then redisplay word according to current display mode
+ * settings
+ */
+
+bool Tesseract::word_blank_and_set_display(PAGE_RES_IT* pr_it) {
+ pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
+ ScrollView::BLACK);
+ return word_set_display(pr_it);
+}
+
+
+/**
+ * word_bln_display()
+ *
+ * Normalize word and display in word window
+ */
+bool Tesseract::word_bln_display(PAGE_RES_IT* pr_it) {
+ WERD_RES* word_res = pr_it->word();
+ if (word_res->chopped_word == nullptr) {
+ // Setup word normalization parameters.
+ word_res->SetupForRecognition(unicharset, this, BestPix(),
+ tessedit_ocr_engine_mode, nullptr,
+ classify_bln_numeric_mode,
+ textord_use_cjk_fp_model,
+ poly_allow_detailed_fx,
+ pr_it->row()->row, pr_it->block()->block);
+ }
+ bln_word_window_handle()->Clear();
+ display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
+ 1.0, 0.0f, -1000.0f, 1000.0f);
+ C_BLOB_IT it(word_res->word->cblob_list());
+ ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);
+ for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+ it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
+ bln_word_window_handle());
+ color = WERD::NextColor(color);
+ }
+ bln_word_window_handle()->Update();
+ return true;
+}
+
+
+
+/**
+ * word_display() Word Processor
+ *
+ * Display a word according to its display modes
+ */
bool Tesseract::word_display(PAGE_RES_IT* pr_it) {
  WERD_RES* word_res = pr_it->word();
  WERD* word = word_res->word;
  TBOX word_bb; // word bounding box
  int word_height; // ht of word BB
  bool displayed_something = false;
  float shift; // from bot left

  // Attribute-coloration mode: outline each recognized blob RED when it
  // carries the tested attribute, GREEN otherwise.  Requires box_word
  // (i.e. a prior recog_all_words pass).
  if (color_mode != CM_RAINBOW && word_res->box_word != nullptr) {
  #ifndef DISABLED_LEGACY_ENGINE
    BoxWord* box_word = word_res->box_word;
    WERD_CHOICE* best_choice = word_res->best_choice;
    int length = box_word->length();
    // Without font info none of the font-based modes can be evaluated.
    if (word_res->fontinfo == nullptr) return false;
    const FontInfo& font_info = *word_res->fontinfo;
    for (int i = 0; i < length; ++i) {
      ScrollView::Color color = ScrollView::GREEN;
      switch (color_mode) {
        case CM_SUBSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_SUPERSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_ITALIC:
          if (font_info.is_italic())
            color = ScrollView::RED;
          break;
        case CM_BOLD:
          if (font_info.is_bold())
            color = ScrollView::RED;
          break;
        case CM_FIXEDPITCH:
          if (font_info.is_fixed_pitch())
            color = ScrollView::RED;
          break;
        case CM_SERIF:
          if (font_info.is_serif())
            color = ScrollView::RED;
          break;
        case CM_SMALLCAPS:
          if (word_res->small_caps)
            color = ScrollView::RED;
          break;
        case CM_DROPCAPS:
          if (best_choice->BlobPosition(i) == SP_DROPCAP)
            color = ScrollView::RED;
          break;
        // TODO(rays) underline is currently completely unsupported.
        case CM_UNDERLINE:
        default:
          break;
      }
      image_win->Pen(color);
      TBOX box = box_word->BlobBox(i);
      image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
    }
    return true;
  #else
    return false;
  #endif // ndef DISABLED_LEGACY_ENGINE
  }
  /*
  Note the double coercions of(COLOUR)((int32_t)editor_image_word_bb_color)
  etc. are to keep the compiler happy.
  */
  // display bounding box
  if (word->display_flag(DF_BOX)) {
    word->bounding_box().plot(image_win,
                              static_cast<ScrollView::Color>((int32_t)
                              editor_image_word_bb_color),
                              static_cast<ScrollView::Color>((int32_t)
                              editor_image_word_bb_color));

    auto c = static_cast<ScrollView::Color>((int32_t) editor_image_blob_bb_color);
    image_win->Pen(c);
    // cblob iterator
    C_BLOB_IT c_it(word->cblob_list());
    for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
      c_it.data()->bounding_box().plot(image_win);
    displayed_something = true;
  }

  // display edge steps
  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
    word->plot(image_win); // rainbow colors
    displayed_something = true;
  }

  // display poly approx
  if (word->display_flag(DF_POLYGONAL)) {
    // need to convert
    TWERD* tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
    tword->plot(image_win);
    delete tword;
    displayed_something = true;
  }

  // Display correct text and blamer information.
  STRING text;
  STRING blame;
  if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
    text = word->text();
  }
  // Blamer display shows "truth -> best_choice [reason]" unless the
  // result is known to be correct.
  if (word->display_flag(DF_BLAMER) &&
      !(word_res->blamer_bundle != nullptr &&
        word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
    text = "";
    const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
    if (blamer_bundle == nullptr) {
      text += "NULL";
    } else {
      text = blamer_bundle->TruthString();
    }
    text += " -> ";
    STRING best_choice_str;
    if (word_res->best_choice == nullptr) {
      best_choice_str = "NULL";
    } else {
      word_res->best_choice->string_and_lengths(&best_choice_str, nullptr);
    }
    text += best_choice_str;
    IncorrectResultReason reason = (blamer_bundle == nullptr) ?
        IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
    ASSERT_HOST(reason < IRR_NUM_REASONS);
    blame += " [";
    blame += BlamerBundle::IncorrectReasonName(reason);
    blame += "]";
  }
  // Render the text (and blame line beneath it) in red, scaled to the
  // word height but capped at 20pt.
  if (text.length() > 0) {
    word_bb = word->bounding_box();
    image_win->Pen(ScrollView::RED);
    word_height = word_bb.height();
    int text_height = 0.50 * word_height;
    if (text_height > 20) text_height = 20;
    image_win->TextAttributes("Arial", text_height, false, false, false);
    shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
    image_win->Text(word_bb.left() + shift,
                    word_bb.bottom() + 0.25 * word_height, text.c_str());
    if (blame.length() > 0) {
      image_win->Text(word_bb.left() + shift,
                      word_bb.bottom() + 0.25 * word_height - text_height,
                      blame.c_str());
    }

    displayed_something = true;
  }

  if (!displayed_something) // display BBox anyway
    word->bounding_box().plot(image_win,
        static_cast<ScrollView::Color>((int32_t) editor_image_word_bb_color),
        static_cast<ScrollView::Color>((int32_t)
            editor_image_word_bb_color));
  return true;
}
+} // namespace tesseract
+#endif // !GRAPHICS_DISABLED
+
+namespace tesseract {
+/**
+ * word_dumper()
+ *
+ * Dump members to the debug window
+ */
+bool Tesseract::word_dumper(PAGE_RES_IT* pr_it) {
+ if (pr_it->block()->block != nullptr) {
+ tprintf("\nBlock data...\n");
+ pr_it->block()->block->print(nullptr, false);
+ }
+ tprintf("\nRow data...\n");
+ pr_it->row()->row->print(nullptr);
+ tprintf("\nWord data...\n");
+ WERD_RES* word_res = pr_it->word();
+ word_res->word->print();
+ if (word_res->blamer_bundle != nullptr && wordrec_debug_blamer &&
+ word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
+ tprintf("Current blamer debug: %s\n",
+ word_res->blamer_bundle->debug().c_str());
+ }
+ return true;
+}
+
+#ifndef GRAPHICS_DISABLED
+/**
+ * word_set_display() Word processor
+ *
+ * Display word according to current display mode settings
+ */
+bool Tesseract::word_set_display(PAGE_RES_IT* pr_it) {
+ WERD* word = pr_it->word()->word;
+ word->set_display_flag(DF_BOX, word_display_mode[DF_BOX]);
+ word->set_display_flag(DF_TEXT, word_display_mode[DF_TEXT]);
+ word->set_display_flag(DF_POLYGONAL, word_display_mode[DF_POLYGONAL]);
+ word->set_display_flag(DF_EDGE_STEP, word_display_mode[DF_EDGE_STEP]);
+ word->set_display_flag(DF_BN_POLYGONAL,
+ word_display_mode[DF_BN_POLYGONAL]);
+ word->set_display_flag(DF_BLAMER, word_display_mode[DF_BLAMER]);
+ return word_display(pr_it);
+}
+
+
+// page_res is non-const because the iterator doesn't know if you are going
+// to change the items it points to! Really a const here though.
+void Tesseract::blob_feature_display(PAGE_RES* page_res,
+ const TBOX& selection_box) {
+#ifndef DISABLED_LEGACY_ENGINE
+ PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
+ if (it != nullptr) {
+ WERD_RES* word_res = it->word();
+ word_res->x_height = it->row()->row->x_height();
+ word_res->SetupForRecognition(unicharset, this, BestPix(),
+ tessedit_ocr_engine_mode, nullptr,
+ classify_bln_numeric_mode,
+ textord_use_cjk_fp_model,
+ poly_allow_detailed_fx,
+ it->row()->row, it->block()->block);
+ TWERD* bln_word = word_res->chopped_word;
+ TBLOB* bln_blob = bln_word->blobs[0];
+ INT_FX_RESULT_STRUCT fx_info;
+ std::vector<INT_FEATURE_STRUCT> bl_features;
+ std::vector<INT_FEATURE_STRUCT> cn_features;
+ Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
+ &cn_features, &fx_info, nullptr);
+ // Display baseline features.
+ ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
+ ClearFeatureSpaceWindow(baseline, bl_win);
+ for (int f = 0; f < bl_features.size(); ++f)
+ RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
+ bl_win->Update();
+ // Display cn features.
+ ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
+ ClearFeatureSpaceWindow(character, cn_win);
+ for (int f = 0; f < cn_features.size(); ++f)
+ RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
+ cn_win->Update();
+
+ it->DeleteCurrentWord();
+ delete it;
+ }
+#endif // ndef DISABLED_LEGACY_ENGINE
+}
+
+#endif // !GRAPHICS_DISABLED
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/pgedit.h b/tesseract/src/ccmain/pgedit.h
new file mode 100644
index 00000000..55467f67
--- /dev/null
+++ b/tesseract/src/ccmain/pgedit.h
@@ -0,0 +1,71 @@
+///////////////////////////////////////////////////////////////////////
+// File: pgedit.h
+// Description: Page structure file editor
+// Author: Joern Wanke
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef PGEDIT_H
+#define PGEDIT_H
+
+#include "params.h" // for INT_VAR_H, IntParam, STRING_VAR_H, StringParam
+#include "scrollview.h" // for SVEvent (ptr only), SVEventHandler, ScrollView
+
namespace tesseract {

class BLOCK_LIST;
class PAGE_RES;

class Tesseract;

// A small event handler class to process incoming events to
// this window.  Forwards popup/menu/image events to the owning
// Tesseract instance (implementation in pgedit.cpp).
class PGEventHandler : public SVEventHandler {
 public:
  // NOTE(review): single-argument constructor -- consider marking it
  // explicit to prevent implicit Tesseract* -> PGEventHandler conversion.
  PGEventHandler(tesseract::Tesseract* tess) : tess_(tess) {
  }
  void Notify(const SVEvent* sve) override;
 private:
  tesseract::Tesseract* tess_; // not owned
};

// NOTE(review): several declarations below (current_block_list,
// editor_image_height, editor_image_width, editor_smd_scale_factor) have
// no matching definition in pgedit.cpp as shown here -- confirm they are
// defined elsewhere before relying on them, or they will fail at link time.
extern BLOCK_LIST *current_block_list;
extern STRING_VAR_H (editor_image_win_name, "EditorImage",
"Editor image window name");
extern INT_VAR_H (editor_image_xpos, 590, "Editor image X Pos");
extern INT_VAR_H (editor_image_ypos, 10, "Editor image Y Pos");
extern INT_VAR_H (editor_image_height, 680, "Editor image height");
extern INT_VAR_H (editor_image_width, 655, "Editor image width");
extern INT_VAR_H (editor_image_word_bb_color, BLUE,
"Word bounding box colour");
extern INT_VAR_H (editor_image_blob_bb_color, YELLOW,
"Blob bounding box colour");
extern INT_VAR_H (editor_image_text_color, WHITE, "Correct text colour");
extern STRING_VAR_H (editor_dbwin_name, "EditorDBWin",
"Editor debug window name");
extern INT_VAR_H (editor_dbwin_xpos, 50, "Editor debug window X Pos");
extern INT_VAR_H (editor_dbwin_ypos, 500, "Editor debug window Y Pos");
extern INT_VAR_H (editor_dbwin_height, 24, "Editor debug window height");
extern INT_VAR_H (editor_dbwin_width, 80, "Editor debug window width");
extern STRING_VAR_H (editor_word_name, "BlnWords",
"BL normalised word window");
extern INT_VAR_H (editor_word_xpos, 60, "Word window X Pos");
extern INT_VAR_H (editor_word_ypos, 510, "Word window Y Pos");
extern INT_VAR_H (editor_word_height, 240, "Word window height");
extern INT_VAR_H (editor_word_width, 655, "Word window width");
extern double_VAR_H (editor_smd_scale_factor, 1.0, "Scaling for smd image");

} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/ccmain/recogtraining.cpp b/tesseract/src/ccmain/recogtraining.cpp
new file mode 100644
index 00000000..9368f32a
--- /dev/null
+++ b/tesseract/src/ccmain/recogtraining.cpp
@@ -0,0 +1,238 @@
+///////////////////////////////////////////////////////////////////////
+// File: recogtraining.cpp
+// Description: Functions for ambiguity and parameter training.
+// Author: Daria Antonova
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include "boxread.h"
+#include "control.h"
+#include "host.h" // for NearlyEqual
+#include "ratngs.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "reject.h"
+#endif
+#include "stopper.h"
+
+namespace tesseract {
+
+const int16_t kMaxBoxEdgeDiff = 2;
+
+// Sets flags necessary for recognition in the training mode.
+// Opens and returns the pointer to the output file ("<filename>.txt",
+// opened for append).  Aborts via ASSERT_HOST if the file cannot be opened.
+FILE* Tesseract::init_recog_training(const char* filename) {
+  if (tessedit_ambigs_training) {
+    tessedit_tess_adaption_mode.set_value(0);  // turn off adaption
+    tessedit_enable_doc_dict.set_value(0);     // turn off document dictionary
+    // Explore all segmentations.
+    getDict().stopper_no_acceptable_choices.set_value(1);
+  }
+
+  // Strip the extension and append ".txt".  Note: assigning '\0' into the
+  // middle of a string object does not shorten it - the NUL becomes an
+  // embedded character, a subsequent += appends after the old extension,
+  // and c_str() stops at the NUL, yielding the wrong filename.  Truncate
+  // explicitly instead.
+  std::string output_fname = filename;
+  const size_t lastdot = output_fname.rfind('.');
+  if (lastdot != std::string::npos)
+    output_fname.resize(lastdot);
+  output_fname += ".txt";
+  FILE* output_file = fopen(output_fname.c_str(), "a+");
+  if (output_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", output_fname.c_str());
+    ASSERT_HOST(output_file);
+  }
+  return output_file;
+}
+
+// Copies the bounding box from page_res_it->word() to the given TBOX.
+// Skips forward past entries with no word; returns false when the page
+// is exhausted.
+static bool read_t(PAGE_RES_IT* page_res_it, TBOX* tbox) {
+  // Advance until a word is available or we run out of blocks.
+  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr)
+    page_res_it->forward();
+
+  if (page_res_it->word() == nullptr)
+    return false;
+
+  *tbox = page_res_it->word()->word->bounding_box();
+
+  // If tbox->left() is negative, the training image has vertical text and
+  // all the coordinates of bounding boxes of page_res are rotated by 90
+  // degrees in a counterclockwise direction. We need to rotate the TBOX back
+  // in order to compare with the TBOXes of box files.
+  if (tbox->left() < 0)
+    tbox->rotate(FCOORD(0.0, -1.0));
+
+  return true;
+}
+
+// This function takes tif/box pair of files and runs recognition on the image,
+// while making sure that the word bounds that tesseract identified roughly
+// match to those specified by the input box file. For each word (ngram in a
+// single bounding box from the input box file) it outputs the OCRed result,
+// the correct label, rating and certainty.
+void Tesseract::recog_training_segmented(const char* filename,
+                                         PAGE_RES* page_res,
+                                         volatile ETEXT_DESC* monitor,
+                                         FILE* output_file) {
+  // Strip the extension and append ".box".  Writing '\0' into the middle of
+  // a std::string does not shorten it (the NUL is just an embedded char), so
+  // the old `box_fname[pos] = '\0'` trick made fopen() see only the basename.
+  // Truncate explicitly with resize() instead.
+  std::string box_fname = filename;
+  const size_t lastdot = box_fname.rfind('.');
+  if (lastdot != std::string::npos)
+    box_fname.resize(lastdot);
+  box_fname += ".box";
+  // ReadNextBox() will close box_file
+  FILE* box_file = fopen(box_fname.c_str(), "r");
+  if (box_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", box_fname.c_str());
+    ASSERT_HOST(box_file);
+  }
+
+  PAGE_RES_IT page_res_it;
+  page_res_it.page_res = page_res;
+  page_res_it.restart_page();
+  STRING label;
+
+  // Process all the words on this page.
+  TBOX tbox;  // tesseract-identified box
+  TBOX bbox;  // box from the box file
+  bool keep_going;
+  int line_number = 0;
+  int examined_words = 0;
+  do {
+    keep_going = read_t(&page_res_it, &tbox);
+    keep_going &=
+        ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
+    // Align bottom left points of the TBOXes.
+    while (keep_going &&
+           !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
+      if (bbox.bottom() < tbox.bottom()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going =
+            ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
+      }
+    }
+    while (keep_going &&
+           !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
+      if (bbox.left() > tbox.left()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going =
+            ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
+      }
+    }
+    // OCR the word if top right points of the TBOXes are similar.
+    if (keep_going &&
+        NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
+        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
+      ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
+      examined_words++;
+    }
+    page_res_it.forward();
+  } while (keep_going);
+
+  // Set up scripts on all of the words that did not get sent to
+  // ambigs_classify_and_output. They all should have, but if all the
+  // werd_res's don't get uch_sets, tesseract will crash when you try
+  // to iterate over them. :-(
+  int total_words = 0;
+  for (page_res_it.restart_page(); page_res_it.block() != nullptr;
+       page_res_it.forward()) {
+    if (page_res_it.word()) {
+      if (page_res_it.word()->uch_set == nullptr)
+        page_res_it.word()->SetupFake(unicharset);
+      total_words++;
+    }
+  }
+  if (examined_words < 0.85 * total_words) {
+    tprintf(
+        "TODO(antonova): clean up recog_training_segmented; "
+        " It examined only a small fraction of the ambigs image.\n");
+  }
+  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words,
+          total_words);
+}
+
+// Helper prints the given set of blob choices: the concatenated unichars,
+// then the label, the summed rating, and the worst (minimum) certainty.
+static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
+                      const UNICHARSET& unicharset, const char* label,
+                      FILE* output_file) {
+  float total_rating = 0.0f;
+  float worst_certainty = 0.0f;
+  for (int i = 0; i < length; ++i) {
+    const BLOB_CHOICE* choice = blob_choices[i];
+    fprintf(output_file, "%s", unicharset.id_to_unichar(choice->unichar_id()));
+    total_rating += choice->rating();
+    if (choice->certainty() < worst_certainty)
+      worst_certainty = choice->certainty();
+  }
+  fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, total_rating,
+          worst_certainty);
+}
+
+// Helper recursively prints all paths through the ratings matrix, starting
+// at column col.
+// blob_choices accumulates the choices picked so far along the current path
+// (one entry per matrix cell); the caller must size it to the matrix
+// dimension so that index `length` is always valid.
+static void PrintMatrixPaths(int col, int dim, const MATRIX& ratings,
+                             int length, const BLOB_CHOICE** blob_choices,
+                             const UNICHARSET& unicharset, const char* label,
+                             FILE* output_file) {
+  // Only cells within the matrix band are classified, hence the
+  // row - col < bandwidth bound.
+  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
+    if (ratings.get(col, row) != NOT_CLASSIFIED) {
+      BLOB_CHOICE_IT bc_it(ratings.get(col, row));
+      for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
+        blob_choices[length] = bc_it.data();
+        if (row + 1 < dim) {
+          // Columns remain: extend the path recursively.
+          PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
+                           unicharset, label, output_file);
+        } else {
+          // The path spans the whole word: emit it.
+          PrintPath(length + 1, blob_choices, unicharset, label, output_file);
+        }
+      }
+    }
+  }
+}
+
+// Runs classify_word_pass1() on the current word. Outputs Tesseract's
+// raw choice as a result of the classification. For words labeled with a
+// single unichar also outputs all alternatives from blob_choices of the
+// best choice.
+void Tesseract::ambigs_classify_and_output(const char* label,
+                                           PAGE_RES_IT* pr_it,
+                                           FILE* output_file) {
+  // Classify word.
+  fflush(stdout);
+  WordData word_data(*pr_it);
+  SetupWordPassN(1, &word_data);
+  classify_word_and_language(1, pr_it, &word_data);
+  WERD_RES* werd_res = word_data.word;
+  WERD_CHOICE* best_choice = werd_res->best_choice;
+  ASSERT_HOST(best_choice != nullptr);
+
+  // Validate the label: skip words whose ground truth cannot be encoded
+  // with the current unicharset.
+  std::vector<UNICHAR_ID> encoding;
+  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
+    tprintf("Not outputting illegal unichar %s\n", label);
+    return;
+  }
+
+  // Dump all paths through the ratings matrix (which is normally small).
+  // Use a vector as the scratch array so it is released automatically on
+  // every exit path (the raw new[]/delete[] pair leaked on exceptions).
+  int dim = werd_res->ratings->dimension();
+  std::vector<const BLOB_CHOICE*> blob_choices(dim);
+  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices.data(),
+                   unicharset, label, output_file);
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/reject.cpp b/tesseract/src/ccmain/reject.cpp
new file mode 100644
index 00000000..e2df9f40
--- /dev/null
+++ b/tesseract/src/ccmain/reject.cpp
@@ -0,0 +1,792 @@
+/**********************************************************************
+ * File: reject.cpp (Formerly reject.c)
+ * Description: Rejection functions used in tessedit
+ * Author: Phil Cheatle
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "reject.h"
+
+#ifdef DISABLED_LEGACY_ENGINE
+
+#include "tesseractclass.h"
+
+namespace tesseract {
+
+// Returns the dictionary permuter type of the best choice, except that
+// document-dawg-only matches are treated as unsafe and reported as 0.
+int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
+  const int perm = werd_res->tesseract->dict_word(*werd_res->best_choice);
+  if (perm == DOC_DAWG_PERM)
+    return 0;
+  return perm;
+}
+} // namespace tesseract
+
+#else
+
+#include "tessvars.h"
+#include "control.h"
+#include "docqual.h"
+#include "tesseractclass.h"
+
+#include "genericvector.h"
+#include "helpers.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+
+namespace tesseract {
+
+CLISTIZEH (STRING) CLISTIZE (STRING)
+
+/*************************************************************************
+ * set_done()
+ *
+ * Set the done flag based on the word acceptability criteria.
+ * A word is done when Tesseract accepted it, it contains no unrecognised
+ * (blank) characters, it is unambiguous, and it came from a dictionary or
+ * number permuter (with an extra pass-1 I/l/1 sanity check).
+ *************************************************************************/
+
+void Tesseract::set_done(WERD_RES *word, int16_t pass) {
+  const WERD_CHOICE* choice = word->best_choice;
+  // Accepted by Tesseract and free of unrecognised (space) blobs.
+  word->done = word->tess_accepted &&
+      strchr(choice->unichar_string().c_str(), ' ') == nullptr;
+  const bool is_ambig = choice->dangerous_ambig_found();
+  const bool from_dict = choice->permuter() == SYSTEM_DAWG_PERM ||
+                         choice->permuter() == FREQ_DAWG_PERM ||
+                         choice->permuter() == USER_DAWG_PERM;
+  // On pass 1, a non-dictionary or ambiguous word with a potential I/l/1
+  // confusion is not done yet.
+  if (word->done && pass == 1 && (!from_dict || is_ambig) &&
+      one_ell_conflict(word, false)) {
+    if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
+    word->done = false;
+  }
+  // Words that are neither dictionary nor number words, or are ambiguous,
+  // are never considered done.
+  if (word->done &&
+      ((!from_dict && choice->permuter() != NUMBER_PERM) || is_ambig)) {
+    if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
+    word->done = false;
+  }
+  if (tessedit_rejection_debug) {
+    tprintf("set_done(): done=%d\n", word->done);
+    word->best_choice->print("");
+  }
+}
+
+
+/*************************************************************************
+ * make_reject_map()
+ *
+ * Sets the done flag to indicate whether the result is acceptable.
+ *
+ * Sets a reject map for the word.
+ *************************************************************************/
+void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
+  int i;
+  int offset;
+
+  flip_0O(word);
+  check_debug_pt(word, -1); // For trap only
+  set_done(word, pass); // Set acceptance
+  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
+  reject_blanks(word);
+  /*
+  0: Rays original heuristic - the baseline
+  */
+  if (tessedit_reject_mode == 0) {
+    if (!word->done)
+      reject_poor_matches(word);
+  } else if (tessedit_reject_mode == 5) {
+    /*
+    5: Reject I/1/l from words where there is no strong contextual confirmation;
+    the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
+    and the whole of any words which are very small
+    */
+    // Words whose x-height maps to fewer than min_sane_x_ht_pixels image
+    // pixels are rejected outright as too small to trust.
+    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
+      word->reject_map.rej_word_small_xht();
+    } else {
+      one_ell_conflict(word, true);
+      /*
+      Originally the code here just used the done flag. Now I have duplicated
+      and unpacked the conditions for setting the done flag so that each
+      mechanism can be turned on or off independently. This works WITHOUT
+      affecting the done flag setting.
+      */
+      if (rej_use_tess_accepted && !word->tess_accepted)
+        word->reject_map.rej_word_not_tess_accepted ();
+
+      if (rej_use_tess_blanks &&
+        (strchr (word->best_choice->unichar_string().c_str(), ' ') != nullptr))
+        word->reject_map.rej_word_contains_blanks ();
+
+      WERD_CHOICE* best_choice = word->best_choice;
+      if (rej_use_good_perm) {
+        // Dictionary words pass if their character pattern is sensible
+        // (or that check is disabled); number-permuter words only get
+        // their stray alpha characters rejected; anything else is
+        // rejected wholesale as coming from a bad permuter.
+        if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
+             best_choice->permuter() == FREQ_DAWG_PERM ||
+             best_choice->permuter() == USER_DAWG_PERM) &&
+            (!rej_use_sensible_wd ||
+             acceptable_word_string(*word->uch_set,
+                                    best_choice->unichar_string().c_str(),
+                                    best_choice->unichar_lengths().c_str()) !=
+                 AC_UNACCEPTABLE)) {
+          // PASSED TEST
+        } else if (best_choice->permuter() == NUMBER_PERM) {
+          if (rej_alphas_in_number_perm) {
+            for (i = 0, offset = 0;
+                 best_choice->unichar_string()[offset] != '\0';
+                 offset += best_choice->unichar_lengths()[i++]) {
+              if (word->reject_map[i].accepted() &&
+                  word->uch_set->get_isalpha(
+                      best_choice->unichar_string().c_str() + offset,
+                      best_choice->unichar_lengths()[i]))
+                word->reject_map[i].setrej_bad_permuter();
+              // rej alpha
+            }
+          }
+        } else {
+          word->reject_map.rej_word_bad_permuter();
+        }
+      }
+      /* Ambig word rejection was here once !!*/
+    }
+  } else {
+    tprintf("BAD tessedit_reject_mode\n");
+    ASSERT_HOST("Fatal error encountered!" == nullptr);
+  }
+
+  if (tessedit_image_border > -1)
+    reject_edge_blobs(word);
+
+  check_debug_pt (word, 10);
+  if (tessedit_rejection_debug) {
+    tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
+    tprintf("Certainty: %f Rating: %f\n",
+      word->best_choice->certainty (), word->best_choice->rating ());
+    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
+  }
+
+  flip_hyphens(word);
+  check_debug_pt(word, 20);
+}
+
+// Marks every unrecognised blob - output as a blank (' ') - as a
+// Tesseract failure in the reject map.
+void reject_blanks(WERD_RES *word) {
+  const char* str = word->best_choice->unichar_string().c_str();
+  const char* lens = word->best_choice->unichar_lengths().c_str();
+  int offset = 0;
+  for (int i = 0; str[offset] != '\0'; offset += lens[i], ++i) {
+    if (str[offset] == ' ')
+      word->reject_map[i].setrej_tess_failure();  // rej unrecognised blobs
+  }
+}
+
+// Rejects every character that belongs to the I/l/1 conflict set.
+void Tesseract::reject_I_1_L(WERD_RES *word) {
+  const char* str = word->best_choice->unichar_string().c_str();
+  const char* lens = word->best_choice->unichar_lengths().c_str();
+  int offset = 0;
+  for (int i = 0; str[offset] != '\0'; offset += lens[i], ++i) {
+    if (STRING(conflict_set_I_l_1).contains(str[offset])) {
+      word->reject_map[i].setrej_1Il_conflict();  // rej 1Il conflict
+    }
+  }
+}
+
+// Rejects spaces as failures and any character whose certainty falls
+// below the threshold derived from the largest certainty gap.
+void reject_poor_matches(WERD_RES *word) {
+  const float threshold = compute_reject_threshold(word->best_choice);
+  const int len = word->best_choice->length();
+  for (int i = 0; i < len; ++i) {
+    if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {
+      word->reject_map[i].setrej_tess_failure();
+    } else if (word->best_choice->certainty(i) < threshold) {
+      word->reject_map[i].setrej_poor_match();
+    }
+  }
+}
+
+
+/**********************************************************************
+ * compute_reject_threshold
+ *
+ * Set a rejection threshold for this word.
+ * Initially this is a trivial function which looks for the largest
+ * gap in the certainty value.
+ **********************************************************************/
+
+// Returns the certainty below which characters should be rejected.  The
+// threshold sits in the middle of the biggest gap in the sorted
+// per-character certainties (only searched for words of >= 3 blobs);
+// when no gap is found it falls below the worst certainty so that
+// everything is rejected.
+float compute_reject_threshold(WERD_CHOICE* word) {
+  const int blob_count = word->length();
+  if (blob_count == 0)
+    return 0.0f;  // Empty word: avoid indexing an empty ratings vector.
+
+  GenericVector<float> ratings;
+  ratings.resize_no_init(blob_count);
+  for (int i = 0; i < blob_count; ++i) {
+    ratings[i] = word->certainty(i);
+  }
+  ratings.sort();
+
+  float bestgap = 0.0f;             // biggest gap
+  float gapstart = ratings[0] - 1;  // all reject if none better
+  if (blob_count >= 3) {
+    for (int index = 0; index < blob_count - 1; index++) {
+      if (ratings[index + 1] - ratings[index] > bestgap) {
+        bestgap = ratings[index + 1] - ratings[index];
+        gapstart = ratings[index];  // bottom of the biggest gap
+      }
+    }
+  }
+  return gapstart + bestgap / 2;
+}
+
+
+/*************************************************************************
+ * reject_edge_blobs()
+ *
+ * If the word is perilously close to the edge of the image, reject those blobs
+ * in the word which are too close to the edge as they could be clipped.
+ *************************************************************************/
+void Tesseract::reject_edge_blobs(WERD_RES *word) {
+  const int border = tessedit_image_border;
+  const int max_x = ImageWidth() - 1;
+  const int max_y = ImageHeight() - 1;
+  const TBOX word_box = word->word->bounding_box();
+  // Use the box_word as it is already denormed back to image coordinates.
+  const int blobcount = word->box_word->length();
+
+  // Fast exit when the whole word sits comfortably inside the border.
+  if (word_box.left() >= border && word_box.bottom() >= border &&
+      word_box.right() + border <= max_x && word_box.top() + border <= max_y)
+    return;
+
+  ASSERT_HOST(word->reject_map.length() == blobcount);
+  for (int b = 0; b < blobcount; ++b) {
+    const TBOX blob_box = word->box_word->BlobBox(b);
+    if (blob_box.left() < border || blob_box.bottom() < border ||
+        blob_box.right() + border > max_x || blob_box.top() + border > max_y) {
+      word->reject_map[b].setrej_edge_char();  // Close to edge
+    }
+  }
+}
+
+/**********************************************************************
+ * one_ell_conflict()
+ *
+ * Identify words where there is a potential I/l/1 error.
+ * - A bundle of contextual heuristics!
+ *
+ * Returns true if the word contains an unresolved I/l/1 confusion; when
+ * update_map is true the offending characters are also marked in
+ * word_res->reject_map.  NOTE: the routine temporarily edits
+ * best_choice's unichar_string in place to test the flipped reading
+ * against the dictionary, then restores the original character.
+ **********************************************************************/
+bool Tesseract::one_ell_conflict(WERD_RES* word_res, bool update_map) {
+  const char *word;
+  const char *lengths;
+  int16_t word_len;                //its length
+  int16_t first_alphanum_index_;
+  int16_t first_alphanum_offset_;
+  int16_t i;
+  int16_t offset;
+  bool non_conflict_set_char;      //non conf set a/n?
+  bool conflict = false;
+  bool allow_1s;
+  ACCEPTABLE_WERD_TYPE word_type;
+  bool dict_perm_type;
+  bool dict_word_ok;
+  int dict_word_type;
+
+  word = word_res->best_choice->unichar_string().c_str();
+  lengths = word_res->best_choice->unichar_lengths().c_str();
+  word_len = strlen(lengths);
+  /*
+  If there are no occurrences of the conflict set characters then the word
+  is OK.
+  */
+  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr)
+    return false;
+
+  /*
+  There is a conflict if there are NO other (confirmed) alphanumerics apart
+  from those in the conflict set.
+  */
+
+  for (i = 0, offset = 0, non_conflict_set_char = false;
+       (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
+    non_conflict_set_char =
+        (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
+         word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
+        !STRING (conflict_set_I_l_1).contains (word[offset]);
+  if (!non_conflict_set_char) {
+    if (update_map)
+      reject_I_1_L(word_res);
+    return true;
+  }
+
+  /*
+  If the word is accepted by a dawg permuter, and the first alpha character
+  is "I" or "l", check to see if the alternative is also a dawg word. If it
+  is, then there is a potential error otherwise the word is ok.
+  */
+
+  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
+    (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
+    (rej_trust_doc_dawg &&
+    (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
+    (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
+  dict_word_type = dict_word(*(word_res->best_choice));
+  dict_word_ok = (dict_word_type > 0) &&
+    (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
+
+  if ((rej_1Il_use_dict_word && dict_word_ok) ||
+      (rej_1Il_trust_permuter_type && dict_perm_type) ||
+      (dict_perm_type && dict_word_ok)) {
+    first_alphanum_index_ = first_alphanum_index (word, lengths);
+    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
+    // Try flipping a leading 'I' to 'l': if the flipped word is also a
+    // safe dictionary word, the reading is genuinely ambiguous.
+    if (lengths[first_alphanum_index_] == 1 &&
+        word[first_alphanum_offset_] == 'I') {
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+      if (safe_dict_word(word_res) > 0) {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+        if (update_map)
+          word_res->reject_map[first_alphanum_index_].
+              setrej_1Il_conflict();
+        return true;
+      }
+      else {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+        return false;
+      }
+    }
+
+    // Symmetric check: flip a leading 'l' to 'I'.
+    if (lengths[first_alphanum_index_] == 1 &&
+        word[first_alphanum_offset_] == 'l') {
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+      if (safe_dict_word(word_res) > 0) {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+        if (update_map)
+          word_res->reject_map[first_alphanum_index_].
+              setrej_1Il_conflict();
+        return true;
+      }
+      else {
+        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+        return false;
+      }
+    }
+    return false;
+  }
+
+  /*
+  NEW 1Il code. The old code relied on permuter types too much. In fact,
+  tess will use TOP_CHOICE permute for good things like "palette".
+  In this code the string is examined independently to see if it looks like
+  a well formed word.
+  */
+
+  /*
+  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
+  dictionary word.
+  */
+  first_alphanum_index_ = first_alphanum_index (word, lengths);
+  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
+  if (lengths[first_alphanum_index_] == 1 &&
+      word[first_alphanum_offset_] == 'l') {
+    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+    if (safe_dict_word(word_res) > 0)
+      return false;          // The flipped reading is kept in place.
+    else
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+  }
+  else if (lengths[first_alphanum_index_] == 1 &&
+           word[first_alphanum_offset_] == 'I') {
+    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
+    if (safe_dict_word(word_res) > 0)
+      return false;          // The flipped reading is kept in place.
+    else
+      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
+  }
+  /*
+  For strings containing digits:
+  If there are no alphas OR the numeric permuter liked the word,
+  reject any non 1 conflict chs
+  Else reject all conflict chs
+  */
+  if (word_contains_non_1_digit (word, lengths)) {
+    allow_1s = (alpha_count (word, lengths) == 0) ||
+      (word_res->best_choice->permuter () == NUMBER_PERM);
+
+    int16_t offset;
+    conflict = false;
+    for (i = 0, offset = 0; word[offset] != '\0';
+         offset += word_res->best_choice->unichar_lengths()[i++]) {
+      if ((!allow_1s || (word[offset] != '1')) &&
+          STRING (conflict_set_I_l_1).contains (word[offset])) {
+        if (update_map)
+          word_res->reject_map[i].setrej_1Il_conflict ();
+        conflict = true;
+      }
+    }
+    return conflict;
+  }
+  /*
+  For anything else. See if it conforms to an acceptable word type. If so,
+  treat accordingly.
+  */
+  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
+  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
+    first_alphanum_index_ = first_alphanum_index (word, lengths);
+    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
+    // Well-formed word: only a leading conflict-set char is suspicious.
+    if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
+      if (update_map)
+        word_res->reject_map[first_alphanum_index_].
+            setrej_1Il_conflict ();
+      return true;
+    }
+    else
+      return false;
+  }
+  else if (word_type == AC_UPPER_CASE) {
+    return false;
+  }
+  else {
+    // Malformed word: reject every conflict-set character.
+    if (update_map)
+      reject_I_1_L(word_res);
+    return true;
+  }
+}
+
+
+// Returns the index (in unichar units) of the first alphanumeric
+// character in the word, or -1 if there is none.
+int16_t Tesseract::first_alphanum_index(const char *word,
+                                        const char *word_lengths) {
+  int16_t index = 0;
+  for (int16_t offset = 0; word[offset] != '\0';
+       offset += word_lengths[index++]) {
+    if (unicharset.get_isalpha(word + offset, word_lengths[index]) ||
+        unicharset.get_isdigit(word + offset, word_lengths[index]))
+      return index;
+  }
+  return -1;
+}
+
+// Returns the byte offset of the first alphanumeric character in the
+// word, or -1 if there is none.
+int16_t Tesseract::first_alphanum_offset(const char *word,
+                                         const char *word_lengths) {
+  int16_t index = 0;
+  for (int16_t offset = 0; word[offset] != '\0';
+       offset += word_lengths[index++]) {
+    if (unicharset.get_isalpha(word + offset, word_lengths[index]) ||
+        unicharset.get_isdigit(word + offset, word_lengths[index]))
+      return offset;
+  }
+  return -1;
+}
+
+// Counts the alphabetic characters in the word.
+int16_t Tesseract::alpha_count(const char *word,
+                               const char *word_lengths) {
+  int16_t count = 0;
+  int16_t idx = 0;
+  for (int16_t offset = 0; word[offset] != '\0';
+       offset += word_lengths[idx++]) {
+    if (unicharset.get_isalpha(word + offset, word_lengths[idx]))
+      ++count;
+  }
+  return count;
+}
+
+
+// Returns true if the word contains any digit other than '1'.
+bool Tesseract::word_contains_non_1_digit(const char* word,
+                                          const char* word_lengths) {
+  int16_t idx = 0;
+  for (int16_t offset = 0; word[offset] != '\0';
+       offset += word_lengths[idx++]) {
+    const bool is_digit =
+        unicharset.get_isdigit(word + offset, word_lengths[idx]);
+    if (is_digit && (word_lengths[idx] != 1 || word[offset] != '1'))
+      return true;
+  }
+  return false;
+}
+
+/*************************************************************************
+ * dont_allow_1Il()
+ * Don't unreject LONE accepted 1Il conflict set chars
+ *
+ * If the only accepted characters in the word come from the I/l/1
+ * conflict set (no other accepted alphanumeric confirms the word), then
+ * those accepted conflict characters are re-rejected.
+ *************************************************************************/
+void Tesseract::dont_allow_1Il(WERD_RES *word) {
+  int i = 0;
+  int offset;
+  int word_len = word->reject_map.length();
+  const char *s = word->best_choice->unichar_string().c_str();
+  const char *lengths = word->best_choice->unichar_lengths().c_str();
+  bool accepted_1Il = false;
+
+  // Pass 1: scan for an accepted non-conflict alphanumeric.  Finding one
+  // means the conflict-set characters have contextual support, so keep them.
+  for (i = 0, offset = 0; i < word_len;
+       offset += word->best_choice->unichar_lengths()[i++]) {
+    if (word->reject_map[i].accepted()) {
+      if (STRING(conflict_set_I_l_1).contains(s[offset])) {
+        accepted_1Il = true;
+      } else {
+        if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
+            word->uch_set->get_isdigit(s + offset, lengths[i]))
+          return; // >=1 non 1Il ch accepted
+      }
+    }
+  }
+  if (!accepted_1Il)
+    return;                      //Nothing to worry about
+
+  // Pass 2: no supporting alphanumeric found - reject every accepted
+  // conflict-set character.
+  for (i = 0, offset = 0; i < word_len;
+       offset += word->best_choice->unichar_lengths()[i++]) {
+    if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
+        word->reject_map[i].accepted())
+      word->reject_map[i].setrej_postNN_1Il();
+  }
+}
+
+
+// Counts the accepted alphanumeric characters in the word.
+int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
+  const WERD_CHOICE* choice = word_res->best_choice;
+  const int len = word_res->reject_map.length();
+  int16_t total = 0;
+  for (int i = 0; i < len; ++i) {
+    if (!word_res->reject_map[i].accepted())
+      continue;
+    if (word_res->uch_set->get_isalpha(choice->unichar_id(i)) ||
+        word_res->uch_set->get_isdigit(choice->unichar_id(i)))
+      ++total;
+  }
+  return total;
+}
+
+
+// Rejects the whole word when the fraction of rejected characters
+// reaches rej_whole_of_mostly_reject_word_fract.
+void Tesseract::reject_mostly_rejects(WERD_RES *word) {
+  const float rejected = word->reject_map.reject_count();
+  const float fraction = rejected / word->reject_map.length();
+  if (fraction >= rej_whole_of_mostly_reject_word_fract)
+    word->reject_map.rej_word_mostly_rej();
+}
+
+
+// Returns true for a word of more than one character that consists of a
+// single repeated unichar from ok_repeated_ch_non_alphanum_wds, where
+// every blob matched well and was accepted.
+bool Tesseract::repeated_nonalphanum_wd(WERD_RES* word, ROW* row) {
+  const WERD_CHOICE* choice = word->best_choice;
+
+  if (choice->unichar_lengths().length() <= 1)
+    return false;
+
+  if (!STRING(ok_repeated_ch_non_alphanum_wds).
+      contains(choice->unichar_string()[0]))
+    return false;
+
+  // Every character must be the same unichar as the first.
+  const UNICHAR_ID uch_id = choice->unichar_id(0);
+  for (int i = 1; i < choice->length(); ++i) {
+    if (choice->unichar_id(i) != uch_id)
+      return false;
+  }
+
+  int16_t char_quality;
+  int16_t accepted_char_quality;
+  word_char_quality(word, &char_quality, &accepted_char_quality);
+
+  // All characters must be well-formed and all of those accepted.
+  return choice->unichar_lengths().length() == char_quality &&
+         char_quality == accepted_char_quality;
+}
+
+// Returns the dictionary permuter type of the best choice, except that
+// document-dawg-only matches are treated as unsafe and reported as 0.
+int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
+  const int perm = werd_res->tesseract->dict_word(*werd_res->best_choice);
+  if (perm == DOC_DAWG_PERM)
+    return 0;
+  return perm;
+}
+
+// Note: After running this function word_res->ratings
+// might not contain the right BLOB_CHOICE corresponding to each character
+// in word_res->best_choice.
+//
+// Flips '.' characters whose blobs are wide enough to be hyphens into '-'
+// and adjusts the reject map accordingly.  The aspect-ratio thresholds
+// tessedit_lower_flip_hyphen (suspected) and tessedit_upper_flip_hyphen
+// (certain) drive the decisions.
+void Tesseract::flip_hyphens(WERD_RES *word_res) {
+  WERD_CHOICE *best_choice = word_res->best_choice;
+  int i;
+  int prev_right = -9999;
+  int next_left;
+  TBOX out_box;
+  float aspect_ratio;
+
+  // A threshold <= 1 disables hyphen flipping entirely.
+  if (tessedit_lower_flip_hyphen <= 1)
+    return;
+
+  int num_blobs = word_res->rebuild_word->NumBlobs();
+  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
+  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
+    TBLOB* blob = word_res->rebuild_word->blobs[i];
+    out_box = blob->bounding_box();
+    if (i + 1 == num_blobs)
+      next_left = 9999;
+    else
+      next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
+    // Don't touch small or touching blobs - it is too dangerous.
+    if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
+        (out_box.left() > prev_right) && (out_box.right() < next_left)) {
+      aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
+      if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
+        if (aspect_ratio >= tessedit_upper_flip_hyphen &&
+            word_res->uch_set->contains_unichar_id(unichar_dash) &&
+            word_res->uch_set->get_enabled(unichar_dash)) {
+          /* Certain HYPHEN */
+          best_choice->set_unichar_id(unichar_dash, i);
+          if (word_res->reject_map[i].rejected())
+            word_res->reject_map[i].setrej_hyphen_accept();
+        }
+        if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
+            word_res->reject_map[i].accepted())
+          //Suspected HYPHEN
+          word_res->reject_map[i].setrej_hyphen ();
+      }
+      else if (best_choice->unichar_id(i) == unichar_dash) {
+        if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
+            (word_res->reject_map[i].rejected()))
+          word_res->reject_map[i].setrej_hyphen_accept();
+        //Certain HYPHEN
+
+        if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
+            (word_res->reject_map[i].accepted()))
+          //Suspected HYPHEN
+          word_res->reject_map[i].setrej_hyphen();
+      }
+    }
+    prev_right = out_box.right();
+  }
+}
+
+// Note: After running this function word_res->ratings
+// might not contain the right BLOB_CHOICE corresponding to each character
+// in word_res->best_choice.
+//
+// Applies positional heuristics to flip '0' <-> 'O' so that letters appear
+// between letters and digits between digits (e.g. A0A -> AOA, 9O9 -> 909).
+// Each /* pattern */ comment below names the context being repaired.
+void Tesseract::flip_0O(WERD_RES *word_res) {
+  WERD_CHOICE *best_choice = word_res->best_choice;
+  int i;
+  TBOX out_box;
+
+  if (!tessedit_flip_0O)
+    return;
+
+  // Bail out if any upper/digit blob lies outside the normal x-height band:
+  // sub/superscripts make the positional heuristics unreliable.
+  int num_blobs = word_res->rebuild_word->NumBlobs();
+  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
+    TBLOB* blob = word_res->rebuild_word->blobs[i];
+    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
+        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
+      out_box = blob->bounding_box();
+      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
+          (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
+        return;                  //Beware words with sub/superscripts
+    }
+  }
+  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
+  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
+  if (unichar_0 == INVALID_UNICHAR_ID ||
+      !word_res->uch_set->get_enabled(unichar_0) ||
+      unichar_O == INVALID_UNICHAR_ID ||
+      !word_res->uch_set->get_enabled(unichar_O)) {
+    return;  // 0 or O are not present/enabled in unicharset
+  }
+  // Starting at i = 1 makes the i-1 accesses below always valid.
+  for (i = 1; i < best_choice->length(); ++i) {
+    if (best_choice->unichar_id(i) == unichar_0 ||
+        best_choice->unichar_id(i) == unichar_O) {
+      /* A0A */
+      if ((i+1) < best_choice->length() &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
+        best_choice->set_unichar_id(unichar_O, i);
+      }
+      /* A00A */
+      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+1) < best_choice->length() &&
+          (best_choice->unichar_id(i+1) == unichar_0 ||
+           best_choice->unichar_id(i+1) == unichar_O) &&
+          (i+2) < best_choice->length() &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
+        best_choice->set_unichar_id(unichar_O, i);
+        i++;
+      }
+      /* AA0<non digit or end of word> */
+      if ((i > 1) &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
+          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (((i+1) < best_choice->length() &&
+            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
+            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
+            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
+           (i == best_choice->length() - 1))) {
+        best_choice->set_unichar_id(unichar_O, i);
+      }
+      /* 9O9 */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+1) < best_choice->length() &&
+          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
+        best_choice->set_unichar_id(unichar_0, i);
+      }
+      /* 9OOO */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+2) < best_choice->length() &&
+          (best_choice->unichar_id(i+1) == unichar_0 ||
+           best_choice->unichar_id(i+1) == unichar_O) &&
+          (best_choice->unichar_id(i+2) == unichar_0 ||
+           best_choice->unichar_id(i+2) == unichar_O)) {
+        best_choice->set_unichar_id(unichar_0, i);
+        best_choice->set_unichar_id(unichar_0, i+1);
+        best_choice->set_unichar_id(unichar_0, i+2);
+        i += 2;
+      }
+      /* 9OO<non upper> */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+2) < best_choice->length() &&
+          (best_choice->unichar_id(i+1) == unichar_0 ||
+           best_choice->unichar_id(i+1) == unichar_O) &&
+          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
+        best_choice->set_unichar_id(unichar_0, i);
+        best_choice->set_unichar_id(unichar_0, i+1);
+        i++;
+      }
+      /* 9O<non upper> */
+      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
+          (i+1) < best_choice->length() &&
+          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
+        best_choice->set_unichar_id(unichar_0, i);
+      }
+      /* 9[.,]OOO.. */
+      if ((i > 1) &&
+          (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
+           word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
+          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
+           best_choice->unichar_id(i-2) == unichar_O)) {
+        if (best_choice->unichar_id(i-2) == unichar_O) {
+          best_choice->set_unichar_id(unichar_0, i-2);
+        }
+        // Convert the whole run of 0/O after the separator to digits.
+        while (i < best_choice->length() &&
+               (best_choice->unichar_id(i) == unichar_O ||
+                best_choice->unichar_id(i) == unichar_0)) {
+          best_choice->set_unichar_id(unichar_0, i);
+          i++;
+        }
+        i--;
+      }
+    }
+  }
+}
+
+// True for an upper-case character that is not 'O'.
+bool Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
+  if (!ch_set.get_isupper(unichar_id))
+    return false;
+  return !ch_set.eq(unichar_id, "O");
+}
+
+// True for a digit that is not '0'.
+bool Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
+  if (!ch_set.get_isdigit(unichar_id))
+    return false;
+  return !ch_set.eq(unichar_id, "0");
+}
+} // namespace tesseract
+
+#endif // def DISABLED_LEGACY_ENGINE
diff --git a/tesseract/src/ccmain/reject.h b/tesseract/src/ccmain/reject.h
new file mode 100644
index 00000000..e144813a
--- /dev/null
+++ b/tesseract/src/ccmain/reject.h
@@ -0,0 +1,39 @@
+/**********************************************************************
+ * File: reject.h
+ * Description: Rejection functions used in tessedit
+ * Author: Phil Cheatle
+ * Created: Wed Sep 23 16:50:21 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef REJECT_H
+#define REJECT_H
+
+namespace tesseract {
+
+class WERD_CHOICE;
+class WERD_RES;
+
+void reject_blanks(WERD_RES *word);
+void reject_poor_matches(WERD_RES *word);
+float compute_reject_threshold(WERD_CHOICE* word);
+bool word_contains_non_1_digit(const char* word, const char* word_lengths);
+void dont_allow_1Il(WERD_RES *word);
+void flip_hyphens(WERD_RES *word);
+void flip_0O(WERD_RES *word);
+bool non_0_digit(const char* str, int length);
+
+} // namespace tesseract
+
+#endif
diff --git a/tesseract/src/ccmain/resultiterator.cpp b/tesseract/src/ccmain/resultiterator.cpp
new file mode 100644
index 00000000..d8f537f2
--- /dev/null
+++ b/tesseract/src/ccmain/resultiterator.cpp
@@ -0,0 +1,752 @@
+///////////////////////////////////////////////////////////////////////
+// File: resultiterator.cpp
+// Description: Iterator for tesseract results that is capable of
+// iterating in proper reading order over Bi Directional
+// (e.g. mixed Hebrew and English) text.
+// Author: David Eger
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include <tesseract/resultiterator.h>
+
+#include "pageres.h"
+#include "tesseractclass.h"
+#include "unicharset.h"
+
+#include "allheaders.h"
+
+#include <set>
+#include <vector>
+
+static const char * const kLRM = "\u200E"; // Left-to-Right Mark
+static const char * const kRLM = "\u200F"; // Right-to-Left Mark
+
+namespace tesseract {
+
+ResultIterator::ResultIterator(const LTRResultIterator& resit)
+ : LTRResultIterator(resit) {
+ in_minor_direction_ = false;
+ at_beginning_of_minor_run_ = false;
+ preserve_interword_spaces_ = false;
+
+ auto* p = ParamUtils::FindParam<BoolParam>("preserve_interword_spaces",
+ GlobalParams()->bool_params,
+ tesseract_->params()->bool_params);
+ if (p != nullptr)
+ preserve_interword_spaces_ = (bool)(*p);
+
+ current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+ MoveToLogicalStartOfTextline();
+}
+
+ResultIterator* ResultIterator::StartOfParagraph(
+ const LTRResultIterator& resit) {
+ return new ResultIterator(resit);
+}
+
+bool ResultIterator::ParagraphIsLtr() const {
+ return current_paragraph_is_ltr_;
+}
+
+bool ResultIterator::CurrentParagraphIsLtr() const {
+ if (!it_->word())
+ return true; // doesn't matter.
+ LTRResultIterator it(*this);
+ it.RestartParagraph();
+ // Try to figure out the ltr-ness of the paragraph. The rules below
+ // make more sense in the context of a difficult paragraph example.
+ // Here we denote {ltr characters, RTL CHARACTERS}:
+ //
+ // "don't go in there!" DAIS EH
+ // EHT OTNI DEPMUJ FELSMIH NEHT DNA
+ // .GNIDLIUB GNINRUB
+ //
+ // On the first line, the left-most word is LTR and the rightmost word
+ // is RTL. Thus, we are better off taking the majority direction for
+ // the whole paragraph contents. So instead of "the leftmost word is LTR"
+ // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
+ // would not do: Typically an RTL paragraph would *not* start with an LTR
+ // word. So our heuristics are as follows:
+ //
+ // (1) If the first text line has an RTL word in the left-most position
+ // it is RTL.
+ // (2) If the first text line has an LTR word in the right-most position
+ // it is LTR.
+ // (3) If neither of the above is true, take the majority count for the
+ // paragraph -- if there are more rtl words, it is RTL. If there
+ // are more LTR words, it's LTR.
+ bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
+ bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
+ int num_ltr, num_rtl;
+ num_rtl = leftmost_rtl ? 1 : 0;
+ num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
+ for (it.Next(RIL_WORD);
+ !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
+ it.Next(RIL_WORD)) {
+ StrongScriptDirection dir = it.WordDirection();
+ rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
+ num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
+ num_ltr += rightmost_ltr ? 1 : 0;
+ }
+ if (leftmost_rtl)
+ return false;
+ if (rightmost_ltr)
+ return true;
+ // First line is ambiguous. Take statistics on the whole paragraph.
+ if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA))
+ do {
+ StrongScriptDirection dir = it.WordDirection();
+ num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
+ num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
+ } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
+ return num_ltr >= num_rtl;
+}
+
+const int ResultIterator::kMinorRunStart = -1;
+const int ResultIterator::kMinorRunEnd = -2;
+const int ResultIterator::kComplexWord = -3;
+
+void ResultIterator::CalculateBlobOrder(
+ std::vector<int>* blob_indices) const {
+ bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
+ blob_indices->clear();
+ if (Empty(RIL_WORD))
+ return;
+ if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
+ // Easy! just return the blobs in order;
+ for (int i = 0; i < word_length_; i++) blob_indices->push_back(i);
+ return;
+ }
+
+ // The blobs are in left-to-right order, but the current reading context
+ // is right-to-left.
+ const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
+ const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
+ const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
+ const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
+ const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
+ const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
+ const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
+
+ // Step 1: Scan for and mark European Number sequences
+ // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
+ GenericVector<int> letter_types;
+ for (int i = 0; i < word_length_; i++) {
+ letter_types.push_back(it_->word()->SymbolDirection(i));
+ }
+  // Convert a single separator sandwiched between two EN's into an EN.
+ for (int i = 0; i + 2 < word_length_; i++) {
+ if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
+ (letter_types[i + 1] == U_EURO_NUM_SEP ||
+ letter_types[i + 1] == U_COMMON_NUM_SEP)) {
+ letter_types[i + 1] = U_EURO_NUM;
+ }
+ }
+ // Scan for sequences of European Number Terminators around ENs and convert
+ // them to ENs.
+ for (int i = 0; i < word_length_; i++) {
+ if (letter_types[i] == U_EURO_NUM_TERM) {
+ int j = i + 1;
+ while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
+ j++;
+ }
+ if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
+ // The sequence [i..j] should be converted to all European Numbers.
+ for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
+ }
+ j = i - 1;
+ while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
+ j--;
+ }
+ if (j > -1 && letter_types[j] == U_EURO_NUM) {
+ // The sequence [j..i] should be converted to all European Numbers.
+ for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
+ }
+ }
+ }
+ // Step 2: Convert all remaining types to either L or R.
+ // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
+ // All other are R.
+ for (int i = 0; i < word_length_;) {
+ int ti = letter_types[i];
+ if (ti == U_LTR || ti == U_EURO_NUM) {
+ // Left to right sequence; scan to the end of it.
+ int last_good = i;
+ for (int j = i + 1; j < word_length_; j++) {
+ int tj = letter_types[j];
+ if (tj == U_LTR || tj == U_EURO_NUM) {
+ last_good = j;
+ } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
+ // do nothing.
+ } else {
+ break;
+ }
+ }
+ // [i..last_good] is the L sequence
+ for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
+ i = last_good + 1;
+ } else {
+ letter_types[i] = U_RTL;
+ i++;
+ }
+ }
+
+ // At this point, letter_types is entirely U_LTR or U_RTL.
+ for (int i = word_length_ - 1; i >= 0;) {
+ if (letter_types[i] == U_RTL) {
+ blob_indices->push_back(i);
+ i--;
+ } else {
+ // left to right sequence. scan to the beginning.
+ int j = i - 1;
+ for (; j >= 0 && letter_types[j] != U_RTL; j--) {
+ } // pass
+ // Now (j, i] is LTR
+ for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
+ i = j;
+ }
+ }
+ ASSERT_HOST(blob_indices->size() == word_length_);
+}
+
+static void PrintScriptDirs(const std::vector<StrongScriptDirection>& dirs) {
+ for (int i = 0; i < dirs.size(); i++) {
+ switch (dirs[i]) {
+ case DIR_NEUTRAL:
+ tprintf("N ");
+ break;
+ case DIR_LEFT_TO_RIGHT:
+ tprintf("L ");
+ break;
+ case DIR_RIGHT_TO_LEFT:
+ tprintf("R ");
+ break;
+ case DIR_MIX:
+ tprintf("Z ");
+ break;
+ default:
+ tprintf("? ");
+ break;
+ }
+ }
+ tprintf("\n");
+}
+
+void ResultIterator::CalculateTextlineOrder(
+ bool paragraph_is_ltr, const LTRResultIterator& resit,
+ std::vector<int>* word_indices) const {
+ std::vector<StrongScriptDirection> directions;
+ CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
+}
+
+void ResultIterator::CalculateTextlineOrder(
+ bool paragraph_is_ltr, const LTRResultIterator& resit,
+ std::vector<StrongScriptDirection>* dirs_arg,
+ std::vector<int>* word_indices) const {
+ std::vector<StrongScriptDirection> dirs;
+ std::vector<StrongScriptDirection>* directions;
+ directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
+ directions->clear();
+
+ // A LTRResultIterator goes strictly left-to-right word order.
+ LTRResultIterator ltr_it(resit);
+ ltr_it.RestartRow();
+ if (ltr_it.Empty(RIL_WORD))
+ return;
+ do {
+ directions->push_back(ltr_it.WordDirection());
+ } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
+
+ word_indices->clear();
+ CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
+}
+
+void ResultIterator::CalculateTextlineOrder(
+ bool paragraph_is_ltr,
+ const std::vector<StrongScriptDirection>& word_dirs,
+ std::vector<int>* reading_order) {
+ reading_order->clear();
+ if (word_dirs.size() == 0)
+ return;
+
+ // Take all of the runs of minor direction words and insert them
+ // in reverse order.
+ int minor_direction, major_direction, major_step, start, end;
+ if (paragraph_is_ltr) {
+ start = 0;
+ end = word_dirs.size();
+ major_step = 1;
+ major_direction = DIR_LEFT_TO_RIGHT;
+ minor_direction = DIR_RIGHT_TO_LEFT;
+ } else {
+ start = word_dirs.size() - 1;
+ end = -1;
+ major_step = -1;
+ major_direction = DIR_RIGHT_TO_LEFT;
+ minor_direction = DIR_LEFT_TO_RIGHT;
+ // Special rule: if there are neutral words at the right most side
+ // of a line adjacent to a left-to-right word in the middle of the
+ // line, we interpret the end of the line as a single LTR sequence.
+ if (word_dirs[start] == DIR_NEUTRAL) {
+ int neutral_end = start;
+ while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
+ neutral_end--;
+ }
+ if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
+ // LTR followed by neutrals.
+ // Scan for the beginning of the minor left-to-right run.
+ int left = neutral_end;
+ for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
+ if (word_dirs[i] == DIR_LEFT_TO_RIGHT)
+ left = i;
+ }
+ reading_order->push_back(kMinorRunStart);
+ for (int i = left; i < word_dirs.size(); i++) {
+ reading_order->push_back(i);
+ if (word_dirs[i] == DIR_MIX)
+ reading_order->push_back(kComplexWord);
+ }
+ reading_order->push_back(kMinorRunEnd);
+ start = left - 1;
+ }
+ }
+ }
+ for (int i = start; i != end;) {
+ if (word_dirs[i] == minor_direction) {
+ int j = i;
+ while (j != end && word_dirs[j] != major_direction) j += major_step;
+ if (j == end)
+ j -= major_step;
+ while (j != i && word_dirs[j] != minor_direction) j -= major_step;
+ // [j..i] is a minor direction run.
+ reading_order->push_back(kMinorRunStart);
+ for (int k = j; k != i; k -= major_step) {
+ reading_order->push_back(k);
+ }
+ reading_order->push_back(i);
+ reading_order->push_back(kMinorRunEnd);
+ i = j + major_step;
+ } else {
+ reading_order->push_back(i);
+ if (word_dirs[i] == DIR_MIX)
+ reading_order->push_back(kComplexWord);
+ i += major_step;
+ }
+ }
+}
+
+int ResultIterator::LTRWordIndex() const {
+ int this_word_index = 0;
+ LTRResultIterator textline(*this);
+ textline.RestartRow();
+ while (!textline.PositionedAtSameWord(it_)) {
+ this_word_index++;
+ textline.Next(RIL_WORD);
+ }
+ return this_word_index;
+}
+
+void ResultIterator::MoveToLogicalStartOfWord() {
+ if (word_length_ == 0) {
+ BeginWord(0);
+ return;
+ }
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ if (blob_order.size() == 0 || blob_order[0] == 0)
+ return;
+ BeginWord(blob_order[0]);
+}
+
+bool ResultIterator::IsAtFinalSymbolOfWord() const {
+ if (!it_->word())
+ return true;
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ return blob_order.size() == 0 || blob_order.back() == blob_index_;
+}
+
+bool ResultIterator::IsAtFirstSymbolOfWord() const {
+ if (!it_->word())
+ return true;
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ return blob_order.size() == 0 || blob_order[0] == blob_index_;
+}
+
+void ResultIterator::AppendSuffixMarks(std::string* text) const {
+ if (!it_->word())
+ return;
+ bool reading_direction_is_ltr =
+ current_paragraph_is_ltr_ ^ in_minor_direction_;
+ // scan forward to see what meta-information the word ordering algorithm
+ // left us.
+ // If this word is at the *end* of a minor run, insert the other
+ // direction's mark; else if this was a complex word, insert the
+ // current reading order's mark.
+ std::vector<int> textline_order;
+ CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order);
+ int this_word_index = LTRWordIndex();
+ size_t i = 0;
+ for (const auto word_index : textline_order) {
+ if (word_index == this_word_index) {
+ break;
+ }
+ i++;
+ }
+ if (i == textline_order.size()) {
+ return;
+ }
+
+ int last_non_word_mark = 0;
+ for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
+ last_non_word_mark = textline_order[i];
+ }
+ if (last_non_word_mark == kComplexWord) {
+ *text += reading_direction_is_ltr ? kLRM : kRLM;
+ } else if (last_non_word_mark == kMinorRunEnd) {
+ if (current_paragraph_is_ltr_) {
+ *text += kLRM;
+ } else {
+ *text += kRLM;
+ }
+ }
+}
+
+void ResultIterator::MoveToLogicalStartOfTextline() {
+ std::vector<int> word_indices;
+ RestartRow();
+ CalculateTextlineOrder(current_paragraph_is_ltr_,
+ dynamic_cast<const LTRResultIterator&>(*this),
+ &word_indices);
+ int i = 0;
+ for (; i < word_indices.size() && word_indices[i] < 0; i++) {
+ if (word_indices[i] == kMinorRunStart)
+ in_minor_direction_ = true;
+ else if (word_indices[i] == kMinorRunEnd)
+ in_minor_direction_ = false;
+ }
+ if (in_minor_direction_)
+ at_beginning_of_minor_run_ = true;
+ if (i >= word_indices.size())
+ return;
+ int first_word_index = word_indices[i];
+ for (int j = 0; j < first_word_index; j++) {
+ PageIterator::Next(RIL_WORD);
+ }
+ MoveToLogicalStartOfWord();
+}
+
+void ResultIterator::Begin() {
+ LTRResultIterator::Begin();
+ current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+ in_minor_direction_ = false;
+ at_beginning_of_minor_run_ = false;
+ MoveToLogicalStartOfTextline();
+}
+
+bool ResultIterator::Next(PageIteratorLevel level) {
+ if (it_->block() == nullptr)
+ return false; // already at end!
+ switch (level) {
+ case RIL_BLOCK: // explicit fall-through
+ case RIL_PARA: // explicit fall-through
+ case RIL_TEXTLINE:
+ if (!PageIterator::Next(level))
+ return false;
+ if (IsWithinFirstTextlineOfParagraph()) {
+ // if we've advanced to a new paragraph,
+ // recalculate current_paragraph_is_ltr_
+ current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
+ }
+ in_minor_direction_ = false;
+ MoveToLogicalStartOfTextline();
+ return it_->block() != nullptr;
+ case RIL_SYMBOL: {
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ int next_blob = 0;
+ while (next_blob < blob_order.size() &&
+ blob_index_ != blob_order[next_blob])
+ next_blob++;
+ next_blob++;
+ if (next_blob < blob_order.size()) {
+ // we're in the same word; simply advance one blob.
+ BeginWord(blob_order[next_blob]);
+ at_beginning_of_minor_run_ = false;
+ return true;
+ }
+ level = RIL_WORD; // we've fallen through to the next word.
+ }
+ // Fall through.
+ case RIL_WORD: // explicit fall-through.
+ {
+ if (it_->word() == nullptr)
+ return Next(RIL_BLOCK);
+ std::vector<int> word_indices;
+ int this_word_index = LTRWordIndex();
+ CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
+ int final_real_index = word_indices.size() - 1;
+ while (final_real_index > 0 && word_indices[final_real_index] < 0)
+ final_real_index--;
+ for (int i = 0; i < final_real_index; i++) {
+ if (word_indices[i] == this_word_index) {
+ int j = i + 1;
+ for (; j < final_real_index && word_indices[j] < 0; j++) {
+ if (word_indices[j] == kMinorRunStart)
+ in_minor_direction_ = true;
+ if (word_indices[j] == kMinorRunEnd)
+ in_minor_direction_ = false;
+ }
+ at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
+ // awesome, we move to word_indices[j]
+ if (BidiDebug(3)) {
+ tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index,
+ word_indices[j]);
+ }
+ PageIterator::RestartRow();
+ for (int k = 0; k < word_indices[j]; k++) {
+ PageIterator::Next(RIL_WORD);
+ }
+ MoveToLogicalStartOfWord();
+ return true;
+ }
+ }
+ if (BidiDebug(3)) {
+ tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
+ }
+ // we're going off the end of the text line.
+ return Next(RIL_TEXTLINE);
+ }
+ }
+ ASSERT_HOST(false); // shouldn't happen.
+ return false;
+}
+
+bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
+ if (it_->block() == nullptr)
+ return false; // Already at the end!
+ if (it_->word() == nullptr)
+ return true; // In an image block.
+ if (level == RIL_SYMBOL)
+ return true; // Always at beginning of a symbol.
+
+ bool at_word_start = IsAtFirstSymbolOfWord();
+ if (level == RIL_WORD)
+ return at_word_start;
+
+ ResultIterator line_start(*this);
+ // move to the first word in the line...
+ line_start.MoveToLogicalStartOfTextline();
+
+ bool at_textline_start = at_word_start && *line_start.it_ == *it_;
+ if (level == RIL_TEXTLINE)
+ return at_textline_start;
+
+ // now we move to the left-most word...
+ line_start.RestartRow();
+ bool at_block_start = at_textline_start &&
+ line_start.it_->block() != line_start.it_->prev_block();
+ if (level == RIL_BLOCK)
+ return at_block_start;
+
+ bool at_para_start =
+ at_block_start ||
+ (at_textline_start && line_start.it_->row()->row->para() !=
+ line_start.it_->prev_row()->row->para());
+ if (level == RIL_PARA)
+ return at_para_start;
+
+ ASSERT_HOST(false); // shouldn't happen.
+ return false;
+}
+
+/**
+ * NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the
+ * change that the variable next is now a ResultIterator instead of a
+ * PageIterator.
+ */
+bool ResultIterator::IsAtFinalElement(PageIteratorLevel level,
+ PageIteratorLevel element) const {
+ if (Empty(element))
+ return true; // Already at the end!
+ // The result is true if we step forward by element and find we are
+  // at the end of the page or at beginning of *all* levels in:
+ // [level, element).
+ // When there is more than one level difference between element and level,
+ // we could for instance move forward one symbol and still be at the first
+ // word on a line, so we also have to be at the first symbol in a word.
+ ResultIterator next(*this);
+ next.Next(element);
+ if (next.Empty(element))
+ return true; // Reached the end of the page.
+ while (element > level) {
+ element = static_cast<PageIteratorLevel>(element - 1);
+ if (!next.IsAtBeginningOf(element))
+ return false;
+ }
+ return true;
+}
+
+// Returns the number of blanks before the current word.
+int ResultIterator::BlanksBeforeWord() const {
+ if (CurrentParagraphIsLtr())
+ return LTRResultIterator::BlanksBeforeWord();
+ return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
+}
+
+/**
+ * Returns the null terminated UTF-8 encoded text string for the current
+ * object at the given level. Use delete [] to free after use.
+ */
+char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+ if (it_->word() == nullptr)
+ return nullptr; // Already at the end!
+ std::string text;
+ switch (level) {
+ case RIL_BLOCK: {
+ ResultIterator pp(*this);
+ do {
+ pp.AppendUTF8ParagraphText(&text);
+ } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
+ } break;
+ case RIL_PARA:
+ AppendUTF8ParagraphText(&text);
+ break;
+ case RIL_TEXTLINE: {
+ ResultIterator it(*this);
+ it.MoveToLogicalStartOfTextline();
+ it.IterateAndAppendUTF8TextlineText(&text);
+ } break;
+ case RIL_WORD:
+ AppendUTF8WordText(&text);
+ break;
+ case RIL_SYMBOL: {
+ bool reading_direction_is_ltr =
+ current_paragraph_is_ltr_ ^ in_minor_direction_;
+ if (at_beginning_of_minor_run_) {
+ text += reading_direction_is_ltr ? kLRM : kRLM;
+ }
+ text = it_->word()->BestUTF8(blob_index_, false);
+ if (IsAtFinalSymbolOfWord())
+ AppendSuffixMarks(&text);
+ } break;
+ }
+ int length = text.length() + 1;
+ char* result = new char[length];
+ strncpy(result, text.c_str(), length);
+ return result;
+}
+std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
+ResultIterator::GetRawLSTMTimesteps() const {
+ if (it_->word() != nullptr) {
+ return &it_->word()->segmented_timesteps;
+ } else {
+ return nullptr;
+ }
+}
+
+std::vector<std::vector<std::pair<const char*, float>>>*
+ResultIterator::GetBestLSTMSymbolChoices() const {
+ if (it_->word() != nullptr) {
+ return &it_->word()->CTC_symbol_choices;
+ } else {
+ return nullptr;
+ }
+}
+
+void ResultIterator::AppendUTF8WordText(std::string* text) const {
+ if (!it_->word())
+ return;
+ ASSERT_HOST(it_->word()->best_choice != nullptr);
+ bool reading_direction_is_ltr =
+ current_paragraph_is_ltr_ ^ in_minor_direction_;
+ if (at_beginning_of_minor_run_) {
+ *text += reading_direction_is_ltr ? kLRM : kRLM;
+ }
+
+ std::vector<int> blob_order;
+ CalculateBlobOrder(&blob_order);
+ for (int i = 0; i < blob_order.size(); i++) {
+ *text += it_->word()->BestUTF8(blob_order[i], false);
+ }
+ AppendSuffixMarks(text);
+}
+
+void ResultIterator::IterateAndAppendUTF8TextlineText(std::string* text) {
+ if (Empty(RIL_WORD)) {
+ Next(RIL_WORD);
+ return;
+ }
+ if (BidiDebug(1)) {
+ std::vector<int> textline_order;
+ std::vector<StrongScriptDirection> dirs;
+ CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs,
+ &textline_order);
+ tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
+ current_paragraph_is_ltr_ ? "ltr" : "rtl");
+ PrintScriptDirs(dirs);
+ tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
+ current_paragraph_is_ltr_ ? "ltr" : "rtl");
+ for (int i = 0; i < textline_order.size(); i++) {
+ tprintf("%d ", textline_order[i]);
+ }
+ tprintf("\n");
+ }
+
+ int words_appended = 0;
+ do {
+ int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
+ : (words_appended > 0);
+ for (int i = 0; i < numSpaces; ++i) {
+ *text += " ";
+ }
+ AppendUTF8WordText(text);
+ words_appended++;
+ if (BidiDebug(2)) {
+ tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
+ }
+ } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
+ if (BidiDebug(1)) {
+ tprintf("%d words printed\n", words_appended);
+ }
+ *text += line_separator_;
+ // If we just finished a paragraph, add an extra newline.
+ if (IsAtBeginningOf(RIL_PARA)) {
+ *text += paragraph_separator_;
+ }
+}
+
+void ResultIterator::AppendUTF8ParagraphText(std::string* text) const {
+ ResultIterator it(*this);
+ it.RestartParagraph();
+ it.MoveToLogicalStartOfTextline();
+ if (it.Empty(RIL_WORD))
+ return;
+ do {
+ it.IterateAndAppendUTF8TextlineText(text);
+ } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
+}
+
+bool ResultIterator::BidiDebug(int min_level) const {
+ int debug_level = 1;
+ auto* p =
+ ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
+ tesseract_->params()->int_params);
+ if (p != nullptr)
+ debug_level = (int32_t)(*p);
+ return debug_level >= min_level;
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/superscript.cpp b/tesseract/src/ccmain/superscript.cpp
new file mode 100644
index 00000000..02d22451
--- /dev/null
+++ b/tesseract/src/ccmain/superscript.cpp
@@ -0,0 +1,610 @@
+/******************************************************************
+ * File: superscript.cpp
+ * Description: Correction pass to fix superscripts and subscripts.
+ * Author: David Eger
+ *
+ * (C) Copyright 2012, Google, Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "normalis.h"
+#include "tesseractclass.h"
+
+namespace tesseract {
+
+static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
+ int num_chopped = 0;
+ for (int i = 0; i < num_unichars; i++)
+ num_chopped += word->best_state[i];
+ return num_chopped;
+}
+
+static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
+ int num_chopped = 0;
+ for (int i = 0; i < num_unichars; i++)
+ num_chopped += word->best_state[word->best_state.size() - 1 - i];
+ return num_chopped;
+}
+
+/**
+ * Given a recognized blob, see if a contiguous collection of sub-pieces
+ * (chopped blobs) starting at its left might qualify as being a subscript
+ * or superscript letter based only on y position. Also do this for the
+ * right side.
+ */
static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index,
                           int super_y_bottom, int sub_y_top,
                           ScriptPos *leading_pos, int *num_leading_outliers,
                           ScriptPos *trailing_pos,
                           int *num_trailing_outliers) {
  // Callers may pass nullptr for outputs they don't need; redirect those to
  // local scratch so the loop below can assign unconditionally.
  ScriptPos sp_unused1, sp_unused2;
  int unused1, unused2;
  if (!leading_pos) leading_pos = &sp_unused1;
  if (!num_leading_outliers) num_leading_outliers = &unused1;
  if (!trailing_pos) trailing_pos = &sp_unused2;
  if (!num_trailing_outliers) num_trailing_outliers = &unused2;

  *num_leading_outliers = *num_trailing_outliers = 0;
  *leading_pos = *trailing_pos = SP_NORMAL;

  // Map the rebuilt blob to its span of chopped sub-pieces: best_state
  // records how many chopped pieces each rebuilt blob is made of.
  int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
  int num_chopped_pieces = word->best_state[rebuilt_blob_index];
  // Track the current run of same-direction (super or sub) outlier pieces.
  ScriptPos last_pos = SP_NORMAL;
  int trailing_outliers = 0;
  for (int i = 0; i < num_chopped_pieces; i++) {
    TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
    ScriptPos pos = SP_NORMAL;
    if (box.bottom() >= super_y_bottom) {
      pos = SP_SUPERSCRIPT;
    } else if (box.top() <= sub_y_top) {
      pos = SP_SUBSCRIPT;
    }
    if (pos == SP_NORMAL) {
      // A normally-placed piece ends any run. If the run covered every
      // piece so far, it was a leading run of outliers.
      if (trailing_outliers == i) {
        *num_leading_outliers = trailing_outliers;
        *leading_pos = last_pos;
      }
      trailing_outliers = 0;
    } else {
      // Extend the run only while the outlier direction stays the same;
      // a direction flip starts a fresh run of length 1.
      if (pos == last_pos) {
        trailing_outliers++;
      } else {
        trailing_outliers = 1;
      }
    }
    last_pos = pos;
  }
  // Whatever run is still open at the end is the trailing outlier run.
  *num_trailing_outliers = trailing_outliers;
  *trailing_pos = last_pos;
}
+
+/**
+ * Attempt to split off any high (or low) bits at the ends of the word with poor
+ * certainty and recognize them separately. If the certainty gets much better
+ * and other sanity checks pass, accept.
+ *
+ * This superscript fix is meant to be called in the second pass of recognition
+ * when we have tried once and already have a preliminary answer for word.
+ *
+ * @return Whether we modified the given word.
+ */
+bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
+ if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
+ !word->best_choice) {
+ return false;
+ }
+ int num_leading, num_trailing;
+ ScriptPos sp_leading, sp_trailing;
+ float leading_certainty, trailing_certainty;
+ float avg_certainty, unlikely_threshold;
+
+ // Calculate the number of whole suspicious characters at the edges.
+ GetSubAndSuperscriptCandidates(
+ word, &num_leading, &sp_leading, &leading_certainty,
+ &num_trailing, &sp_trailing, &trailing_certainty,
+ &avg_certainty, &unlikely_threshold);
+
+ const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
+ const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
+
+ int num_blobs = word->best_choice->length();
+
+ // Calculate the remainder (partial characters) at the edges.
+ // This accounts for us having classified the best version of
+ // a word as [speaker?'] when it was instead [speaker.^{21}]
+ // (that is we accidentally thought the 2 was attached to the period).
+ int num_remainder_leading = 0, num_remainder_trailing = 0;
+ if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
+ int super_y_bottom =
+ kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
+ int sub_y_top =
+ kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
+ int last_word_char = num_blobs - 1 - num_trailing;
+ float last_char_certainty = word->best_choice->certainty(last_word_char);
+ if (word->best_choice->unichar_id(last_word_char) != 0 &&
+ last_char_certainty <= unlikely_threshold) {
+ ScriptPos rpos;
+ YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
+ nullptr, nullptr, &rpos, &num_remainder_trailing);
+ if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
+ if (num_remainder_trailing > 0 &&
+ last_char_certainty < trailing_certainty) {
+ trailing_certainty = last_char_certainty;
+ }
+ }
+ bool another_blob_available = (num_remainder_trailing == 0) ||
+ num_leading + num_trailing + 1 < num_blobs;
+ int first_char_certainty = word->best_choice->certainty(num_leading);
+ if (another_blob_available &&
+ word->best_choice->unichar_id(num_leading) != 0 &&
+ first_char_certainty <= unlikely_threshold) {
+ ScriptPos lpos;
+ YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
+ &lpos, &num_remainder_leading, nullptr, nullptr);
+ if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
+ if (num_remainder_leading > 0 &&
+ first_char_certainty < leading_certainty) {
+ leading_certainty = first_char_certainty;
+ }
+ }
+ }
+
+ // If nothing to do, bail now.
+ if (num_leading + num_trailing +
+ num_remainder_leading + num_remainder_trailing == 0) {
+ return false;
+ }
+
+ if (superscript_debug >= 1) {
+ tprintf("Candidate for superscript detection: %s (",
+ word->best_choice->unichar_string().c_str());
+ if (num_leading || num_remainder_leading) {
+ tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
+ leading_pos);
+ }
+ if (num_trailing || num_remainder_trailing) {
+ tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
+ trailing_pos);
+ }
+ tprintf(")\n");
+ }
+ if (superscript_debug >= 3) {
+ word->best_choice->print();
+ }
+ if (superscript_debug >= 2) {
+ tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
+ avg_certainty, unlikely_threshold);
+ if (num_leading)
+ tprintf("Orig. leading (min): %.2f ", leading_certainty);
+ if (num_trailing)
+ tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
+ tprintf("\n");
+ }
+
+ // We've now calculated the number of rebuilt blobs we want to carve off.
+ // However, split_word() works from TBLOBs in chopped_word, so we need to
+ // convert to those.
+ int num_chopped_leading =
+ LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
+ int num_chopped_trailing =
+ TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
+
+ int retry_leading = 0;
+ int retry_trailing = 0;
+ bool is_good = false;
+ WERD_RES *revised = TrySuperscriptSplits(
+ num_chopped_leading, leading_certainty, sp_leading,
+ num_chopped_trailing, trailing_certainty, sp_trailing,
+ word, &is_good, &retry_leading, &retry_trailing);
+ if (is_good) {
+ word->ConsumeWordResults(revised);
+ } else if (retry_leading || retry_trailing) {
+ int retry_chopped_leading =
+ LeadingUnicharsToChopped(revised, retry_leading);
+ int retry_chopped_trailing =
+ TrailingUnicharsToChopped(revised, retry_trailing);
+ WERD_RES *revised2 = TrySuperscriptSplits(
+ retry_chopped_leading, leading_certainty, sp_leading,
+ retry_chopped_trailing, trailing_certainty, sp_trailing,
+ revised, &is_good, &retry_leading, &retry_trailing);
+ if (is_good) {
+ word->ConsumeWordResults(revised2);
+ }
+ delete revised2;
+ }
+ delete revised;
+ return is_good;
+}
+
+/**
+ * Determine how many characters (rebuilt blobs) on each end of a given word
+ * might plausibly be superscripts so SubAndSuperscriptFix can try to
+ * re-recognize them. Even if we find no whole blobs at either end,
+ * we will set *unlikely_threshold to a certainty that might be used to
+ * select "bad enough" outlier characters. If *unlikely_threshold is set to 0,
+ * though, there's really no hope.
+ *
+ * @param[in] word The word to examine.
+ * @param[out] num_rebuilt_leading the number of rebuilt blobs at the start
+ * of the word which are all up or down and
+ * seem badly classified.
+ * @param[out] leading_pos "super" or "sub" (for debugging)
+ * @param[out] leading_certainty the worst certainty in the leading blobs.
+ * @param[out] num_rebuilt_trailing the number of rebuilt blobs at the end
+ * of the word which are all up or down and
+ * seem badly classified.
+ * @param[out] trailing_pos "super" or "sub" (for debugging)
+ * @param[out] trailing_certainty the worst certainty in the trailing blobs.
+ * @param[out] avg_certainty the average certainty of "normal" blobs in
+ * the word.
+ * @param[out] unlikely_threshold the threshold (on certainty) we used to
+ * select "bad enough" outlier characters.
+ */
void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word,
                                               int *num_rebuilt_leading,
                                               ScriptPos *leading_pos,
                                               float *leading_certainty,
                                               int *num_rebuilt_trailing,
                                               ScriptPos *trailing_pos,
                                               float *trailing_certainty,
                                               float *avg_certainty,
                                               float *unlikely_threshold) {
  *avg_certainty = *unlikely_threshold = 0.0f;
  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
  *leading_certainty = *trailing_certainty = 0.0f;

  // y-thresholds (in baseline-normalized coords): a blob whose bottom is at
  // or above super_y_bottom looks super; one whose top is at or below
  // sub_y_top looks sub.
  int super_y_bottom =
      kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
  int sub_y_top =
      kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;

  // Step one: Get an average certainty for "normally placed" characters.

  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
  *leading_pos = *trailing_pos = SP_NORMAL;
  int leading_outliers = 0;
  int trailing_outliers = 0;
  int num_normal = 0;
  float normal_certainty_total = 0.0f;
  float worst_normal_certainty = 0.0f;
  // Run-tracking mirrors YOutlierPieces: trailing_outliers counts the
  // current run of same-direction outliers; a run that reaches back to
  // blob 0 is recorded as the leading run.
  ScriptPos last_pos = SP_NORMAL;
  int num_blobs = word->rebuild_word->NumBlobs();
  for (int b = 0; b < num_blobs; ++b) {
    TBOX box = word->rebuild_word->blobs[b]->bounding_box();
    ScriptPos pos = SP_NORMAL;
    if (box.bottom() >= super_y_bottom) {
      pos = SP_SUPERSCRIPT;
    } else if (box.top() <= sub_y_top) {
      pos = SP_SUBSCRIPT;
    }
    if (pos == SP_NORMAL) {
      // unichar_id 0 is skipped for certainty accounting.
      if (word->best_choice->unichar_id(b) != 0) {
        float char_certainty = word->best_choice->certainty(b);
        if (char_certainty < worst_normal_certainty) {
          worst_normal_certainty = char_certainty;
        }
        num_normal++;
        normal_certainty_total += char_certainty;
      }
      if (trailing_outliers == b) {
        leading_outliers = trailing_outliers;
        *leading_pos = last_pos;
      }
      trailing_outliers = 0;
    } else {
      if (last_pos == pos) {
        trailing_outliers++;
      } else {
        trailing_outliers = 1;
      }
    }
    last_pos = pos;
  }
  *trailing_pos = last_pos;
  if (num_normal >= 3) {  // throw out the worst as an outlier.
    num_normal--;
    normal_certainty_total -= worst_normal_certainty;
  }
  if (num_normal > 0) {
    *avg_certainty = normal_certainty_total / num_normal;
    // Certainties are negative, so this threshold is below (worse than)
    // the average by the superscript_worse_certainty factor.
    *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
  }
  if (num_normal == 0 ||
      (leading_outliers == 0 && trailing_outliers == 0)) {
    return;
  }

  // Step two: Try to split off bits of the word that are both outliers
  //           and have much lower certainty than average
  // Calculate num_leading and leading_certainty.
  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
       *num_rebuilt_leading < leading_outliers;
       (*num_rebuilt_leading)++) {
    float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
    if (char_certainty > *unlikely_threshold) {
      break;
    }
    // Track the worst (minimum) certainty over the accepted leading run.
    if (char_certainty < *leading_certainty) {
      *leading_certainty = char_certainty;
    }
  }

  // Calculate num_trailing and trailing_certainty.
  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
       *num_rebuilt_trailing < trailing_outliers;
       (*num_rebuilt_trailing)++) {
    int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
    float char_certainty = word->best_choice->certainty(blob_idx);
    if (char_certainty > *unlikely_threshold) {
      break;
    }
    if (char_certainty < *trailing_certainty) {
      *trailing_certainty = char_certainty;
    }
  }
}
+
+
+/**
+ * Try splitting off the given number of (chopped) blobs from the front and
+ * back of the given word and recognizing the pieces.
+ *
+ * @param[in] num_chopped_leading how many chopped blobs from the left
+ * end of the word to chop off and try recognizing as a
+ * superscript (or subscript)
+ * @param[in] leading_certainty the (minimum) certainty had by the
+ * characters in the original leading section.
+ * @param[in] leading_pos "super" or "sub" (for debugging)
+ * @param[in] num_chopped_trailing how many chopped blobs from the right
+ * end of the word to chop off and try recognizing as a
+ * superscript (or subscript)
+ * @param[in] trailing_certainty the (minimum) certainty had by the
+ * characters in the original trailing section.
+ * @param[in] trailing_pos "super" or "sub" (for debugging)
+ * @param[in] word the word to try to chop up.
+ * @param[out] is_good do we believe our result?
+ * @param[out] retry_rebuild_leading, retry_rebuild_trailing
+ * If non-zero, and !is_good, then the caller may have luck trying
+ * to split the returned word with this number of (rebuilt) leading
+ * and trailing blobs / unichars.
+ * @return A word which is the result of re-recognizing as asked.
+ */
WERD_RES *Tesseract::TrySuperscriptSplits(
    int num_chopped_leading, float leading_certainty, ScriptPos leading_pos,
    int num_chopped_trailing, float trailing_certainty,
    ScriptPos trailing_pos,
    WERD_RES *word,
    bool *is_good,
    int *retry_rebuild_leading, int *retry_rebuild_trailing) {
  int num_chopped = word->chopped_word->NumBlobs();

  *retry_rebuild_leading = *retry_rebuild_trailing = 0;

  // Chop apart the word into up to three pieces.
  // bb0/bb1 hold blamer bundles saved by split_word() so join_words() can
  // restore them when the pieces are reassembled below.

  BlamerBundle *bb0 = nullptr;
  BlamerBundle *bb1 = nullptr;
  WERD_RES *prefix = nullptr;
  WERD_RES *core = nullptr;
  WERD_RES *suffix = nullptr;
  if (num_chopped_leading > 0) {
    prefix = new WERD_RES(*word);
    split_word(prefix, num_chopped_leading, &core, &bb0);
  } else {
    core = new WERD_RES(*word);
  }

  if (num_chopped_trailing > 0) {
    int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
    split_word(core, split_pt, &suffix, &bb1);
  }

  // Recognize the pieces in turn.
  // Save the pruner/matcher multipliers so the y-position penalties can be
  // disabled around the prefix/suffix recognitions and restored afterwards.
  int saved_cp_multiplier = classify_class_pruner_multiplier;
  int saved_im_multiplier = classify_integer_matcher_multiplier;
  if (prefix) {
    // Turn off Tesseract's y-position penalties for the leading superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    // Adjust our expectations about the baseline for this prefix.
    if (superscript_debug >= 3) {
      tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
    }
    recog_word_recursive(prefix);
    if (superscript_debug >= 2) {
      tprintf(" The leading bits look like %s %s\n",
              ScriptPosToString(leading_pos),
              prefix->best_choice->unichar_string().c_str());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  if (superscript_debug >= 3) {
    tprintf(" recognizing middle %d chopped blobs\n",
            num_chopped - num_chopped_leading - num_chopped_trailing);
  }

  if (suffix) {
    // Turn off Tesseract's y-position penalties for the trailing superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    if (superscript_debug >= 3) {
      tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
    }
    recog_word_recursive(suffix);
    if (superscript_debug >= 2) {
      tprintf(" The trailing bits look like %s %s\n",
              ScriptPosToString(trailing_pos),
              suffix->best_choice->unichar_string().c_str());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  // Evaluate whether we think the results are believably better
  // than what we already had.
  bool good_prefix = !prefix || BelievableSuperscript(
      superscript_debug >= 1, *prefix,
      superscript_bettered_certainty * leading_certainty,
      retry_rebuild_leading, nullptr);
  bool good_suffix = !suffix || BelievableSuperscript(
      superscript_debug >= 1, *suffix,
      superscript_bettered_certainty * trailing_certainty,
      nullptr, retry_rebuild_trailing);

  *is_good = good_prefix && good_suffix;
  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
    // None of it is any good. Quit now.
    // NOTE(review): bb0 is not freed on this path (only bb1 is) — looks
    // like a possible leak when a prefix split was made; confirm against
    // split_word()'s ownership contract before changing.
    delete core;
    delete prefix;
    delete suffix;
    delete bb1;
    return nullptr;
  }
  // The core is only recognized once the edge pieces look worth keeping,
  // so a rejected split skips this (relatively expensive) call.
  recog_word_recursive(core);

  // Now paste the results together into core.
  if (suffix) {
    suffix->SetAllScriptPositions(trailing_pos);
    join_words(core, suffix, bb1);
  }
  if (prefix) {
    prefix->SetAllScriptPositions(leading_pos);
    join_words(prefix, core, bb0);
    core = prefix;
    prefix = nullptr;
  }

  if (superscript_debug >= 1) {
    tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
            core->best_choice->unichar_string().c_str());
  }
  return core;
}
+
+
+/**
+ * Return whether this is believable superscript or subscript text.
+ *
+ * We insist that:
+ * + there are no punctuation marks.
+ * + there are no italics.
+ * + no normal-sized character is smaller than superscript_scaledown_ratio
+ * of what it ought to be, and
+ * + each character is at least as certain as certainty_threshold.
+ *
+ * @param[in] debug If true, spew debug output
+ * @param[in] word The word whose best_choice we're evaluating
+ * @param[in] certainty_threshold If any of the characters have less
+ * certainty than this, reject.
+ * @param[out] left_ok How many left-side characters were ok?
+ * @param[out] right_ok How many right-side characters were ok?
+ * @return Whether the complete best choice is believable as a superscript.
+ */
bool Tesseract::BelievableSuperscript(bool debug,
                                      const WERD_RES &word,
                                      float certainty_threshold,
                                      int *left_ok,
                                      int *right_ok) const {
  // Run accounting: ok_run_count is the current run of acceptable chars;
  // a run reaching back to char 0 is remembered as initial_ok_run_count.
  int initial_ok_run_count = 0;
  int ok_run_count = 0;
  float worst_certainty = 0.0f;
  const WERD_CHOICE &wc = *word.best_choice;

  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
  for (int i = 0; i < wc.length(); i++) {
    TBLOB *blob = word.rebuild_word->blobs[i];
    UNICHAR_ID unichar_id = wc.unichar_id(i);
    float char_certainty = wc.certainty(i);
    bool bad_certainty = char_certainty < certainty_threshold;
    bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
    // Word-level font info is the fallback for the italic test.
    bool is_italic = word.fontinfo && word.fontinfo->is_italic();
    BLOB_CHOICE *choice = word.GetBlobChoice(i);
    if (choice && fontinfo_table.size() > 0) {
      // Get better information from the specific choice, if available.
      // Italic only if the primary font is italic and the secondary font
      // (when present) agrees.
      int font_id1 = choice->fontinfo_id();
      bool font1_is_italic = font_id1 >= 0
          ? fontinfo_table.get(font_id1).is_italic() : false;
      int font_id2 = choice->fontinfo_id2();
      is_italic = font1_is_italic &&
          (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
    }

    float height_fraction = 1.0f;
    float char_height = blob->bounding_box().height();
    float normal_height = char_height;
    if (wc.unicharset()->top_bottom_useful()) {
      // Expected height is the mean of the unichar's min and max extents
      // from the unicharset's top/bottom statistics.
      int min_bot, max_bot, min_top, max_top;
      wc.unicharset()->get_top_bottom(unichar_id,
                                      &min_bot, &max_bot,
                                      &min_top, &max_top);
      float hi_height = max_top - max_bot;
      float lo_height = min_top - min_bot;
      normal_height = (hi_height + lo_height) / 2;
      if (normal_height >= kBlnXHeight) {
        // Only ding characters that we have decent information for because
        // they're supposed to be normal sized, not tiny specks or dashes.
        height_fraction = char_height / normal_height;
      }
    }
    bool bad_height = height_fraction < superscript_scaledown_ratio;

    if (debug) {
      if (is_italic) {
        tprintf(" Rejecting: superscript is italic.\n");
      }
      if (is_punc) {
        tprintf(" Rejecting: punctuation present.\n");
      }
      const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
      if (bad_certainty) {
        tprintf(" Rejecting: don't believe character %s with certainty %.2f "
                "which is less than threshold %.2f\n", char_str,
                char_certainty, certainty_threshold);
      }
      if (bad_height) {
        tprintf(" Rejecting: character %s seems too small @ %.2f versus "
                "expected %.2f\n", char_str, char_height, normal_height);
      }
    }
    if (bad_certainty || bad_height || is_punc || is_italic) {
      // A bad char ends the run; if it covered every char so far, it was
      // the initial (left-side) run.
      if (ok_run_count == i) {
        initial_ok_run_count = ok_run_count;
      }
      ok_run_count = 0;
    } else {
      ok_run_count++;
    }
    if (char_certainty < worst_certainty) {
      worst_certainty = char_certainty;
    }
  }
  // All chars were acceptable only if the final run spans the whole word.
  bool all_ok = ok_run_count == wc.length();
  if (all_ok && debug) {
    tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
  }
  if (!all_ok) {
    if (left_ok) *left_ok = initial_ok_run_count;
    if (right_ok) *right_ok = ok_run_count;
  }
  return all_ok;
}
+
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/tessbox.cpp b/tesseract/src/ccmain/tessbox.cpp
new file mode 100644
index 00000000..80c5a9ad
--- /dev/null
+++ b/tesseract/src/ccmain/tessbox.cpp
@@ -0,0 +1,75 @@
+/**********************************************************************
+ * File: tessbox.cpp (Formerly tessbox.c)
+ * Description: Black boxed Tess for developing a resaljet.
+ * Author: Ray Smith
+ * Created: Thu Apr 23 11:03:36 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "mfoutline.h"
+#include "tesseractclass.h"
+
+/**
+ * @name tess_segment_pass_n
+ *
+ * Segment a word using the pass_n conditions of the tess segmenter.
+ * @param pass_n pass number
+ * @param word word to do
+ */
+
+namespace tesseract {
+void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
+ int saved_enable_assoc = 0;
+ int saved_chop_enable = 0;
+
+ if (word->word->flag(W_DONT_CHOP)) {
+ saved_enable_assoc = wordrec_enable_assoc;
+ saved_chop_enable = chop_enable;
+ wordrec_enable_assoc.set_value(0);
+ chop_enable.set_value(0);
+ }
+ if (pass_n == 1)
+ set_pass1();
+ else
+ set_pass2();
+ recog_word(word);
+ if (word->best_choice == nullptr)
+ word->SetupFake(*word->uch_set);
+ if (word->word->flag(W_DONT_CHOP)) {
+ wordrec_enable_assoc.set_value(saved_enable_assoc);
+ chop_enable.set_value(saved_chop_enable);
+ }
+}
+
/**
 * @name tess_acceptable_word
 *
 * @return true if the word is regarded as "good enough".
 * @param word the recognition result to judge; its best choice is tested
 *             against the dictionary's acceptability criteria.
 */
bool Tesseract::tess_acceptable_word(WERD_RES* word) {
  return getDict().AcceptableResult(word);
}
+
+
/**
 * @name tess_add_doc_word
 *
 * Add the given word to the document dictionary
 * @param word_choice the accepted word choice to remember for this document
 */
void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
  getDict().add_document_word(*word_choice);
}
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/tessedit.cpp b/tesseract/src/ccmain/tessedit.cpp
new file mode 100644
index 00000000..15b433f1
--- /dev/null
+++ b/tesseract/src/ccmain/tessedit.cpp
@@ -0,0 +1,474 @@
+/**********************************************************************
+ * File: tessedit.cpp (Formerly tessedit.c)
+ * Description: (Previously) Main program for merge of tess and editor.
+ * Now just code to load the language model and various
+ * engine-specific data files.
+ * Author: Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+# include "config_auto.h"
+#endif
+
+#include "control.h"
+# include "matchdefs.h"
+#include "pageres.h"
+#include "params.h"
+#include "stopper.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#include "tprintf.h"
+#ifndef DISABLED_LEGACY_ENGINE
+# include "chop.h"
+# include "intmatcher.h"
+# include "reject.h"
+#endif
+#include "lstmrecognizer.h"
+
+namespace tesseract {
+
+// Read a "config" file containing a set of variable, value pairs.
+// Searches the standard places: tessdata/configs, tessdata/tessconfigs
+// and also accepts a relative or absolute path name.
+void Tesseract::read_config_file(const char* filename,
+ SetParamConstraint constraint) {
+ std::string path = datadir;
+ path += "configs/";
+ path += filename;
+ FILE* fp;
+ if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
+ fclose(fp);
+ } else {
+ path = datadir;
+ path += "tessconfigs/";
+ path += filename;
+ if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
+ fclose(fp);
+ } else {
+ path = filename;
+ }
+ }
+ ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
+}
+
+// Returns false if a unicharset file for the specified language was not found
+// or was invalid.
+// This function initializes TessdataManager. After TessdataManager is
+// no longer needed, TessdataManager::End() should be called.
+//
+// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
+// it is OEM_DEFAULT, in which case the value of the variable will be obtained
+// from the language-specific config file (stored in [lang].traineddata), from
+// the config files specified on the command line or left as the default
+// OEM_TESSERACT_ONLY if none of the configs specify this variable.
bool Tesseract::init_tesseract_lang_data(
    const char* arg0, const char* textbase, const char* language,
    OcrEngineMode oem, char** configs, int configs_size,
    const std::vector<std::string>* vars_vec,
    const std::vector<std::string>* vars_values, bool set_only_non_debug_params,
    TessdataManager* mgr) {
  // Set the basename, compute the data directory.
  main_setup(arg0, textbase);

  // Set the language data path prefix; default to English when no language
  // was given.
  lang = language != nullptr ? language : "eng";
  language_data_path_prefix = datadir;
  language_data_path_prefix += lang;
  language_data_path_prefix += ".";

  // Initialize TessdataManager.
  std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
    tprintf("Error opening data file %s\n", tessdata_path.c_str());
    tprintf(
        "Please make sure the TESSDATA_PREFIX environment variable is set"
        " to your \"tessdata\" directory.\n");
    return false;
  }
#ifdef DISABLED_LEGACY_ENGINE
  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
#else
  if (oem == OEM_DEFAULT) {
    // Set the engine mode from availability, which can then be overridden by
    // the config file when we read it below.
    if (!mgr->IsLSTMAvailable()) {
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
    } else if (!mgr->IsBaseAvailable()) {
      tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
    } else {
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  // If a language specific config file (lang.config) exists, load it in.
  TFile fp;
  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
    ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
                                 this->params());
  }

  SetParamConstraint set_params_constraint =
      set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
                                : SET_PARAM_CONSTRAINT_NONE;
  // Load tesseract variables from config files. This is done after loading
  // language-specific variables from [lang].traineddata file, so that custom
  // config files can override values in [lang].traineddata file.
  for (int i = 0; i < configs_size; ++i) {
    read_config_file(configs[i], set_params_constraint);
  }

  // Set params specified in vars_vec (done after setting params from config
  // files, so that params in vars_vec can override those from files).
  if (vars_vec != nullptr && vars_values != nullptr) {
    for (int i = 0; i < vars_vec->size(); ++i) {
      if (!ParamUtils::SetParam((*vars_vec)[i].c_str(),
                                (*vars_values)[i].c_str(),
                                set_params_constraint, this->params())) {
        tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
      }
    }
  }

  // Optionally dump the final parameter values for debugging/reproduction.
  if (!tessedit_write_params_to_file.empty()) {
    FILE* params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
    if (params_file != nullptr) {
      ParamUtils::PrintParams(params_file, this->params());
      fclose(params_file);
    } else {
      tprintf("Failed to open %s for writing params.\n",
              tessedit_write_params_to_file.c_str());
    }
  }

#ifndef DISABLED_LEGACY_ENGINE
  // Determine which ocr engine(s) should be loaded and used for recognition.
  // An explicit oem always wins over whatever the config files chose.
  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
#endif

  // If we are only loading the config file (and so not planning on doing any
  // recognition) then there's nothing else do here.
  if (tessedit_init_config_only) {
    return true;
  }

// The various OcrEngineMode settings (see tesseract/publictypes.h) determine which
// engine-specific data files need to be loaded.
// If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
#ifdef DISABLED_LEGACY_ENGINE
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif // ndef DISABLED_LEGACY_ENGINE
    if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
      lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
      // Load() failure on a present component is fatal by design.
      ASSERT_HOST(lstm_recognizer_->Load(
          this->params(), lstm_use_matrix ? language : nullptr, mgr));
    } else {
      // Fall back to the legacy engine when the LSTM component is missing.
      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
    }
  }

  // Load the unicharset
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
    // Avoid requiring a unicharset when we aren't running base tesseract.
    unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
  }
#ifndef DISABLED_LEGACY_ENGINE
  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
           !unicharset.load_from_file(&fp, false)) {
    tprintf("Error: Tesseract (legacy) engine requested, but components are "
            "not present in %s!!\n", tessdata_path.c_str());
    return false;
  }
#endif // ndef DISABLED_LEGACY_ENGINE
  if (unicharset.size() > MAX_NUM_CLASSES) {
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
    return false;
  }
  right_to_left_ = unicharset.major_right_to_left();

#ifndef DISABLED_LEGACY_ENGINE

  // Setup initial unichar ambigs table and read universal ambigs.
  // encoder_unicharset is copied before the ambig loaders are given a
  // mutable pointer to unicharset (they receive &unicharset below).
  UNICHARSET encoder_unicharset;
  encoder_unicharset.CopyFrom(unicharset);
  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);

  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
    unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
                                     ambigs_debug_level,
                                     use_ambigs_for_adaption, &unicharset);
  }

  // Init ParamsModel.
  // Load pass1 and pass2 weights (for now these two sets are the same, but in
  // the future separate sets of weights can be generated).
  for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES;
       ++p) {
    language_model_->getParamsModel().SetPass(
        static_cast<ParamsModel::PassEnum>(p));
    if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
      if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
        return false;
      }
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  return true;
}
+
// Helper returns true if the given string is in the vector of strings.
static bool IsStrInList(const std::string& str,
                        const std::vector<std::string>& str_list) {
  // Range-for avoids the signed/unsigned mismatch of comparing an int
  // index against the unsigned value returned by size().
  for (const auto& s : str_list) {
    if (s == str) return true;
  }
  return false;
}
+
+// Parse a string of the form [~]<lang>[+[~]<lang>]*.
+// Langs with no prefix get appended to to_load, provided they
+// are not in there already.
+// Langs with ~ prefix get appended to not_to_load, provided they are not in
+// there already.
+void Tesseract::ParseLanguageString(const char* lang_str,
+ std::vector<std::string>* to_load,
+ std::vector<std::string>* not_to_load) {
+ std::string remains(lang_str);
+ while (!remains.empty()) {
+ // Find the start of the lang code and which vector to add to.
+ const char* start = remains.c_str();
+ while (*start == '+') ++start;
+ std::vector<std::string>* target = to_load;
+ if (*start == '~') {
+ target = not_to_load;
+ ++start;
+ }
+ // Find the index of the end of the lang code in string start.
+ int end = strlen(start);
+ const char* plus = strchr(start, '+');
+ if (plus != nullptr && plus - start < end) end = plus - start;
+ std::string lang_code(start);
+ lang_code.resize(end);
+ std::string next(start + end);
+ remains = next;
+ // Check whether lang_code is already in the target vector and add.
+ if (!IsStrInList(lang_code, *target)) {
+ target->push_back(lang_code);
+ }
+ }
+}
+
// Initialize for potentially a set of languages defined by the language
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
// Returns 0 on success, -1 if no language at all could be loaded.
int Tesseract::init_tesseract(const char* arg0, const char* textbase,
                              const char* language, OcrEngineMode oem,
                              char** configs, int configs_size,
                              const std::vector<std::string>* vars_vec,
                              const std::vector<std::string>* vars_values,
                              bool set_only_non_debug_params,
                              TessdataManager* mgr) {
  std::vector<std::string> langs_to_load;
  std::vector<std::string> langs_not_to_load;
  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);

  // Discard sub-language instances from any previous initialization;
  // sub_langs_ owns its elements.
  for (auto* lang : sub_langs_) {
    delete lang;
  }
  sub_langs_.clear();
  // Find the first loadable lang and load into this.
  // Add any languages that this language requires
  bool loaded_primary = false;
  // Load the rest into sub_langs_.
  // NOTE: langs_to_load can grow inside this loop (ParseLanguageString below
  // appends each language's required sub-languages), so an index-based loop
  // that re-reads size() every iteration is required here.
  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
    if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
      const char* lang_str = langs_to_load[lang_index].c_str();
      // The first language that loads successfully initializes *this;
      // every later one is loaded into a freshly allocated sub-Tesseract.
      Tesseract* tess_to_init;
      if (!loaded_primary) {
        tess_to_init = this;
      } else {
        tess_to_init = new Tesseract;
      }

      int result = tess_to_init->init_tesseract_internal(
          arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
          vars_values, set_only_non_debug_params, mgr);
      // Forget that language, but keep any reader we were given.
      mgr->Clear();

      if (!loaded_primary) {
        if (result < 0) {
          tprintf("Failed loading language '%s'\n", lang_str);
        } else {
          // Queue any sub-languages requested by this language's config.
          ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(),
                              &langs_to_load, &langs_not_to_load);
          loaded_primary = true;
        }
      } else {
        if (result < 0) {
          tprintf("Failed loading language '%s'\n", lang_str);
          // Failed sub-language: free the instance we allocated above.
          delete tess_to_init;
        } else {
          // sub_langs_ takes ownership of the new instance.
          sub_langs_.push_back(tess_to_init);
          // Add any languages that this language requires
          ParseLanguageString(tess_to_init->tessedit_load_sublangs.c_str(),
                              &langs_to_load, &langs_not_to_load);
        }
      }
    }
  }
  if (!loaded_primary) {
    tprintf("Tesseract couldn't load any languages!\n");
    return -1;  // Couldn't load any language!
  }
#ifndef DISABLED_LEGACY_ENGINE
  if (!sub_langs_.empty()) {
    // In multilingual mode word ratings have to be directly comparable,
    // so use the same language model weights for all languages:
    // use the primary language's params model if
    // tessedit_use_primary_params_model is set,
    // otherwise use default language model weights.
    if (tessedit_use_primary_params_model) {
      for (int s = 0; s < sub_langs_.size(); ++s) {
        sub_langs_[s]->language_model_->getParamsModel().Copy(
            this->language_model_->getParamsModel());
      }
      tprintf("Using params model of the primary language\n");
    } else {
      this->language_model_->getParamsModel().Clear();
      for (int s = 0; s < sub_langs_.size(); ++s) {
        sub_langs_[s]->language_model_->getParamsModel().Clear();
      }
    }
  }

  SetupUniversalFontIds();
#endif  // ndef DISABLED_LEGACY_ENGINE
  return 0;
}
+
+// Common initialization for a single language.
+// arg0 is the datapath for the tessdata directory, which could be the
+// path of the tessdata directory with no trailing /, or (if tessdata
+// lives in the same directory as the executable, the path of the executable,
+// hence the name arg0.
+// textbase is an optional output file basename (used only for training)
+// language is the language code to load.
+// oem controls which engine(s) will operate on the image
+// configs (argv) is an array of config filenames to load variables from.
+// May be nullptr.
+// configs_size (argc) is the number of elements in configs.
+// vars_vec is an optional vector of variables to set.
+// vars_values is an optional corresponding vector of values for the variables
+// in vars_vec.
+// If set_only_init_params is true, then only the initialization variables
+// will be set.
+int Tesseract::init_tesseract_internal(const char* arg0, const char* textbase,
+ const char* language, OcrEngineMode oem,
+ char** configs, int configs_size,
+ const std::vector<std::string>* vars_vec,
+ const std::vector<std::string>* vars_values,
+ bool set_only_non_debug_params,
+ TessdataManager* mgr) {
+ if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
+ configs_size, vars_vec, vars_values,
+ set_only_non_debug_params, mgr)) {
+ return -1;
+ }
+ if (tessedit_init_config_only) {
+ return 0;
+ }
+ // If only LSTM will be used, skip loading Tesseract classifier's
+ // pre-trained templates and dictionary.
+ bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
+ program_editup(textbase, init_tesseract ? mgr : nullptr,
+ init_tesseract ? mgr : nullptr);
+ return 0; // Normal exit
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Helper builds the all_fonts table by adding new fonts from new_fonts.
+static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
+ UnicityTable<FontInfo>* all_fonts) {
+ for (int i = 0; i < new_fonts.size(); ++i) {
+ // UnicityTable uniques as we go.
+ all_fonts->push_back(new_fonts.get(i));
+ }
+}
+
+// Helper assigns an id to lang_fonts using the index in all_fonts table.
+static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
+ UnicityTable<FontInfo>* lang_fonts) {
+ for (int i = 0; i < lang_fonts->size(); ++i) {
+ int index = all_fonts.get_id(lang_fonts->get(i));
+ lang_fonts->get_mutable(i)->universal_id = index;
+ }
+}
+
+// Set the universal_id member of each font to be unique among all
+// instances of the same font loaded.
+void Tesseract::SetupUniversalFontIds() {
+ // Note that we can get away with bitwise copying FontInfo in
+ // all_fonts, as it is a temporary structure and we avoid setting the
+ // delete callback.
+ UnicityTable<FontInfo> all_fonts;
+
+ // Create the universal ID table.
+ CollectFonts(get_fontinfo_table(), &all_fonts);
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
+ }
+ // Assign ids from the table to each font table.
+ AssignIds(all_fonts, &get_fontinfo_table());
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
+ }
+ font_table_size_ = all_fonts.size();
+}
+
+// init the LM component
+int Tesseract::init_tesseract_lm(const char* arg0, const char* textbase,
+ const char* language, TessdataManager* mgr) {
+ if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
+ nullptr, 0, nullptr, nullptr, false, mgr))
+ return -1;
+ getDict().SetupForLoad(Dict::GlobalDawgCache());
+ getDict().Load(lang, mgr);
+ getDict().FinishLoad();
+ return 0;
+}
+
+#endif // ndef DISABLED_LEGACY_ENGINE
+
+void Tesseract::end_tesseract() { end_recog(); }
+
/* Define command type identifiers */

// Command identifiers for menu/command dispatch.
// NOTE(review): presumably consumed by the interactive editor (pgedit)
// command loop — confirm against the consumer of these values.
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT,
  RECOG_WERDS,
  RECOG_PSEUDO,
  ACTION_2_CMD_EVENT
};
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/tesseractclass.cpp b/tesseract/src/ccmain/tesseractclass.cpp
new file mode 100644
index 00000000..fdd88c52
--- /dev/null
+++ b/tesseract/src/ccmain/tesseractclass.cpp
@@ -0,0 +1,707 @@
+///////////////////////////////////////////////////////////////////////
+// File: tesseractclass.cpp
+// Description: The Tesseract class. It holds/owns everything needed
+// to run Tesseract on a single language, and also a set of
+// sub-Tesseracts to run sub-languages. For thread safety, *every*
+// variable that was previously global or static (except for
+// constant data, and some visual debugging flags) has been moved
+// in here, directly, or indirectly.
+// This makes it safe to run multiple Tesseracts in different
+// threads in parallel, and keeps the different language
+// instances separate.
+// Some global functions remain, but they are isolated re-entrant
+// functions that operate on their arguments. Functions that work
+// on variable data have been moved to an appropriate class based
+// mostly on the directory hierarchy. For more information see
+// slide 6 of "2ArchitectureAndDataStructures" in
+// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
+// Some global data and related functions still exist in the
+// training-related code, but they don't interfere with normal
+// recognition operation.
+// Author: Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#include "tesseractclass.h"
+
+#include "allheaders.h"
+#include "edgblob.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#include "equationdetect.h"
+#endif
+#include "lstmrecognizer.h"
+
+namespace tesseract {
+
+Tesseract::Tesseract()
+ : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
+ "Take segmentation and labeling from box file",
+ this->params()),
+ BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
+ "Conversion of word/line box file to char box file",
+ this->params()),
+ BOOL_MEMBER(tessedit_train_from_boxes, false,
+ "Generate training data from boxed chars", this->params()),
+ BOOL_MEMBER(tessedit_make_boxes_from_boxes, false,
+ "Generate more boxes from boxed chars", this->params()),
+ BOOL_MEMBER(tessedit_train_line_recognizer, false,
+ "Break input into lines and remap boxes if present",
+ this->params()),
+ BOOL_MEMBER(tessedit_dump_pageseg_images, false,
+ "Dump intermediate images made during page segmentation",
+ this->params()),
+ BOOL_MEMBER(tessedit_do_invert, true,
+ "Try inverting the image in `LSTMRecognizeWord`", this->params()),
+ // The default for pageseg_mode is the old behaviour, so as not to
+ // upset anything that relies on that.
+ INT_MEMBER(
+ tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
+ "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, 4=column,"
+ " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
+ "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
+ " (Values from PageSegMode enum in tesseract/publictypes.h)",
+ this->params()),
+ INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
+ "Which OCR engine(s) to run (Tesseract, LSTM, both)."
+ " Defaults to loading and running the most accurate"
+ " available.",
+ this->params()),
+ STRING_MEMBER(tessedit_char_blacklist, "",
+ "Blacklist of chars not to recognize", this->params()),
+ STRING_MEMBER(tessedit_char_whitelist, "",
+ "Whitelist of chars to recognize", this->params()),
+ STRING_MEMBER(tessedit_char_unblacklist, "",
+ "List of chars to override tessedit_char_blacklist",
+ this->params()),
+ BOOL_MEMBER(tessedit_ambigs_training, false,
+ "Perform training for ambiguities", this->params()),
+ INT_MEMBER(pageseg_devanagari_split_strategy,
+ tesseract::ShiroRekhaSplitter::NO_SPLIT,
+ "Whether to use the top-line splitting process for Devanagari "
+ "documents while performing page-segmentation.",
+ this->params()),
+ INT_MEMBER(ocr_devanagari_split_strategy,
+ tesseract::ShiroRekhaSplitter::NO_SPLIT,
+ "Whether to use the top-line splitting process for Devanagari "
+ "documents while performing ocr.",
+ this->params()),
+ STRING_MEMBER(tessedit_write_params_to_file, "",
+ "Write all parameters to the given file.", this->params()),
+ BOOL_MEMBER(tessedit_adaption_debug, false,
+ "Generate and print debug"
+ " information for adaption",
+ this->params()),
+ INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
+ INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
+ INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
+ this->params()),
+ STRING_MEMBER(applybox_exposure_pattern, ".exp",
+ "Exposure value follows"
+ " this pattern in the image filename. The name of the image"
+ " files are expected to be in the form"
+ " [lang].[fontname].exp[num].tif",
+ this->params()),
+ BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
+ "Learn both character fragments (as is done in the"
+ " special low exposure mode) as well as unfragmented"
+ " characters.",
+ this->params()),
+ BOOL_MEMBER(applybox_learn_ngrams_mode, false,
+ "Each bounding box"
+ " is assumed to contain ngrams. Only learn the ngrams"
+ " whose outlines overlap horizontally.",
+ this->params()),
+ BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
+ this->params()),
+ BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
+ this->params()),
+ BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
+ this->params()),
+ BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
+ "Try to improve fuzzy spaces", this->params()),
+ BOOL_MEMBER(tessedit_unrej_any_wd, false,
+ "Don't bother with word plausibility", this->params()),
+ BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
+ this->params()),
+ BOOL_MEMBER(tessedit_enable_doc_dict, true,
+ "Add words to the document dictionary", this->params()),
+ BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
+ this->params()),
+ BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
+ this->params()),
+ BOOL_MEMBER(tessedit_enable_bigram_correction, true,
+ "Enable correction based on the word bigram dictionary.",
+ this->params()),
+ BOOL_MEMBER(tessedit_enable_dict_correction, false,
+ "Enable single word correction based on the dictionary.",
+ this->params()),
+ INT_MEMBER(tessedit_bigram_debug, 0,
+ "Amount of debug output for bigram correction.",
+ this->params()),
+ BOOL_MEMBER(enable_noise_removal, true,
+ "Remove and conditionally reassign small outlines when they"
+ " confuse layout analysis, determining diacritics vs noise",
+ this->params()),
+ INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
+ this->params()),
+ // Worst (min) certainty, for which a diacritic is allowed to make the
+ // base
+ // character worse and still be included.
+ double_MEMBER(noise_cert_basechar, -8.0,
+ "Hingepoint for base char certainty", this->params()),
+ // Worst (min) certainty, for which a non-overlapping diacritic is allowed
+ // to make the base character worse and still be included.
+ double_MEMBER(noise_cert_disjoint, -1.0,
+ "Hingepoint for disjoint certainty", this->params()),
+ // Worst (min) certainty, for which a diacritic is allowed to make a new
+ // stand-alone blob.
+ double_MEMBER(noise_cert_punc, -3.0,
+ "Threshold for new punc char certainty", this->params()),
+ // Factor of certainty margin for adding diacritics to not count as worse.
+ double_MEMBER(noise_cert_factor, 0.375,
+ "Scaling on certainty diff from Hingepoint",
+ this->params()),
+ INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
+ this->params()),
+ INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
+ this->params()),
+ INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
+ STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
+ this->params()),
+ STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
+ this->params()),
+ STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
+ this->params()),
+ double_MEMBER(quality_rej_pc, 0.08,
+ "good_quality_doc lte rejection limit", this->params()),
+ double_MEMBER(quality_blob_pc, 0.0,
+ "good_quality_doc gte good blobs limit", this->params()),
+ double_MEMBER(quality_outline_pc, 1.0,
+ "good_quality_doc lte outline error limit", this->params()),
+ double_MEMBER(quality_char_pc, 0.95,
+ "good_quality_doc gte good char limit", this->params()),
+ INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
+ this->params()),
+ INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
+ "Adaptation decision algorithm for tess", this->params()),
+ BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
+ "Do minimal rejection on pass 1 output", this->params()),
+ BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
+ this->params()),
+ BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
+ double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
+ double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
+ INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
+ this->params()),
+ INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
+ this->params()),
+ BOOL_MEMBER(paragraph_text_based, true,
+ "Run paragraph detection on the post-text-recognition "
+ "(more accurate)",
+ this->params()),
+ BOOL_MEMBER(lstm_use_matrix, 1,
+ "Use ratings matrix/beam search with lstm", this->params()),
+ STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
+ this->params()),
+ STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
+ this->params()),
+ BOOL_MEMBER(tessedit_good_quality_unrej, true,
+ "Reduce rejection on good docs", this->params()),
+ BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
+ this->params()),
+ double_MEMBER(tessedit_reject_doc_percent, 65.00,
+ "%rej allowed before rej whole doc", this->params()),
+ double_MEMBER(tessedit_reject_block_percent, 45.00,
+ "%rej allowed before rej whole block", this->params()),
+ double_MEMBER(tessedit_reject_row_percent, 40.00,
+ "%rej allowed before rej whole row", this->params()),
+ double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
+ "Number of row rejects in whole word rejects"
+ " which prevents whole row rejection",
+ this->params()),
+ BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
+ "Only rej partially rejected words in block rejection",
+ this->params()),
+ BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
+ "Only rej partially rejected words in row rejection",
+ this->params()),
+ BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false,
+ "Use word segmentation quality metric", this->params()),
+ BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false,
+ "Use word segmentation quality metric", this->params()),
+ INT_MEMBER(tessedit_preserve_min_wd_len, 2,
+ "Only preserve wds longer than this", this->params()),
+ BOOL_MEMBER(tessedit_row_rej_good_docs, true,
+ "Apply row rejection to good docs", this->params()),
+ double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
+ "rej good doc wd if more than this fraction rejected",
+ this->params()),
+ BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
+ "Reject all bad quality wds", this->params()),
+ BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
+ this->params()),
+ BOOL_MEMBER(tessedit_debug_quality_metrics, false,
+ "Output data to debug file", this->params()),
+ BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
+ this->params()),
+ double_MEMBER(quality_rowrej_pc, 1.1,
+ "good_quality_doc gte good char limit", this->params()),
+ BOOL_MEMBER(unlv_tilde_crunching, false,
+ "Mark v.bad words for tilde crunch", this->params()),
+ BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
+ this->params()),
+ BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
+ this->params()),
+ BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
+ this->params()),
+ BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
+ "Take out ~^ early?", this->params()),
+ double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
+ this->params()),
+ BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
+ double_MEMBER(crunch_poor_garbage_cert, -9.0,
+ "crunch garbage cert lt this", this->params()),
+ double_MEMBER(crunch_poor_garbage_rate, 60,
+ "crunch garbage rating lt this", this->params()),
+ double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
+ this->params()),
+ double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
+ this->params()),
+ double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
+ this->params()),
+ double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
+ this->params()),
+ double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
+ this->params()),
+ double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
+ this->params()),
+ double_MEMBER(crunch_del_min_width, 3.0,
+ "Del if word width lt xht x this", this->params()),
+ double_MEMBER(crunch_del_high_word, 1.5,
+ "Del if word gt xht x this above bl", this->params()),
+ double_MEMBER(crunch_del_low_word, 0.5,
+ "Del if word gt xht x this below bl", this->params()),
+ double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
+ this->params()),
+ INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
+ this->params()),
+ INT_MEMBER(crunch_pot_indicators, 1,
+ "How many potential indicators needed", this->params()),
+ BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
+ this->params()),
+ BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
+ this->params()),
+ BOOL_MEMBER(crunch_leave_accept_strings, false,
+ "Don't pot crunch sensible strings", this->params()),
+ BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
+ this->params()),
+ INT_MEMBER(crunch_leave_lc_strings, 4,
+ "Don't crunch words with long lower case strings",
+ this->params()),
+ INT_MEMBER(crunch_leave_uc_strings, 4,
+ "Don't crunch words with long lower case strings",
+ this->params()),
+ INT_MEMBER(crunch_long_repetitions, 3,
+ "Crunch words with long repetitions", this->params()),
+ INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
+ INT_MEMBER(fixsp_non_noise_limit, 1,
+ "How many non-noise blbs either side?", this->params()),
+ double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
+ this->params()),
+ BOOL_MEMBER(tessedit_prefer_joined_punct, false,
+ "Reward punctuation joins", this->params()),
+ INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
+ this->params()),
+ INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
+ this->params()),
+ STRING_MEMBER(numeric_punctuation, ".,",
+ "Punct. chs expected WITHIN numbers", this->params()),
+ INT_MEMBER(x_ht_acceptance_tolerance, 8,
+ "Max allowed deviation of blob top outside of font data",
+ this->params()),
+ INT_MEMBER(x_ht_min_change, 8,
+ "Min change in xht before actually trying it", this->params()),
+ INT_MEMBER(superscript_debug, 0,
+ "Debug level for sub & superscript fixer", this->params()),
+ double_MEMBER(
+ superscript_worse_certainty, 2.0,
+ "How many times worse "
+ "certainty does a superscript position glyph need to be for "
+ "us to try classifying it as a char with a different "
+ "baseline?",
+ this->params()),
+ double_MEMBER(
+ superscript_bettered_certainty, 0.97,
+ "What reduction in "
+ "badness do we think sufficient to choose a superscript "
+ "over what we'd thought. For example, a value of 0.6 means "
+ "we want to reduce badness of certainty by at least 40%",
+ this->params()),
+ double_MEMBER(superscript_scaledown_ratio, 0.4,
+ "A superscript scaled down more than this is unbelievably "
+ "small. For example, 0.3 means we expect the font size to "
+ "be no smaller than 30% of the text line font size.",
+ this->params()),
+ double_MEMBER(subscript_max_y_top, 0.5,
+ "Maximum top of a character measured as a multiple of "
+ "x-height above the baseline for us to reconsider whether "
+ "it's a subscript.",
+ this->params()),
+ double_MEMBER(superscript_min_y_bottom, 0.3,
+ "Minimum bottom of a character measured as a multiple of "
+ "x-height above the baseline for us to reconsider whether "
+ "it's a superscript.",
+ this->params()),
+ BOOL_MEMBER(tessedit_write_block_separators, false,
+ "Write block separators in output", this->params()),
+ BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
+ this->params()),
+ BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
+ this->params()),
+ BOOL_MEMBER(textonly_pdf, false,
+ "Create PDF with only one invisible text layer",
+ this->params()),
+ INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
+ INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
+ this->params()),
+ INT_MEMBER(min_characters_to_try, 50,
+ "Specify minimum characters to try during OSD",
+ this->params()),
+ STRING_MEMBER(unrecognised_char, "|",
+ "Output char for unidentified blobs", this->params()),
+ INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
+ INT_MEMBER(suspect_short_words, 2,
+ "Don't suspect dict wds longer than this", this->params()),
+ BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
+ this->params()),
+ double_MEMBER(suspect_rating_per_ch, 999.9,
+ "Don't touch bad rating limit", this->params()),
+ double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
+ this->params()),
+ BOOL_MEMBER(tessedit_minimal_rejection, false,
+ "Only reject tess failures", this->params()),
+ BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
+ this->params()),
+ BOOL_MEMBER(tessedit_word_for_word, false,
+ "Make output have exactly one word per WERD", this->params()),
+ BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
+ "Don't reject ANYTHING AT ALL", this->params()),
+ INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
+ this->params()),
+ BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
+ this->params()),
+ BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
+ this->params()),
+ double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
+ "Aspect ratio dot/hyphen test", this->params()),
+ double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
+ "Aspect ratio dot/hyphen test", this->params()),
+ BOOL_MEMBER(rej_trust_doc_dawg, false,
+ "Use DOC dawg in 11l conf. detector", this->params()),
+ BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
+ this->params()),
+ BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
+ this->params()),
+ BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
+ this->params()),
+ BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
+ this->params()),
+ BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
+ this->params()),
+ BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
+ this->params()),
+ BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
+ this->params()),
+ double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
+ "if >this fract", this->params()),
+ INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
+ this->params()),
+ STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
+ "Allow NN to unrej", this->params()),
+ STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
+ this->params()),
+ INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
+ this->params()),
+ BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
+ this->params()),
+ INT_MEMBER(tessedit_page_number, -1,
+ "-1 -> All pages, else specific page to process",
+ this->params()),
+ BOOL_MEMBER(tessedit_write_images, false,
+ "Capture the image from the IPE", this->params()),
+ BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
+ this->params()),
+ STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
+ BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
+ this->params()),
+ STRING_MEMBER(tessedit_load_sublangs, "",
+ "List of languages to load with this one", this->params()),
+ BOOL_MEMBER(tessedit_use_primary_params_model, false,
+ "In multilingual mode use params model of the"
+ " primary language",
+ this->params()),
+ double_MEMBER(min_orientation_margin, 7.0,
+ "Min acceptable orientation margin", this->params()),
+ BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
+ this->params()),
+ BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model",
+ this->params()),
+ BOOL_MEMBER(poly_allow_detailed_fx, false,
+ "Allow feature extractors to see the original outline",
+ this->params()),
+ BOOL_INIT_MEMBER(tessedit_init_config_only, false,
+ "Only initialize with the config file. Useful if the "
+ "instance is not going to be used for OCR but say only "
+ "for layout analysis.",
+ this->params()),
+ BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
+ this->params()),
+ BOOL_MEMBER(textord_tabfind_vertical_text, true,
+ "Enable vertical detection", this->params()),
+ BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
+ "Force using vertical text page mode", this->params()),
+ double_MEMBER(
+ textord_tabfind_vertical_text_ratio, 0.5,
+ "Fraction of textlines deemed vertical to use vertical page "
+ "mode",
+ this->params()),
+ double_MEMBER(
+ textord_tabfind_aligned_gap_fraction, 0.75,
+ "Fraction of height used as a minimum gap for aligned blobs.",
+ this->params()),
+ INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
+ this->params()),
+ BOOL_MEMBER(preserve_interword_spaces, false,
+ "Preserve multiple interword spaces", this->params()),
+ STRING_MEMBER(page_separator, "\f",
+ "Page separator (default is form feed control character)",
+ this->params()),
+ INT_MEMBER(lstm_choice_mode, 0,
+ "Allows to include alternative symbols choices in the hOCR output. "
+ "Valid input values are 0, 1 and 2. 0 is the default value. "
+ "With 1 the alternative symbol choices per timestep are included. "
+ "With 2 alternative symbol choices are extracted from the CTC "
+ "process instead of the lattice. The choices are mapped per "
+ "character.",
+ this->params()),
+ INT_MEMBER(
+ lstm_choice_iterations, 5,
+ "Sets the number of cascading iterations for the Beamsearch in "
+ "lstm_choice_mode. Note that lstm_choice_mode must be set to a "
+ "value greater than 0 to produce results.",
+ this->params()),
+ double_MEMBER(
+ lstm_rating_coefficient, 5,
+ "Sets the rating coefficient for the lstm choices. The smaller the "
+ "coefficient, the better are the ratings for each choice and less "
+ "information is lost due to the cut off at 0. The standard value is "
+ "5", this->params()),
+ BOOL_MEMBER(pageseg_apply_music_mask, true,
+ "Detect music staff and remove intersecting components", this->params()),
+
+ backup_config_file_(nullptr),
+ pix_binary_(nullptr),
+ pix_grey_(nullptr),
+ pix_original_(nullptr),
+ pix_thresholds_(nullptr),
+ source_resolution_(0),
+ textord_(this),
+ right_to_left_(false),
+ scaled_color_(nullptr),
+ scaled_factor_(-1),
+ deskew_(1.0f, 0.0f),
+ reskew_(1.0f, 0.0f),
+ most_recently_used_(this),
+ font_table_size_(0),
+ equ_detect_(nullptr),
+ lstm_recognizer_(nullptr),
+ train_line_page_num_(0) {
+}
+
+// Destructor. Releases everything owned by this instance.
+// Order matters: Clear() drops the per-page images and splitter state first,
+// then the original image is destroyed, then end_tesseract() shuts down the
+// language data, and only then are the owned sub-language instances and the
+// LSTM recognizer deleted.
+Tesseract::~Tesseract() {
+  Clear();
+  pixDestroy(&pix_original_);
+  end_tesseract();
+  // Sub-language Tesseracts are owned by this instance.
+  for (auto* lang : sub_langs_) {
+    delete lang;
+  }
+  delete lstm_recognizer_;
+  lstm_recognizer_ = nullptr;
+}
+
+// Returns the dictionary to use for this instance.
+// Falls back to the LSTM recognizer's dictionary when the classic
+// (Classify-level) dictionary has no dawgs loaded and at least one loaded
+// language runs the LSTM engine; otherwise the classic dictionary is used.
+Dict& Tesseract::getDict() {
+  Dict& classic_dict = Classify::getDict();
+  if (classic_dict.NumDawgs() == 0 && AnyLSTMLang() &&
+      lstm_recognizer_ != nullptr && lstm_recognizer_->GetDict() != nullptr) {
+    return *lstm_recognizer_->GetDict();
+  }
+  return classic_dict;
+}
+
+
+// Releases as much per-page state as possible (debug PDF is flushed first,
+// then the working images), and resets geometry and splitter state, for this
+// instance and every sub-language instance. Adaptive-classifier and other
+// trained data are deliberately left intact.
+void Tesseract::Clear() {
+  STRING pdf_debug_name = imagebasename + "_debug.pdf";
+  pixa_debug_.WritePDF(pdf_debug_name.c_str());
+  // Drop all working images owned by this instance.
+  pixDestroy(&pix_binary_);
+  pixDestroy(&pix_grey_);
+  pixDestroy(&pix_thresholds_);
+  pixDestroy(&scaled_color_);
+  // Reset geometry to the identity rotation and clear splitter state.
+  deskew_ = FCOORD(1.0f, 0.0f);
+  reskew_ = FCOORD(1.0f, 0.0f);
+  splitter_.Clear();
+  scaled_factor_ = -1;
+  for (auto* sub_lang : sub_langs_) {
+    sub_lang->Clear();
+  }
+}
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+// Sets the equation detector and registers this instance as the detector's
+// language Tesseract. Ownership of the detector stays with the caller (the
+// destructor does not delete equ_detect_).
+// NOTE(review): detector is dereferenced unconditionally — callers are
+// presumably required to pass a non-null detector; confirm at call sites.
+void Tesseract::SetEquationDetect(EquationDetect* detector) {
+  equ_detect_ = detector;
+  equ_detect_->SetLangTesseract(this);
+}
+
+// Clear all memory of adaption for this and all subclassifiers.
+void Tesseract::ResetAdaptiveClassifier() {
+  ResetAdaptiveClassifierInternal();
+  for (auto* sub_lang : sub_langs_) {
+    sub_lang->ResetAdaptiveClassifierInternal();
+  }
+}
+
+#endif //ndef DISABLED_LEGACY_ENGINE
+
+// Clear the document dictionary for this and all subclassifiers.
+void Tesseract::ResetDocumentDictionary() {
+  getDict().ResetDocumentDictionary();
+  for (auto* sub_lang : sub_langs_) {
+    sub_lang->getDict().ResetDocumentDictionary();
+  }
+}
+
+// Propagates the character blacklist/whitelist/unblacklist to every loaded
+// classifier: this language's classic and LSTM unicharsets, and those of all
+// sub-languages. All classifiers use this instance's (top-level) lists.
+void Tesseract::SetBlackAndWhitelist() {
+  // Applies the three lists from this instance to one unicharset.
+  auto apply_lists = [this](UNICHARSET& charset) {
+    charset.set_black_and_whitelist(tessedit_char_blacklist.c_str(),
+                                    tessedit_char_whitelist.c_str(),
+                                    tessedit_char_unblacklist.c_str());
+  };
+  apply_lists(unicharset);
+  if (lstm_recognizer_ != nullptr) {
+    apply_lists(lstm_recognizer_->GetUnicharset());
+  }
+  // Black and white lists should apply to all loaded classifiers.
+  for (auto* sub_lang : sub_langs_) {
+    apply_lists(sub_lang->unicharset);
+    if (sub_lang->lstm_recognizer_ != nullptr) {
+      apply_lists(sub_lang->lstm_recognizer_->GetUnicharset());
+    }
+  }
+}
+
+// Perform steps to prepare underlying binary image/other data structures for
+// page segmentation. Picks the strongest devanagari split strategy across all
+// loaded languages, shares the binary image with every sub-language (via
+// pixClone, so they hold references to the same pix), then runs shiro-rekha
+// splitting and, if it produced a split image, swaps it in as pix_binary_.
+void Tesseract::PrepareForPageseg() {
+  textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
+  // Find the max splitter strategy over all langs.
+  auto max_pageseg_strategy =
+      static_cast<ShiroRekhaSplitter::SplitStrategy>(
+          static_cast<int32_t>(pageseg_devanagari_split_strategy));
+  for (int i = 0; i < sub_langs_.size(); ++i) {
+    auto pageseg_strategy =
+        static_cast<ShiroRekhaSplitter::SplitStrategy>(
+            static_cast<int32_t>(sub_langs_[i]->pageseg_devanagari_split_strategy));
+    if (pageseg_strategy > max_pageseg_strategy)
+      max_pageseg_strategy = pageseg_strategy;
+    // Replace each sub-language's binary image with a clone (shared
+    // reference) of this instance's current binary image.
+    pixDestroy(&sub_langs_[i]->pix_binary_);
+    sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
+  }
+  // Perform shiro-rekha (top-line) splitting and replace the current image by
+  // the newly split image.
+  splitter_.set_orig_pix(pix_binary());
+  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
+  if (splitter_.Split(true, &pixa_debug_)) {
+    ASSERT_HOST(splitter_.splitted_image());
+    pixDestroy(&pix_binary_);
+    pix_binary_ = pixClone(splitter_.splitted_image());
+  }
+}
+
+// Perform steps to prepare underlying binary image/other data structures for
+// OCR. The current segmentation is required by this method.
+// Note that this method resets pix_binary_ to the original binarized image,
+// which may be different from the image actually used for OCR depending on the
+// value of devanagari_ocr_split_strategy.
+// osd_tess and osr are currently unused here; block_list supplies the
+// segmentation that the splitter refreshes when the OCR-time split strategy
+// differs from the pageseg-time one.
+void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
+                                  Tesseract* osd_tess, OSResults* osr) {
+  // Find the max splitter strategy over all langs.
+  auto max_ocr_strategy =
+      static_cast<ShiroRekhaSplitter::SplitStrategy>(
+          static_cast<int32_t>(ocr_devanagari_split_strategy));
+  for (int i = 0; i < sub_langs_.size(); ++i) {
+    auto ocr_strategy =
+        static_cast<ShiroRekhaSplitter::SplitStrategy>(
+            static_cast<int32_t>(sub_langs_[i]->ocr_devanagari_split_strategy));
+    if (ocr_strategy > max_ocr_strategy)
+      max_ocr_strategy = ocr_strategy;
+  }
+  // Utilize the segmentation information available.
+  splitter_.set_segmentation_block_list(block_list);
+  splitter_.set_ocr_split_strategy(max_ocr_strategy);
+  // Run the splitter for OCR
+  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
+  // Restore pix_binary to the binarized original pix for future reference.
+  ASSERT_HOST(splitter_.orig_pix());
+  pixDestroy(&pix_binary_);
+  pix_binary_ = pixClone(splitter_.orig_pix());
+  // If the pageseg and ocr strategies are different, refresh the block list
+  // (from the last SegmentImage call) with blobs from the real image to be used
+  // for OCR.
+  if (splitter_.HasDifferentSplitStrategies()) {
+    // Temporary block spanning the whole image, used only to extract blobs.
+    BLOCK block("", true, 0, 0, 0, 0, pixGetWidth(pix_binary_),
+                pixGetHeight(pix_binary_));
+    Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
+                                       splitter_.orig_pix();
+    extract_edges(pix_for_ocr, &block);
+    splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
+  }
+  // The splitter isn't needed any more after this, so save memory by clearing.
+  splitter_.Clear();
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/tesseractclass.h b/tesseract/src/ccmain/tesseractclass.h
new file mode 100644
index 00000000..159b0ea7
--- /dev/null
+++ b/tesseract/src/ccmain/tesseractclass.h
@@ -0,0 +1,1163 @@
+///////////////////////////////////////////////////////////////////////
+// File: tesseractclass.h
+// Description: The Tesseract class. It holds/owns everything needed
+// to run Tesseract on a single language, and also a set of
+// sub-Tesseracts to run sub-languages. For thread safety, *every*
+// global variable goes in here, directly, or indirectly.
+// This makes it safe to run multiple Tesseracts in different
+// threads in parallel, and keeps the different language
+// instances separate.
+// Author: Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_
+#define TESSERACT_CCMAIN_TESSERACTCLASS_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h" // DISABLED_LEGACY_ENGINE
+#endif
+
+#include "control.h" // for ACCEPTABLE_WERD_TYPE
+#include "debugpixa.h" // for DebugPixa
+#include "devanagari_processing.h" // for ShiroRekhaSplitter
+#ifndef DISABLED_LEGACY_ENGINE
+#include "docqual.h" // for GARBAGE_LEVEL
+#endif
+#include "pageres.h" // for WERD_RES (ptr only), PAGE_RES (pt...
+#include "params.h" // for BOOL_VAR_H, BoolParam, DoubleParam
+#include "points.h" // for FCOORD
+#include "ratngs.h" // for ScriptPos, WERD_CHOICE (ptr only)
+#include "tessdatamanager.h" // for TessdataManager
+#include "textord.h" // for Textord
+#include "wordrec.h" // for Wordrec
+
+#include "genericvector.h" // for GenericVector, PointerVector
+#include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...
+#include "strngs.h" // for STRING
+#include <tesseract/unichar.h> // for UNICHAR_ID
+
+#include "allheaders.h" // for pixDestroy, pixGetWidth, pixGetHe...
+
+#include <cstdint> // for int16_t, int32_t, uint16_t
+#include <cstdio> // for FILE
+
+namespace tesseract {
+
+class BLOCK_LIST;
+class ETEXT_DESC;
+struct OSResults;
+class PAGE_RES;
+class PAGE_RES_IT;
+class ROW;
+class SVMenuNode;
+class TBOX;
+class TO_BLOCK_LIST;
+class WERD;
+class WERD_CHOICE;
+class WERD_RES;
+
+class ColumnFinder;
+class DocumentData;
+class EquationDetect;
+class ImageData;
+class LSTMRecognizer;
+class Tesseract;
+
+// Top-level class for all tesseract global instance data.
+// This class either holds or points to all data used by an instance
+// of Tesseract, including the memory allocator. When this is
+// complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
+//
+// NOTE to developers: Do not create cyclic dependencies through this class!
+// The directory dependency tree must remain a tree! To keep this clean,
+// lower-level code (eg in ccutil, the bottom level) must never need to
+// know about the content of a higher-level directory.
+// The following scheme will grant the easiest access to lower-level
+// global members without creating a cyclic dependency:
+//
+// Class Hierarchy (^ = inheritance):
+//
+// CCUtil (ccutil/ccutil.h)
+// ^ Members include: UNICHARSET
+// CCStruct (ccstruct/ccstruct.h)
+// ^ Members include: Image
+// Classify (classify/classify.h)
+// ^ Members include: Dict
+// WordRec (wordrec/wordrec.h)
+// ^ Members include: WERD*, DENORM*
+// Tesseract (ccmain/tesseractclass.h)
+// Members include: Pix*
+//
+// Other important classes:
+//
+// TessBaseAPI (tesseract/baseapi.h)
+// Members include: BLOCK_LIST*, PAGE_RES*,
+// Tesseract*, ImageThresholder*
+// Dict (dict/dict.h)
+// Members include: Image* (private)
+//
+// NOTE: that each level contains members that correspond to global
+// data that is defined (and used) at that level, not necessarily where
+// the type is defined so for instance:
+// BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
+// goes inside the Textord class, not the cc_util class.
+
+// A collection of various variables for statistics and debugging.
+// All counters start at zero; the write_results() flags start in the state
+// expected at the beginning of a page.
+struct TesseractStats {
+  TesseractStats()
+      : adaption_word_number(0),
+        doc_blob_quality(0),
+        doc_outline_errs(0),
+        doc_char_quality(0),
+        good_char_count(0),
+        doc_good_char_quality(0),
+        word_count(0),
+        dict_words(0),
+        tilde_crunch_written(false),
+        last_char_was_newline(true),
+        last_char_was_tilde(false),
+        write_results_empty_block(true) {}
+
+  int32_t adaption_word_number;   // sequence number used during adaption
+  int16_t doc_blob_quality;       // document-level blob quality accumulator
+  int16_t doc_outline_errs;       // document-level outline error count
+  int16_t doc_char_quality;       // document-level character quality
+  int16_t good_char_count;        // count of good characters
+  int16_t doc_good_char_quality;  // quality over the good characters only
+  int32_t word_count;       // count of word in the document
+  int32_t dict_words;       // number of dictionary words in the document
+  STRING dump_words_str;    // accumulator used by dump_words()
+  // Flags used by write_results()
+  bool tilde_crunch_written;
+  bool last_char_was_newline;
+  bool last_char_was_tilde;
+  bool write_results_empty_block;
+};
+
+// Struct to hold all the pointers to relevant data for processing a word.
+// None of the pointers are owned: they refer into the PAGE_RES/BLOCK/ROW
+// structures that the caller manages.
+struct WordData {
+  WordData()
+      : word(nullptr), row(nullptr), block(nullptr), prev_word(nullptr) {}
+  // Captures the current word/row/block from a page-results iterator.
+  explicit WordData(const PAGE_RES_IT& page_res_it)
+      : word(page_res_it.word()),
+        row(page_res_it.row()->row),
+        block(page_res_it.block()->block),
+        prev_word(nullptr) {}
+  WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res)
+      : word(word_res), row(row_in), block(block_in), prev_word(nullptr) {}
+
+  WERD_RES* word;       // the word being processed
+  ROW* row;             // row containing the word
+  BLOCK* block;         // block containing the row
+  WordData* prev_word;  // previous word in reading order, if any
+  PointerVector<WERD_RES> lang_words;  // per-language recognition results
+};
+
+// Definition of a Tesseract WordRecognizer. The WordData provides the context
+// of row/block, in_word holds an initialized, possibly pre-classified word,
+// that the recognizer may or may not consume (but if so it sets
+// *in_word=nullptr) and produces one or more output words in out_words, which
+// may be the consumed in_word, or may be generated independently. This api
+// allows both a conventional tesseract classifier to work, or a line-level
+// classifier that generates multiple words from a merged input.
+using WordRecognizer = void (Tesseract::*)(const WordData&, WERD_RES**,
+ PointerVector<WERD_RES>*);
+
+class TESS_API Tesseract : public Wordrec {
+ public:
+ Tesseract();
+ ~Tesseract() override;
+
+ // Return appropriate dictionary
+ Dict& getDict() override;
+
+ // Clear as much used memory as possible without resetting the adaptive
+ // classifier or losing any other classifier data.
+ void Clear();
+ // Clear all memory of adaption for this and all subclassifiers.
+ void ResetAdaptiveClassifier();
+ // Clear the document dictionary for this and all subclassifiers.
+ void ResetDocumentDictionary();
+
+ // Set the equation detector.
+ void SetEquationDetect(EquationDetect* detector);
+
+ // Simple accessors.
+ const FCOORD& reskew() const {
+ return reskew_;
+ }
+ // Destroy any existing pix and return a pointer to the pointer.
+ Pix** mutable_pix_binary() {
+ pixDestroy(&pix_binary_);
+ return &pix_binary_;
+ }
+ Pix* pix_binary() const {
+ return pix_binary_;
+ }
+ Pix* pix_grey() const {
+ return pix_grey_;
+ }
+ void set_pix_grey(Pix* grey_pix) {
+ pixDestroy(&pix_grey_);
+ pix_grey_ = grey_pix;
+ }
+ Pix* pix_original() const {
+ return pix_original_;
+ }
+ // Takes ownership of the given original_pix.
+ void set_pix_original(Pix* original_pix) {
+ pixDestroy(&pix_original_);
+ pix_original_ = original_pix;
+ // Clone to sublangs as well.
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
+ : nullptr);
+ }
+ }
+ // Returns a pointer to a Pix representing the best available resolution image
+ // of the page, with best available bit depth as second priority. Result can
+ // be of any bit depth, but never color-mapped, as that has always been
+ // removed. Note that in grey and color, 0 is black and 255 is
+ // white. If the input was binary, then black is 1 and white is 0.
+ // To tell the difference pixGetDepth() will return 32, 8 or 1.
+ // In any case, the return value is a borrowed Pix, and should not be
+ // deleted or pixDestroyed.
+ Pix* BestPix() const {
+ if (pixGetWidth(pix_original_) == ImageWidth()) {
+ return pix_original_;
+ } else if (pix_grey_ != nullptr) {
+ return pix_grey_;
+ } else {
+ return pix_binary_;
+ }
+ }
+ void set_pix_thresholds(Pix* thresholds) {
+ pixDestroy(&pix_thresholds_);
+ pix_thresholds_ = thresholds;
+ }
+ int source_resolution() const {
+ return source_resolution_;
+ }
+ void set_source_resolution(int ppi) {
+ source_resolution_ = ppi;
+ }
+ int ImageWidth() const {
+ return pixGetWidth(pix_binary_);
+ }
+ int ImageHeight() const {
+ return pixGetHeight(pix_binary_);
+ }
+ Pix* scaled_color() const {
+ return scaled_color_;
+ }
+ int scaled_factor() const {
+ return scaled_factor_;
+ }
+ void SetScaledColor(int factor, Pix* color) {
+ scaled_factor_ = factor;
+ scaled_color_ = color;
+ }
+ const Textord& textord() const {
+ return textord_;
+ }
+ Textord* mutable_textord() {
+ return &textord_;
+ }
+
+ bool right_to_left() const {
+ return right_to_left_;
+ }
+ int num_sub_langs() const {
+ return sub_langs_.size();
+ }
+ Tesseract* get_sub_lang(int index) const {
+ return sub_langs_[index];
+ }
+ // Returns true if any language uses Tesseract (as opposed to LSTM).
+ bool AnyTessLang() const {
+ if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
+ return true;
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY)
+ return true;
+ }
+ return false;
+ }
+ // Returns true if any language uses the LSTM.
+ bool AnyLSTMLang() const {
+ if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
+ return true;
+ for (int i = 0; i < sub_langs_.size(); ++i) {
+ if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void SetBlackAndWhitelist();
+
+ // Perform steps to prepare underlying binary image/other data structures for
+ // page segmentation. Uses the strategy specified in the global variable
+  // pageseg_devanagari_split_strategy to perform splitting while preparing for
+ // page segmentation.
+ void PrepareForPageseg();
+
+ // Perform steps to prepare underlying binary image/other data structures for
+ // Tesseract OCR. The current segmentation is required by this method.
+ // Uses the strategy specified in the global variable
+ // ocr_devanagari_split_strategy for performing splitting while preparing for
+ // Tesseract ocr.
+ void PrepareForTessOCR(BLOCK_LIST* block_list, Tesseract* osd_tess,
+ OSResults* osr);
+
+ int SegmentPage(const char* input_file, BLOCK_LIST* blocks,
+ Tesseract* osd_tess, OSResults* osr);
+ void SetupWordScripts(BLOCK_LIST* blocks);
+ int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+ TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
+ Tesseract* osd_tess, OSResults* osr);
+ ColumnFinder* SetupPageSegAndDetectOrientation(
+ PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
+ OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
+ Pix** music_mask_pix);
+ // par_control.cpp
+ void PrerecAllWordsPar(const std::vector<WordData>& words);
+
+ //// linerec.cpp
+ // Generates training data for training a line recognizer, eg LSTM.
+ // Breaks the page into lines, according to the boxes, and writes them to a
+ // serialized DocumentData based on output_basename.
+ // Return true if successful, false if an error occurred.
+ bool TrainLineRecognizer(const char* input_imagename,
+ const STRING& output_basename,
+ BLOCK_LIST* block_list);
+ // Generates training data for training a line recognizer, eg LSTM.
+ // Breaks the boxes into lines, normalizes them, converts to ImageData and
+ // appends them to the given training_data.
+ void TrainFromBoxes(const std::vector<TBOX>& boxes,
+ const std::vector<STRING>& texts,
+ BLOCK_LIST* block_list, DocumentData* training_data);
+
+ // Returns an Imagedata containing the image of the given textline,
+ // and ground truth boxes/truth text if available in the input.
+ // The image is not normalized in any way.
+ ImageData* GetLineData(const TBOX& line_box, const std::vector<TBOX>& boxes,
+ const std::vector<STRING>& texts, int start_box,
+ int end_box, const BLOCK& block);
+ // Helper gets the image of a rectangle, using the block.re_rotation() if
+ // needed to get to the image, and rotating the result back to horizontal
+ // layout. (CJK characters will be on their left sides) The vertical text flag
+ // is set in the returned ImageData if the text was originally vertical, which
+ // can be used to invoke a different CJK recognition engine. The revised_box
+ // is also returned to enable calculation of output bounding boxes.
+ ImageData* GetRectImage(const TBOX& box, const BLOCK& block, int padding,
+ TBOX* revised_box) const;
+ // Recognizes a word or group of words, converting to WERD_RES in *words.
+ // Analogous to classify_word_pass1, but can handle a group of words as well.
+ void LSTMRecognizeWord(const BLOCK& block, ROW* row, WERD_RES* word,
+ PointerVector<WERD_RES>* words);
+ // Apply segmentation search to the given set of words, within the constraints
+ // of the existing ratings matrix. If there is already a best_choice on a word
+ // leaves it untouched and just sets the done/accepted etc flags.
+ void SearchWords(PointerVector<WERD_RES>* words);
+
+ //// control.h /////////////////////////////////////////////////////////
+ bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
+ const char* word_config, int pass);
+ // Sets up the words ready for whichever engine is to be run
+ void SetupAllWordsPassN(int pass_n, const TBOX* target_word_box,
+ const char* word_config, PAGE_RES* page_res,
+ std::vector<WordData>* words);
+ // Sets up the single word ready for whichever engine is to be run.
+ void SetupWordPassN(int pass_n, WordData* word);
+ // Runs word recognition on all the words.
+ bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, PAGE_RES_IT* pr_it,
+ std::vector<WordData>* words);
+ bool recog_all_words(PAGE_RES* page_res, ETEXT_DESC* monitor,
+ const TBOX* target_word_box, const char* word_config,
+ int dopasses);
+ void rejection_passes(PAGE_RES* page_res, ETEXT_DESC* monitor,
+ const TBOX* target_word_box, const char* word_config);
+ void bigram_correction_pass(PAGE_RES* page_res);
+ void blamer_pass(PAGE_RES* page_res);
+ // Sets script positions and detects smallcaps on all output words.
+ void script_pos_pass(PAGE_RES* page_res);
+ // Helper to recognize the word using the given (language-specific) tesseract.
+ // Returns positive if this recognizer found more new best words than the
+ // number kept from best_words.
+ int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
+ bool debug, WERD_RES** in_word,
+ PointerVector<WERD_RES>* best_words);
+ // Moves good-looking "noise"/diacritics from the reject list to the main
+ // blob list on the current word. Returns true if anything was done, and
+ // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+ bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+ bool* make_next_word_fuzzy);
+ // Attempts to put noise/diacritic outlines into the blobs that they overlap.
+ // Input: a set of noisy outlines that probably belong to the real_word.
+ // Output: outlines that overlapped blobs are set to nullptr and put back into
+ // the word, either in the blobs or in the reject list.
+ void AssignDiacriticsToOverlappingBlobs(
+ const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+ PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+ GenericVector<bool>* overlapped_any_blob,
+ GenericVector<C_BLOB*>* target_blobs);
+ // Attempts to assign non-overlapping outlines to their nearest blobs or
+ // make new blobs out of them.
+ void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines,
+ int pass, WERD* real_word, PAGE_RES_IT* pr_it,
+ GenericVector<bool>* word_wanted,
+ GenericVector<C_BLOB*>* target_blobs);
+ // Starting with ok_outlines set to indicate which outlines overlap the blob,
+ // chooses the optimal set (approximately) and returns true if any outlines
+ // are desired, in which case ok_outlines indicates which ones.
+ bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
+ PAGE_RES_IT* pr_it, C_BLOB* blob,
+ const GenericVector<C_OUTLINE*>& outlines,
+ int num_outlines,
+ std::vector<bool>* ok_outlines);
+ // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
+ // the inclusion of the outlines, and returns the certainty of the raw choice.
+ float ClassifyBlobPlusOutlines(const std::vector<bool>& ok_outlines,
+ const GenericVector<C_OUTLINE*>& outlines,
+ int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
+ STRING* best_str);
+ // Classifies the given blob (part of word_data->word->word) as an individual
+ // word, using languages, chopper etc, returning only the certainty of the
+ // best raw choice, and undoing all the work done to fake out the word.
+ float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
+ STRING* best_str, float* c2);
+ void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
+ WordData* word_data);
+ void classify_word_pass1(const WordData& word_data, WERD_RES** in_word,
+ PointerVector<WERD_RES>* out_words);
+ void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
+ TBOX& selection_box);
+
+ void fix_rep_char(PAGE_RES_IT* page_res_it);
+
+ ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
+ const char* s,
+ const char* lengths);
+ void match_word_pass_n(int pass_n, WERD_RES* word, ROW* row, BLOCK* block);
+ void classify_word_pass2(const WordData& word_data, WERD_RES** in_word,
+ PointerVector<WERD_RES>* out_words);
+ void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES* word,
+ WERD_RES* new_word);
+ bool RunOldFixXht(WERD_RES* word, BLOCK* block, ROW* row);
+ bool TrainedXheightFix(WERD_RES* word, BLOCK* block, ROW* row);
+ // Runs recognition with the test baseline shift and x-height and returns true
+ // if there was an improvement in recognition result.
+ bool TestNewNormalization(int original_misfits, float baseline_shift,
+ float new_x_ht, WERD_RES* word, BLOCK* block,
+ ROW* row);
+ bool recog_interactive(PAGE_RES_IT* pr_it);
+
+ // Set fonts of this word.
+ void set_word_fonts(WERD_RES* word);
+ void font_recognition_pass(PAGE_RES* page_res);
+ void dictionary_correction_pass(PAGE_RES* page_res);
+ bool check_debug_pt(WERD_RES* word, int location);
+
+ //// superscript.cpp ////////////////////////////////////////////////////
+ bool SubAndSuperscriptFix(WERD_RES* word_res);
+ void GetSubAndSuperscriptCandidates(
+ const WERD_RES* word, int* num_rebuilt_leading, ScriptPos* leading_pos,
+ float* leading_certainty, int* num_rebuilt_trailing,
+ ScriptPos* trailing_pos, float* trailing_certainty, float* avg_certainty,
+ float* unlikely_threshold);
+ WERD_RES* TrySuperscriptSplits(int num_chopped_leading,
+ float leading_certainty, ScriptPos leading_pos,
+ int num_chopped_trailing,
+ float trailing_certainty,
+ ScriptPos trailing_pos, WERD_RES* word,
+ bool* is_good, int* retry_leading,
+ int* retry_trailing);
+ bool BelievableSuperscript(bool debug, const WERD_RES& word,
+ float certainty_threshold, int* left_ok,
+ int* right_ok) const;
+
+ //// output.h //////////////////////////////////////////////////////////
+
+ void output_pass(PAGE_RES_IT& page_res_it, const TBOX* target_word_box);
+ void write_results(PAGE_RES_IT& page_res_it, // full info
+ char newline_type, // type of newline
+ bool force_eol // override tilde crunch?
+ );
+ void set_unlv_suspects(WERD_RES* word);
+ UNICHAR_ID get_rep_char(WERD_RES* word); // what char is repeated?
+ bool acceptable_number_string(const char* s, const char* lengths);
+ int16_t count_alphanums(const WERD_CHOICE& word);
+ int16_t count_alphas(const WERD_CHOICE& word);
+
+ void read_config_file(const char* filename, SetParamConstraint constraint);
+ // Initialize for potentially a set of languages defined by the language
+ // string and recursively any additional languages required by any language
+ // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
+ // See init_tesseract_internal for args.
+ int init_tesseract(const char* arg0, const char* textbase,
+ const char* language, OcrEngineMode oem, char** configs,
+ int configs_size, const std::vector<std::string>* vars_vec,
+ const std::vector<std::string>* vars_values,
+ bool set_only_init_params, TessdataManager* mgr);
+ int init_tesseract(const char* datapath, const char* language,
+ OcrEngineMode oem) {
+ TessdataManager mgr;
+ return init_tesseract(datapath, nullptr, language, oem, nullptr, 0, nullptr,
+ nullptr, false, &mgr);
+ }
+ // Common initialization for a single language.
+ // arg0 is the datapath for the tessdata directory, which could be the
+ // path of the tessdata directory with no trailing /, or (if tessdata
+ // lives in the same directory as the executable, the path of the executable,
+ // hence the name arg0.
+ // textbase is an optional output file basename (used only for training)
+ // language is the language code to load.
+ // oem controls which engine(s) will operate on the image
+ // configs (argv) is an array of config filenames to load variables from.
+ // May be nullptr.
+ // configs_size (argc) is the number of elements in configs.
+ // vars_vec is an optional vector of variables to set.
+ // vars_values is an optional corresponding vector of values for the variables
+ // in vars_vec.
+ // If set_only_init_params is true, then only the initialization variables
+ // will be set.
+ int init_tesseract_internal(const char* arg0, const char* textbase,
+ const char* language, OcrEngineMode oem,
+ char** configs, int configs_size,
+ const std::vector<std::string>* vars_vec,
+ const std::vector<std::string>* vars_values,
+ bool set_only_init_params, TessdataManager* mgr);
+
+ // Set the universal_id member of each font to be unique among all
+ // instances of the same font loaded.
+ void SetupUniversalFontIds();
+
+ int init_tesseract_lm(const char* arg0, const char* textbase,
+ const char* language, TessdataManager* mgr);
+
+ void recognize_page(STRING& image_name);
+ void end_tesseract();
+
+ bool init_tesseract_lang_data(const char* arg0, const char* textbase,
+ const char* language, OcrEngineMode oem,
+ char** configs, int configs_size,
+ const std::vector<std::string>* vars_vec,
+ const std::vector<std::string>* vars_values,
+ bool set_only_init_params,
+ TessdataManager* mgr);
+
+ void ParseLanguageString(const char* lang_str, std::vector<std::string>* to_load,
+ std::vector<std::string>* not_to_load);
+
+ //// pgedit.h //////////////////////////////////////////////////////////
+ SVMenuNode* build_menu_new();
+#ifndef GRAPHICS_DISABLED
+ void pgeditor_main(int width, int height, PAGE_RES* page_res);
+#endif // !GRAPHICS_DISABLED
+ void process_image_event( // action in image win
+ const SVEvent& event);
+ bool process_cmd_win_event( // UI command semantics
+ int32_t cmd_event, // which menu item?
+ char* new_value // any prompt data
+ );
+ void debug_word(PAGE_RES* page_res, const TBOX& selection_box);
+ void do_re_display(
+ bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
+ bool word_display(PAGE_RES_IT* pr_it);
+ bool word_bln_display(PAGE_RES_IT* pr_it);
+ bool word_blank_and_set_display(PAGE_RES_IT* pr_its);
+ bool word_set_display(PAGE_RES_IT* pr_it);
+ // #ifndef GRAPHICS_DISABLED
+ bool word_dumper(PAGE_RES_IT* pr_it);
+ // #endif // !GRAPHICS_DISABLED
+ void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
+ //// reject.h //////////////////////////////////////////////////////////
+ // make rej map for word
+ void make_reject_map(WERD_RES* word, ROW* row, int16_t pass);
+ bool one_ell_conflict(WERD_RES* word_res, bool update_map);
+ int16_t first_alphanum_index(const char* word, const char* word_lengths);
+ int16_t first_alphanum_offset(const char* word, const char* word_lengths);
+ int16_t alpha_count(const char* word, const char* word_lengths);
+ bool word_contains_non_1_digit(const char* word, const char* word_lengths);
+ void dont_allow_1Il(WERD_RES* word);
+ int16_t count_alphanums( // how many alphanums
+ WERD_RES* word);
+ void flip_0O(WERD_RES* word);
+ bool non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
+ bool non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
+ bool repeated_nonalphanum_wd(WERD_RES* word, ROW* row);
+ void nn_match_word( // Match a word
+ WERD_RES* word, ROW* row);
+ void nn_recover_rejects(WERD_RES* word, ROW* row);
+ void set_done( // set done flag
+ WERD_RES* word, int16_t pass);
+ int16_t safe_dict_word(const WERD_RES* werd_res); // is best_choice in dict?
+ void flip_hyphens(WERD_RES* word);
+ void reject_I_1_L(WERD_RES* word);
+ void reject_edge_blobs(WERD_RES* word);
+ void reject_mostly_rejects(WERD_RES* word);
+ //// adaptions.h ///////////////////////////////////////////////////////
+ bool word_adaptable( // should we adapt?
+ WERD_RES* word, uint16_t mode);
+
+ //// tfacepp.cpp ///////////////////////////////////////////////////////
+ void recog_word_recursive(WERD_RES* word);
+ void recog_word(WERD_RES* word);
+ void split_and_recog_word(WERD_RES* word);
+ void split_word(WERD_RES* word, int split_pt, WERD_RES** right_piece,
+ BlamerBundle** orig_blamer_bundle) const;
+ void join_words(WERD_RES* word, WERD_RES* word2, BlamerBundle* orig_bb) const;
+ //// fixspace.cpp ///////////////////////////////////////////////////////
+ bool digit_or_numeric_punct(WERD_RES* word, int char_position);
+ int16_t eval_word_spacing(WERD_RES_LIST& word_res_list);
+ void match_current_words(WERD_RES_LIST& words, ROW* row, BLOCK* block);
+ int16_t fp_eval_word_spacing(WERD_RES_LIST& word_res_list);
+ void fix_noisy_space_list(WERD_RES_LIST& best_perm, ROW* row, BLOCK* block);
+ void fix_fuzzy_space_list(WERD_RES_LIST& best_perm, ROW* row, BLOCK* block);
+ void fix_sp_fp_word(WERD_RES_IT& word_res_it, ROW* row, BLOCK* block);
+ void fix_fuzzy_spaces( // find fuzzy words
+ ETEXT_DESC* monitor, // progress monitor
+ int32_t word_count, // count of words in doc
+ PAGE_RES* page_res);
+ void dump_words(WERD_RES_LIST& perm, int16_t score, int16_t mode,
+ bool improved);
+ bool fixspace_thinks_word_done(WERD_RES* word);
+ int16_t worst_noise_blob(WERD_RES* word_res, float* worst_noise_score);
+ float blob_noise_score(TBLOB* blob);
+ void break_noisiest_blob_word(WERD_RES_LIST& words);
+ //// docqual.cpp ////////////////////////////////////////////////////////
+#ifndef DISABLED_LEGACY_ENGINE
+ GARBAGE_LEVEL garbage_word(WERD_RES* word, bool ok_dict_word);
+ bool potential_word_crunch(WERD_RES* word, GARBAGE_LEVEL garbage_level,
+ bool ok_dict_word);
+#endif
+ void tilde_crunch(PAGE_RES_IT& page_res_it);
+ void unrej_good_quality_words( // unreject potential
+ PAGE_RES_IT& page_res_it);
+ void doc_and_block_rejection( // reject big chunks
+ PAGE_RES_IT& page_res_it, bool good_quality_doc);
+ void quality_based_rejection(PAGE_RES_IT& page_res_it, bool good_quality_doc);
+ void convert_bad_unlv_chs(WERD_RES* word_res);
+ void tilde_delete(PAGE_RES_IT& page_res_it);
+ int16_t word_blob_quality(WERD_RES* word);
+ void word_char_quality(WERD_RES* word, int16_t* match_count,
+ int16_t* accepted_match_count);
+ void unrej_good_chs(WERD_RES* word);
+ int16_t count_outline_errs(char c, int16_t outline_count);
+ int16_t word_outline_errs(WERD_RES* word);
+#ifndef DISABLED_LEGACY_ENGINE
+ bool terrible_word_crunch(WERD_RES* word, GARBAGE_LEVEL garbage_level);
+#endif
+ CRUNCH_MODE word_deletable(WERD_RES* word, int16_t& delete_mode);
+ int16_t failure_count(WERD_RES* word);
+ bool noise_outlines(TWERD* word);
+ //// pagewalk.cpp ///////////////////////////////////////////////////////
+ void process_selected_words(
+ PAGE_RES* page_res, // blocks to check
+ // function to call
+ TBOX& selection_box,
+ bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
+ //// tessbox.cpp ///////////////////////////////////////////////////////
+ void tess_add_doc_word( // test acceptability
+ WERD_CHOICE* word_choice // after context
+ );
+ void tess_segment_pass_n(int pass_n, WERD_RES* word);
+ bool tess_acceptable_word(WERD_RES* word);
+
+ //// applybox.cpp //////////////////////////////////////////////////////
+ // Applies the box file based on the image name filename, and resegments
+ // the words in the block_list (page), with:
+ // blob-mode: one blob per line in the box file, words as input.
+ // word/line-mode: one blob per space-delimited unit after the #, and one word
+ // per line in the box file. (See comment above for box file format.)
+ // If find_segmentation is true, (word/line mode) then the classifier is used
+ // to re-segment words/lines to match the space-delimited truth string for
+ // each box. In this case, the input box may be for a word or even a whole
+ // text line, and the output words will contain multiple blobs corresponding
+ // to the space-delimited input string.
+ // With find_segmentation false, no classifier is needed, but the chopper
+ // can still be used to correctly segment touching characters with the help
+ // of the input boxes.
+ // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
+ // from normal classification, ie. with a word, chopped_word, rebuild_word,
+ // seam_array, denorm, box_word, and best_state, but NO best_choice or
+ // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
+ // Instead, the correct_text member of WERD_RES is set, and this may be later
+ // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
+ // is not required before calling ApplyBoxTraining.
+ PAGE_RES* ApplyBoxes(const char* filename, bool find_segmentation,
+ BLOCK_LIST* block_list);
+
+ // Any row xheight that is significantly different from the median is set
+ // to the median.
+ void PreenXHeights(BLOCK_LIST* block_list);
+
+ // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
+ // All fuzzy spaces are removed, and all the words are maximally chopped.
+ PAGE_RES* SetupApplyBoxes(const std::vector<TBOX>& boxes,
+ BLOCK_LIST* block_list);
+ // Tests the chopper by exhaustively running chop_one_blob.
+ // The word_res will contain filled chopped_word, seam_array, denorm,
+ // box_word and best_state for the maximally chopped word.
+ void MaximallyChopWord(const std::vector<TBOX>& boxes, BLOCK* block,
+ ROW* row, WERD_RES* word_res);
+ // Gather consecutive blobs that match the given box into the best_state
+ // and corresponding correct_text.
+ // Fights over which box owns which blobs are settled by pre-chopping and
+ // applying the blobs to box or next_box with the least non-overlap.
+ // Returns false if the box was in error, which can only be caused by
+ // failing to find an appropriate blob for a box.
+ // This means that occasionally, blobs may be incorrectly segmented if the
+ // chopper fails to find a suitable chop point.
+ bool ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
+ const TBOX& box, const TBOX* next_box,
+ const char* correct_text);
+ // Consume all source blobs that strongly overlap the given box,
+ // putting them into a new word, with the correct_text label.
+ // Fights over which box owns which blobs are settled by
+ // applying the blobs to box or next_box with the least non-overlap.
+ // Returns false if the box was in error, which can only be caused by
+ // failing to find an overlapping blob for a box.
+ bool ResegmentWordBox(BLOCK_LIST* block_list, const TBOX& box,
+ const TBOX* next_box, const char* correct_text);
+ // Resegments the words by running the classifier in an attempt to find the
+ // correct segmentation that produces the required string.
+ void ReSegmentByClassification(PAGE_RES* page_res);
+ // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
+ // Returns false if an invalid UNICHAR_ID is encountered.
+ bool ConvertStringToUnichars(const char* utf8,
+ GenericVector<UNICHAR_ID>* class_ids);
+ // Resegments the word to achieve the target_text from the classifier.
+ // Returns false if the re-segmentation fails.
+ // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
+ // applies a full search on the classifier results to find the best classified
+ // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
+ // substitutions ARE used.
+ bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
+ WERD_RES* word_res);
+ // Recursive helper to find a match to the target_text (from text_index
+ // position) in the choices (from choices_pos position).
+ // Choices is an array of GenericVectors, of length choices_length, with each
+ // element representing a starting position in the word, and the
+ // GenericVector holding classification results for a sequence of consecutive
+ // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
+ void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
+ int choices_pos, int choices_length,
+ const GenericVector<UNICHAR_ID>& target_text,
+ int text_index, float rating,
+ GenericVector<int>* segmentation, float* best_rating,
+ GenericVector<int>* best_segmentation);
+ // Counts up the labelled words and the blobs within.
+ // Deletes all unused or emptied words, counting the unused ones.
+ // Resets W_BOL and W_EOL flags correctly.
+ // Builds the rebuild_word and rebuilds the box_word.
+ void TidyUp(PAGE_RES* page_res);
+ // Logs a bad box by line in the box file and box coords.
+ void ReportFailedBox(int boxfile_lineno, TBOX box, const char* box_ch,
+ const char* err_msg);
+ // Creates a fake best_choice entry in each WERD_RES with the correct text.
+ void CorrectClassifyWords(PAGE_RES* page_res);
+ // Call LearnWord to extract features for labelled blobs within each word.
+ // Features are stored in an internal buffer.
+ void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res);
+
+ //// fixxht.cpp ///////////////////////////////////////////////////////
+ // Returns the number of misfit blob tops in this word.
+ int CountMisfitTops(WERD_RES* word_res);
+ // Returns a new x-height in pixels (original image coords) that is
+ // maximally compatible with the result in word_res.
+ // Returns 0.0f if no x-height is found that is better than the current
+ // estimate.
+ float ComputeCompatibleXheight(WERD_RES* word_res, float* baseline_shift);
+ //// Data members ///////////////////////////////////////////////////////
+ // TODO(ocr-team): Find and remove obsolete parameters.
+ BOOL_VAR_H(tessedit_resegment_from_boxes, false,
+ "Take segmentation and labeling from box file");
+ BOOL_VAR_H(tessedit_resegment_from_line_boxes, false,
+ "Conversion of word/line box file to char box file");
+ BOOL_VAR_H(tessedit_train_from_boxes, false,
+ "Generate training data from boxed chars");
+ BOOL_VAR_H(tessedit_make_boxes_from_boxes, false,
+ "Generate more boxes from boxed chars");
+ BOOL_VAR_H(tessedit_train_line_recognizer, false,
+ "Break input into lines and remap boxes if present");
+ BOOL_VAR_H(tessedit_dump_pageseg_images, false,
+ "Dump intermediate images made during page segmentation");
+ BOOL_VAR_H(tessedit_do_invert, true,
+ "Try inverting the image in `LSTMRecognizeWord`");
+ INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
+ "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
+ " 5=line, 6=word, 7=char"
+ " (Values from PageSegMode enum in tesseract/publictypes.h)");
+ INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
+ "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
+ " to loading and running the most accurate available.");
+ STRING_VAR_H(tessedit_char_blacklist, "",
+ "Blacklist of chars not to recognize");
+ STRING_VAR_H(tessedit_char_whitelist, "", "Whitelist of chars to recognize");
+ STRING_VAR_H(tessedit_char_unblacklist, "",
+ "List of chars to override tessedit_char_blacklist");
+ BOOL_VAR_H(tessedit_ambigs_training, false,
+ "Perform training for ambiguities");
+ INT_VAR_H(pageseg_devanagari_split_strategy,
+ tesseract::ShiroRekhaSplitter::NO_SPLIT,
+ "Whether to use the top-line splitting process for Devanagari "
+ "documents while performing page-segmentation.");
+ INT_VAR_H(ocr_devanagari_split_strategy,
+ tesseract::ShiroRekhaSplitter::NO_SPLIT,
+ "Whether to use the top-line splitting process for Devanagari "
+ "documents while performing ocr.");
+ STRING_VAR_H(tessedit_write_params_to_file, "",
+ "Write all parameters to the given file.");
+ BOOL_VAR_H(tessedit_adaption_debug, false,
+ "Generate and print debug information for adaption");
+ INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
+ INT_VAR_H(applybox_debug, 1, "Debug level");
+ INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
+ STRING_VAR_H(applybox_exposure_pattern, ".exp",
+ "Exposure value follows this pattern in the image"
+ " filename. The name of the image files are expected"
+ " to be in the form [lang].[fontname].exp[num].tif");
+ BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false,
+ "Learn both character fragments (as is done in the"
+ " special low exposure mode) as well as unfragmented"
+ " characters.");
+ BOOL_VAR_H(applybox_learn_ngrams_mode, false,
+ "Each bounding box is assumed to contain ngrams. Only"
+ " learn the ngrams whose outlines overlap horizontally.");
+ BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
+ BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
+ BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
+ BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces");
+ BOOL_VAR_H(tessedit_unrej_any_wd, false,
+ "Don't bother with word plausibility");
+ BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
+ BOOL_VAR_H(tessedit_enable_doc_dict, true,
+ "Add words to the document dictionary");
+ BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
+ BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
+ BOOL_VAR_H(tessedit_enable_bigram_correction, true,
+ "Enable correction based on the word bigram dictionary.");
+ BOOL_VAR_H(tessedit_enable_dict_correction, false,
+ "Enable single word correction based on the dictionary.");
+ INT_VAR_H(tessedit_bigram_debug, 0,
+ "Amount of debug output for bigram "
+ "correction.");
+ BOOL_VAR_H(enable_noise_removal, true,
+ "Remove and conditionally reassign small outlines when they"
+ " confuse layout analysis, determining diacritics vs noise");
+ INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
+ // Worst (min) certainty, for which a diacritic is allowed to make the base
+ // character worse and still be included.
+ double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
+ // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
+ // make the base character worse and still be included.
+ double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
+ // Worst (min) certainty, for which a diacritic is allowed to make a new
+ // stand-alone blob.
+ double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
+ // Factor of certainty margin for adding diacritics to not count as worse.
+ double_VAR_H(noise_cert_factor, 0.375,
+ "Scaling on certainty diff from Hingepoint");
+ INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
+ INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
+ INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
+ STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
+ STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
+ STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
+ double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
+ double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
+ double_VAR_H(quality_outline_pc, 1.0,
+ "good_quality_doc lte outline error limit");
+ double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
+ INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
+ INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
+ "Adaptation decision algorithm for tess");
+ BOOL_VAR_H(tessedit_minimal_rej_pass1, false,
+ "Do minimal rejection on pass 1 output");
+ BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
+ BOOL_VAR_H(test_pt, false, "Test for point");
+ double_VAR_H(test_pt_x, 99999.99, "xcoord");
+ double_VAR_H(test_pt_y, 99999.99, "ycoord");
+ INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
+ INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
+ BOOL_VAR_H(paragraph_text_based, true,
+ "Run paragraph detection on the post-text-recognition "
+ "(more accurate)");
+ BOOL_VAR_H(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm");
+ STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
+ STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
+ BOOL_VAR_H(tessedit_good_quality_unrej, true,
+ "Reduce rejection on good docs");
+ BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
+ double_VAR_H(tessedit_reject_doc_percent, 65.00,
+ "%rej allowed before rej whole doc");
+ double_VAR_H(tessedit_reject_block_percent, 45.00,
+ "%rej allowed before rej whole block");
+ double_VAR_H(tessedit_reject_row_percent, 40.00,
+ "%rej allowed before rej whole row");
+ double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00,
+ "Number of row rejects in whole word rejects "
+ "which prevents whole row rejection");
+ BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true,
+ "Only rej partially rejected words in block rejection");
+ BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true,
+ "Only rej partially rejected words in row rejection");
+ BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false,
+ "Use word segmentation quality metric");
+ BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false,
+ "Use word segmentation quality metric");
+ INT_VAR_H(tessedit_preserve_min_wd_len, 2,
+ "Only preserve wds longer than this");
+ BOOL_VAR_H(tessedit_row_rej_good_docs, true,
+ "Apply row rejection to good docs");
+ double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1,
+ "rej good doc wd if more than this fraction rejected");
+ BOOL_VAR_H(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds");
+ BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats");
+ BOOL_VAR_H(tessedit_debug_quality_metrics, false,
+ "Output data to debug file");
+ BOOL_VAR_H(bland_unrej, false, "unrej potential with no checks");
+ double_VAR_H(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit");
+ BOOL_VAR_H(unlv_tilde_crunching, false, "Mark v.bad words for tilde crunch");
+ BOOL_VAR_H(hocr_font_info, false, "Add font info to hocr output");
+ BOOL_VAR_H(hocr_char_boxes, false,
+ "Add coordinates for each character to hocr output");
+ BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
+ BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
+ double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
+ BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
+ double_VAR_H(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this");
+ double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
+ double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
+ double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
+ double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
+ double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
+ double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
+ double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
+ double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
+ double_VAR_H(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl");
+ double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
+ double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
+ INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
+ INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
+ BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
+ BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
+ BOOL_VAR_H(crunch_leave_accept_strings, false,
+ "Don't pot crunch sensible strings");
+ BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
+ INT_VAR_H(crunch_leave_lc_strings, 4,
+ "Don't crunch words with long lower case strings");
+ INT_VAR_H(crunch_leave_uc_strings, 4,
+ "Don't crunch words with long upper case strings");
+ INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
+ INT_VAR_H(crunch_debug, 0, "As it says");
+ INT_VAR_H(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?");
+ double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
+ BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctuation joins");
+ INT_VAR_H(fixsp_done_mode, 1, "What constitutes done for spacing");
+ INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
+ STRING_VAR_H(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers");
+ INT_VAR_H(x_ht_acceptance_tolerance, 8,
+ "Max allowed deviation of blob top outside of font data");
+ INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
+ INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer");
+ double_VAR_H(superscript_worse_certainty, 2.0,
+ "How many times worse "
+ "certainty does a superscript position glyph need to be for us "
+ "to try classifying it as a char with a different baseline?");
+ double_VAR_H(superscript_bettered_certainty, 0.97,
+ "What reduction in "
+ "badness do we think sufficient to choose a superscript over "
+ "what we'd thought. For example, a value of 0.6 means we want "
+ "to reduce badness of certainty by 40%");
+ double_VAR_H(superscript_scaledown_ratio, 0.4,
+ "A superscript scaled down more than this is unbelievably "
+ "small. For example, 0.3 means we expect the font size to "
+ "be no smaller than 30% of the text line font size.");
+ double_VAR_H(subscript_max_y_top, 0.5,
+ "Maximum top of a character measured as a multiple of x-height "
+ "above the baseline for us to reconsider whether it's a "
+ "subscript.");
+ double_VAR_H(superscript_min_y_bottom, 0.3,
+ "Minimum bottom of a character measured as a multiple of "
+ "x-height above the baseline for us to reconsider whether it's "
+ "a superscript.");
+ BOOL_VAR_H(tessedit_write_block_separators, false,
+ "Write block separators in output");
+ BOOL_VAR_H(tessedit_write_rep_codes, false, "Write repetition char code");
+ BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
+ BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
+ BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
+ BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
+ BOOL_VAR_H(tessedit_create_lstmbox, false,
+ "Write .box file for LSTM training");
+ BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
+ BOOL_VAR_H(tessedit_create_wordstrbox, false,
+ "Write WordStr format .box output file");
+ BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
+ BOOL_VAR_H(textonly_pdf, false,
+ "Create PDF with only one invisible text layer");
+ INT_VAR_H(jpg_quality, 85, "Set JPEG quality level");
+ INT_VAR_H(user_defined_dpi, 0, "Specify DPI for input image");
+ INT_VAR_H(min_characters_to_try, 50,
+ "Specify minimum characters to try during OSD");
+ STRING_VAR_H(unrecognised_char, "|", "Output char for unidentified blobs");
+ INT_VAR_H(suspect_level, 99, "Suspect marker level");
+ INT_VAR_H(suspect_short_words, 2, "Don't Suspect dict wds longer than this");
+ BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
+ double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
+ double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
+ BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
+ BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
+ BOOL_VAR_H(tessedit_word_for_word, false,
+ "Make output have exactly one word per WERD");
+ BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
+ "Don't reject ANYTHING AT ALL");
+ INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
+ BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
+ BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
+ double_VAR_H(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test");
+ double_VAR_H(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test");
+ BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
+ BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
+ BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
+ BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
+ BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
+ BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
+ BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
+ BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
+ double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract");
+ INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
+ STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej");
+ STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
+ INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
+ BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
+ INT_VAR_H(tessedit_page_number, -1,
+ "-1 -> All pages, else specific page to process");
+ BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
+ BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
+ STRING_VAR_H(file_type, ".tif", "Filename extension");
+ BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
+ STRING_VAR_H(tessedit_load_sublangs, "",
+ "List of languages to load with this one");
+ BOOL_VAR_H(tessedit_use_primary_params_model, false,
+ "In multilingual mode use params model of the primary language");
+ // Min acceptable orientation margin (difference in scores between top and 2nd
+ // choice in OSResults::orientations) to believe the page orientation.
+ double_VAR_H(min_orientation_margin, 7.0,
+ "Min acceptable orientation margin");
+ BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
+ BOOL_VAR_H(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model");
+ BOOL_VAR_H(poly_allow_detailed_fx, false,
+ "Allow feature extractors to see the original outline");
+ BOOL_VAR_H(tessedit_init_config_only, false,
+ "Only initialize with the config file. Useful if the instance is "
+ "not going to be used for OCR but say only for layout analysis.");
+ BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
+ BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
+ BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
+ "Force using vertical text page mode");
+ double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
+ "Fraction of textlines deemed vertical to use vertical page "
+ "mode");
+ double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
+ "Fraction of height used as a minimum gap for aligned blobs.");
+ INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
+ BOOL_VAR_H(preserve_interword_spaces, false,
+ "Preserve multiple interword spaces");
+ STRING_VAR_H(page_separator, "\f",
+ "Page separator (default is form feed control character)");
+ INT_VAR_H(lstm_choice_mode, 0,
+ "Allows to include alternative symbols choices in the hOCR "
+ "output. "
+ "Valid input values are 0, 1 and 2. 0 is the default value. "
+ "With 1 the alternative symbol choices per timestep are included. "
+ "With 2 the alternative symbol choices are extracted from the CTC "
+ "process instead of the lattice. The choices are mapped per "
+ "character.");
+ INT_VAR_H(lstm_choice_iterations, 5,
+ "Sets the number of cascading iterations for the Beamsearch in "
+ "lstm_choice_mode. Note that lstm_choice_mode must be set to "
+ "a value greater than 0 to produce results.");
+ double_VAR_H(lstm_rating_coefficient, 5,
+ "Sets the rating coefficient for the lstm choices. The smaller "
+ "the coefficient, the better are the ratings for each choice "
+ "and less information is lost due to the cut off at 0. The "
+ "standard value is 5.");
+ BOOL_VAR_H(pageseg_apply_music_mask, true,
+ "Detect music staff and remove intersecting components");
+
+ //// ambigsrecog.cpp /////////////////////////////////////////////////////////
+ FILE* init_recog_training(const char* filename);
+ void recog_training_segmented(const char* filename, PAGE_RES* page_res,
+ volatile ETEXT_DESC* monitor,
+ FILE* output_file);
+ void ambigs_classify_and_output(const char* label, PAGE_RES_IT* pr_it,
+ FILE* output_file);
+
+ private:
+ // The filename of a backup config file. If not null, then we currently
+ // have a temporary debug config file loaded, and backup_config_file_
+ // will be loaded, and set to null when debug is complete.
+ const char* backup_config_file_;
+ // The filename of a config file to read when processing a debug word.
+ STRING word_config_;
+ // Image used for input to layout analysis and tesseract recognition.
+ // May be modified by the ShiroRekhaSplitter to eliminate the top-line.
+ Pix* pix_binary_;
+ // Grey-level input image if the input was not binary, otherwise nullptr.
+ Pix* pix_grey_;
+ // Original input image. Color if the input was color.
+ Pix* pix_original_;
+ // Thresholds that were used to generate the thresholded image from grey.
+ Pix* pix_thresholds_;
+ // Debug images. If non-empty, will be written on destruction.
+ DebugPixa pixa_debug_;
+ // Input image resolution after any scaling. The resolution is not well
+ // transmitted by operations on Pix, so we keep an independent record here.
+ int source_resolution_;
+ // The shiro-rekha splitter object which is used to split top-lines in
+ // Devanagari words to provide a better word and grapheme segmentation.
+ ShiroRekhaSplitter splitter_;
+ // Page segmentation/layout
+ Textord textord_;
+ // True if the primary language uses right_to_left reading order.
+ bool right_to_left_;
+ Pix* scaled_color_;
+ int scaled_factor_;
+ FCOORD deskew_;
+ FCOORD reskew_;
+ TesseractStats stats_;
+ // Sub-languages to be tried in addition to this.
+ std::vector<Tesseract*> sub_langs_;
+ // Most recently used Tesseract out of this and sub_langs_. The default
+ // language for the next word.
+ Tesseract* most_recently_used_;
+ // The size of the font table, ie max possible font id + 1.
+ int font_table_size_;
+ // Equation detector. Note: this pointer is NOT owned by the class.
+ EquationDetect* equ_detect_;
+ // LSTM recognizer, if available.
+ LSTMRecognizer* lstm_recognizer_;
+ // Output "page" number (actually line number) using TrainLineRecognizer.
+ int train_line_page_num_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_
diff --git a/tesseract/src/ccmain/tessvars.cpp b/tesseract/src/ccmain/tessvars.cpp
new file mode 100644
index 00000000..f72b0c27
--- /dev/null
+++ b/tesseract/src/ccmain/tessvars.cpp
@@ -0,0 +1,24 @@
+/**********************************************************************
+ * File: tessvars.cpp (Formerly tessvars.c)
+ * Description: Variables and other globals for tessedit.
+ * Author: Ray Smith
+ * Created: Mon Apr 13 13:13:23 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cstdio>
+
+#include "tessvars.h"
+
+FILE *debug_fp = stderr;  // Global destination for debug output; defaults to stderr.
diff --git a/tesseract/src/ccmain/tessvars.h b/tesseract/src/ccmain/tessvars.h
new file mode 100644
index 00000000..8c063a11
--- /dev/null
+++ b/tesseract/src/ccmain/tessvars.h
@@ -0,0 +1,27 @@
+/**********************************************************************
+ * File: tessvars.h (Formerly tessvars.h)
+ * Description: Variables and other globals for tessedit.
+ * Author: Ray Smith
+ * Created: Mon Apr 13 13:13:23 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSVARS_H
+#define TESSVARS_H
+
+#include <cstdio>
+
+extern FILE *debug_fp; // write debug stuff here
+
+#endif
diff --git a/tesseract/src/ccmain/tfacepp.cpp b/tesseract/src/ccmain/tfacepp.cpp
new file mode 100644
index 00000000..e5bbb4e4
--- /dev/null
+++ b/tesseract/src/ccmain/tfacepp.cpp
@@ -0,0 +1,322 @@
+/**********************************************************************
+ * File: tfacepp.cpp (Formerly tface++.c)
+ * Description: C++ side of the C/C++ Tess/Editor interface.
+ * Author: Ray Smith
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <cmath>
+
+#include "blamer.h"
+#include "errcode.h"
+#include "ratngs.h"
+#include "reject.h"
+#include "tesseractclass.h"
+#include "werd.h"
+
+#define MAX_UNDIVIDED_LENGTH 24
+
+
+
+/**********************************************************************
+ * recog_word
+ *
+ * Convert the word to tess form and pass it to the tess segmenter.
+ * Convert the output back to editor form.
+ **********************************************************************/
+namespace tesseract {
+// Recognizes a single word in place: runs the (possibly recursive) segmenter
+// on word->chopped_word, verifies that the resulting best_choice matches the
+// blob segmentation, optionally overrides the permuter type with a straight
+// dictionary check, and finally sets word->tess_failed if the result is
+// empty or all-spaces.
+void Tesseract::recog_word(WERD_RES *word) {
+  // Training-only shortcut: words without ground truth can be skipped.
+  if (wordrec_skip_no_truth_words && (word->blamer_bundle == nullptr ||
+      word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
+    if (classify_debug_level) tprintf("No truth for word - skipping\n");
+    word->tess_failed = true;
+    return;
+  }
+  ASSERT_HOST(!word->chopped_word->blobs.empty());
+  recog_word_recursive(word);
+  word->SetupBoxWord();
+  // The number of unichars in the best choice must equal the number of
+  // recognized blob boxes; print diagnostics before the hard assert below.
+  if (word->best_choice->length() != word->box_word->length()) {
+    tprintf("recog_word ASSERT FAIL String:\"%s\"; "
+            "Strlen=%d; #Blobs=%d\n",
+            word->best_choice->debug_string().c_str(),
+            word->best_choice->length(), word->box_word->length());
+  }
+  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
+  // Check that the ratings matrix size matches the sum of all the
+  // segmentation states.
+  if (!word->StatesAllValid()) {
+    tprintf("Not all words have valid states relative to ratings matrix!!");
+    word->DebugWordChoices(true, nullptr);
+    ASSERT_HOST(word->StatesAllValid());
+  }
+  if (tessedit_override_permuter) {
+    /* Override the permuter type if a straight dictionary check disagrees. */
+    uint8_t perm_type = word->best_choice->permuter();
+    // Only upgrade non-dictionary permuter results; an existing dictionary
+    // permuter type is left alone.
+    if ((perm_type != SYSTEM_DAWG_PERM) &&
+        (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
+      uint8_t real_dict_perm_type = dict_word(*word->best_choice);
+      if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
+           (real_dict_perm_type == FREQ_DAWG_PERM) ||
+           (real_dict_perm_type == USER_DAWG_PERM)) &&
+          (alpha_count(word->best_choice->unichar_string().c_str(),
+                       word->best_choice->unichar_lengths().c_str()) > 0)) {
+        word->best_choice->set_permuter(real_dict_perm_type);  // use dict perm
+      }
+    }
+    if (tessedit_rejection_debug &&
+        perm_type != word->best_choice->permuter()) {
+      tprintf("Permuter Type Flipped from %d to %d\n",
+              perm_type, word->best_choice->permuter());
+    }
+  }
+  // Factored out from control.cpp
+  // An empty or all-space best choice counts as a recognition failure and
+  // rejects the whole word.
+  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
+  if (word->best_choice == nullptr || word->best_choice->length() == 0 ||
+      static_cast<int>(strspn(word->best_choice->unichar_string().c_str(),
+                              " ")) == word->best_choice->length()) {
+    word->tess_failed = true;
+    word->reject_map.initialise(word->box_word->length());
+    word->reject_map.rej_word_tess_failure();
+  } else {
+    word->tess_failed = false;
+  }
+}
+
+
+/**********************************************************************
+ * recog_word_recursive
+ *
+ * Convert the word to tess form and pass it to the tess segmenter.
+ * Convert the output back to editor form.
+ **********************************************************************/
+// Runs the core segmenter on the word, splitting it first if it has too many
+// blobs to be handled in one piece, then fixes up any length mismatch
+// between the resulting best_choice and the rebuilt blobs.
+void Tesseract::recog_word_recursive(WERD_RES *word) {
+  // Oversized words are cut at the widest gap and processed in halves.
+  const int num_chopped_blobs = word->chopped_word->NumBlobs();
+  if (num_chopped_blobs > MAX_UNDIVIDED_LENGTH) {
+    split_and_recog_word(word);
+    return;
+  }
+  cc_recog(word);
+  const int num_output_blobs = word->rebuild_word->NumBlobs();
+
+  // Sanity check: a choice with more characters than blobs should never
+  // happen; discard it so downstream asserts don't fire.
+  if (word->best_choice->length() > num_output_blobs) {
+    word->best_choice->make_bad();  // should never happen
+    tprintf("recog_word: Discarded long string \"%s\""
+            " (%d characters vs %d blobs)\n",
+            word->best_choice->unichar_string().c_str(),
+            word->best_choice->length(), num_output_blobs);
+    tprintf("Word is at:");
+    word->word->bounding_box().print();
+  }
+  // Pad a too-short choice with spaces so the lengths always agree.
+  if (word->best_choice->length() < num_output_blobs) {
+    const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
+    do {
+      word->best_choice->append_unichar_id(space_id, 1, 0.0,
+                                           word->best_choice->certainty());
+    } while (word->best_choice->length() < num_output_blobs);
+  }
+}
+
+
+/**********************************************************************
+ * split_and_recog_word
+ *
+ * Split the word into 2 smaller pieces at the largest gap.
+ * Recognize the pieces and stick the results back together.
+ **********************************************************************/
+// Splits the word in two at the widest inter-blob gap, recognizes each half
+// recursively, and joins the results back into a single word.
+void Tesseract::split_and_recog_word(WERD_RES *word) {
+  // Locate the widest horizontal gap between adjacent blobs; that is the
+  // most plausible place to cut the word.
+  int widest_gap = -INT32_MAX;
+  int split_index = 0;
+  const int num_blobs = word->chopped_word->NumBlobs();
+  for (int b = 1; b < num_blobs; ++b) {
+    const TBOX left_box = word->chopped_word->blobs[b - 1]->bounding_box();
+    const TBOX right_box = word->chopped_word->blobs[b]->bounding_box();
+    const int gap = right_box.left() - left_box.right();
+    if (gap > widest_gap) {
+      widest_gap = gap;
+      split_index = b;
+    }
+  }
+  ASSERT_HOST(split_index > 0);
+
+  // Carve off the right-hand piece (and the original blamer bundle).
+  WERD_RES *right_word = nullptr;
+  BlamerBundle *saved_bundle = nullptr;
+  split_word(word, split_index, &right_word, &saved_bundle);
+
+  // Recognize each half independently, then stitch the results together.
+  recog_word_recursive(word);
+  recog_word_recursive(right_word);
+  join_words(word, right_word, saved_bundle);
+}
+
+
+/**********************************************************************
+ * split_word
+ *
+ * Split a given WERD_RES in place into two smaller words for recognition.
+ * split_pt is the index of the first blob to go in the second word.
+ * The underlying word is left alone, only the TWERD (and subsequent data)
+ * are split up. orig_blamer_bundle is set to the original blamer bundle,
+ * and will now be owned by the caller. New blamer bundles are forged for the
+ * two pieces.
+ **********************************************************************/
+/**********************************************************************
+ * split_word
+ *
+ * Split a given WERD_RES in place into two smaller words for recognition.
+ * split_pt is the index of the first blob to go in the second word.
+ * The underlying word is left alone, only the TWERD (and subsequent data)
+ * are split up. orig_blamer_bundle is set to the original blamer bundle,
+ * and will now be owned by the caller. New blamer bundles are forged for the
+ * two pieces.
+ **********************************************************************/
+void Tesseract::split_word(WERD_RES *word,
+                           int split_pt,
+                           WERD_RES **right_piece,
+                           BlamerBundle **orig_blamer_bundle) const {
+  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
+
+  // Save a copy of the blamer bundle so we can try to reconstruct it below.
+  BlamerBundle *orig_bb =
+      word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
+
+  auto *word2 = new WERD_RES(*word);
+
+  // blow away the copied chopped_word, as we want to work with
+  // the blobs from the input chopped_word so seam_arrays can be merged.
+  // The blobs at index >= split_pt are moved (by pointer, not copied) into
+  // chopped2; word keeps the first split_pt blobs.
+  TWERD *chopped = word->chopped_word;
+  auto *chopped2 = new TWERD;
+  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
+  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
+    chopped2->blobs.push_back(chopped->blobs[i]);
+  }
+  chopped->blobs.truncate(split_pt);
+  // Detach chopped_word from both words before ClearResults() below so the
+  // shared blobs are not freed; word2's copied TWERD is no longer needed.
+  word->chopped_word = nullptr;
+  delete word2->chopped_word;
+  word2->chopped_word = nullptr;
+
+  const UNICHARSET &unicharset = *word->uch_set;
+  word->ClearResults();
+  word2->ClearResults();
+  // Re-attach the split TWERDs and rebuild the derived data for each half.
+  word->chopped_word = chopped;
+  word2->chopped_word = chopped2;
+  word->SetupBasicsFromChoppedWord(unicharset);
+  word2->SetupBasicsFromChoppedWord(unicharset);
+
+  // Try to adjust the blamer bundle.
+  if (orig_bb != nullptr) {
+    // TODO(rays) Looks like a leak to me.
+    // orig_bb should take, rather than copy.
+    word->blamer_bundle = new BlamerBundle();
+    word2->blamer_bundle = new BlamerBundle();
+    // Split the truth data at the midpoint of the gap between the halves.
+    orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
+                         word2->chopped_word->blobs[0]->bounding_box().left(),
+                         wordrec_debug_blamer,
+                         word->blamer_bundle, word2->blamer_bundle);
+  }
+
+  // Ownership of word2 and orig_bb passes to the caller (see join_words).
+  *right_piece = word2;
+  *orig_blamer_bundle = orig_bb;
+}
+
+
+/**********************************************************************
+ * join_words
+ *
+ * The opposite of split_word():
+ * join word2 (including any recognized data / seam array / etc)
+ * onto the right of word and then delete word2.
+ * Also, if orig_bb is provided, stitch it back into word.
+ **********************************************************************/
+/**********************************************************************
+ * join_words
+ *
+ * The opposite of split_word():
+ * join word2 (including any recognized data / seam array / etc)
+ * onto the right of word and then delete word2.
+ * Also, if orig_bb is provided, stitch it back into word.
+ **********************************************************************/
+void Tesseract::join_words(WERD_RES *word,
+                           WERD_RES *word2,
+                           BlamerBundle *orig_bb) const {
+  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
+  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
+  // Tack the word2 outputs onto the end of the word outputs.
+  // Blob pointers are transferred, so word2's lists are cleared afterwards
+  // to prevent double-deletion when word2 is deleted below.
+  word->chopped_word->blobs += word2->chopped_word->blobs;
+  word->rebuild_word->blobs += word2->rebuild_word->blobs;
+  word2->chopped_word->blobs.clear();
+  word2->rebuild_word->blobs.clear();
+  // The artificial seam between the halves sits midway in the gap.
+  TPOINT split_pt;
+  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
+  split_pt.y = (prev_box.top() + prev_box.bottom() +
+                blob_box.top() + blob_box.bottom()) / 4;
+  // Move the word2 seams onto the end of the word1 seam_array.
+  // Since the seam list is one element short, an empty seam marking the
+  // end of the last blob in the first word is needed first.
+  word->seam_array.push_back(new SEAM(0.0f, split_pt));
+  word->seam_array += word2->seam_array;
+  word2->seam_array.truncate(0);
+  // Fix widths and gaps.
+  word->blob_widths += word2->blob_widths;
+  word->blob_gaps += word2->blob_gaps;
+  // Fix the ratings matrix.
+  int rat1 = word->ratings->dimension();
+  int rat2 = word2->ratings->dimension();
+  word->ratings->AttachOnCorner(word2->ratings);
+  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
+  word->best_state += word2->best_state;
+  // Append the word choices.
+  *word->raw_choice += *word2->raw_choice;
+
+  // How many alt choices from each should we try to get?
+  const int kAltsPerPiece = 2;
+  // When do we start throwing away extra alt choices?
+  const int kTooManyAltChoices = 100;
+
+  // Construct the cartesian product of the best_choices of word(1) and word2.
+  WERD_CHOICE_LIST joined_choices;
+  WERD_CHOICE_IT jc_it(&joined_choices);
+  WERD_CHOICE_IT bc1_it(&word->best_choices);
+  WERD_CHOICE_IT bc2_it(&word2->best_choices);
+  int num_word1_choices = word->best_choices.length();
+  int total_joined_choices = num_word1_choices;
+  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
+  // word2 choices, and put them in the joined_choices list. The 1st word2
+  // choice gets added to the original word1 choices in-place after we have
+  // finished with them.
+  int bc2_index = 1;
+  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
+    // Once there are too many alternatives, only combine the top few of each.
+    if (total_joined_choices >= kTooManyAltChoices &&
+        bc2_index > kAltsPerPiece)
+      break;
+    int bc1_index = 0;
+    for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
+         ++bc1_index, bc1_it.forward()) {
+      if (total_joined_choices >= kTooManyAltChoices &&
+          bc1_index > kAltsPerPiece)
+        break;
+      // Concatenate one word1 choice with one word2 choice.
+      auto *wc = new WERD_CHOICE(*bc1_it.data());
+      *wc += *bc2_it.data();
+      jc_it.add_after_then_move(wc);
+      ++total_joined_choices;
+    }
+  }
+  // Now that we've filled in as many alternates as we want, paste the best
+  // choice for word2 onto the original word alt_choices.
+  bc1_it.move_to_first();
+  bc2_it.move_to_first();
+  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
+    *bc1_it.data() += *bc2_it.data();
+  }
+  bc1_it.move_to_last();
+  bc1_it.add_list_after(&joined_choices);
+
+  // Restore the pointer to original blamer bundle and combine blamer
+  // information recorded in the splits.
+  if (orig_bb != nullptr) {
+    orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
+                        wordrec_debug_blamer);
+    delete word->blamer_bundle;
+    word->blamer_bundle = orig_bb;
+  }
+  // Rebuild the box_word and reject map for the now-combined word.
+  word->SetupBoxWord();
+  word->reject_map.initialise(word->box_word->length());
+  delete word2;
+}
+
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/thresholder.cpp b/tesseract/src/ccmain/thresholder.cpp
new file mode 100644
index 00000000..e3934ea6
--- /dev/null
+++ b/tesseract/src/ccmain/thresholder.cpp
@@ -0,0 +1,334 @@
+///////////////////////////////////////////////////////////////////////
+// File: thresholder.cpp
+// Description: Base API for thresholding images in tesseract.
+// Author: Ray Smith
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "allheaders.h"
+
+#include <tesseract/thresholder.h>
+
+#include <cstdint> // for uint32_t
+#include <cstring>
+
+#include "otsuthr.h"
+#include "tprintf.h" // for tprintf
+
+#if defined(USE_OPENCL)
+#include "openclwrapper.h" // for OpenclDevice
+#endif
+
+namespace tesseract {
+
+// Constructs an empty thresholder: no image, unit scale, and a default
+// resolution of 300 dpi until a real image supplies one.
+ImageThresholder::ImageThresholder()
+  : pix_(nullptr),
+    image_width_(0), image_height_(0),
+    pix_channels_(0), pix_wpl_(0),
+    scale_(1), yres_(300), estimated_res_(300) {
+  // Start with an empty processing rectangle; SetImage()/Init() set the
+  // real one.
+  SetRectangle(0, 0, 0, 0);
+}
+
+// Destructor releases the owned Pix (if any) via Clear().
+ImageThresholder::~ImageThresholder() {
+  Clear();
+}
+
+// Destroy the Pix if there is one, freeing memory.
+// Safe to call when no image is set (pixDestroy handles nullptr).
+void ImageThresholder::Clear() {
+  pixDestroy(&pix_);
+}
+
+// Return true if no image has been set.
+bool ImageThresholder::IsEmpty() const {
+  return pix_ == nullptr;
+}
+
+// SetImage makes a copy of all the image data, so it may be deleted
+// immediately after this call.
+// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
+// Palette color images will not work properly and must be converted to
+// 24 bit.
+// Binary images of 1 bit per pixel may also be given but they must be
+// byte packed with the MSB of the first byte being the first pixel, and a
+// one pixel is WHITE. For binary images set bytes_per_pixel=0.
+void ImageThresholder::SetImage(const unsigned char* imagedata,
+                                int width, int height,
+                                int bytes_per_pixel, int bytes_per_line) {
+  int bpp = bytes_per_pixel * 8;
+  if (bpp == 0) bpp = 1;
+  // 24bpp input is stored as a 32bpp Pix (Leptonica has no 24bpp format).
+  Pix* pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
+  l_uint32* data = pixGetData(pix);
+  int wpl = pixGetWpl(pix);
+  switch (bpp) {
+  case 1:
+    // Input uses 1 = WHITE; Pix uses 1 = black, so invert each bit.
+    for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
+      for (int x = 0; x < width; ++x) {
+        if (imagedata[x / 8] & (0x80 >> (x % 8)))
+          CLEAR_DATA_BIT(data, x);
+        else
+          SET_DATA_BIT(data, x);
+      }
+    }
+    break;
+
+  case 8:
+    // Greyscale just copies the bytes in the right order.
+    for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
+      for (int x = 0; x < width; ++x)
+        SET_DATA_BYTE(data, x, imagedata[x]);
+    }
+    break;
+
+  case 24:
+    // Put the colors in the correct places in the line buffer.
+    // data advances one word per pixel; for a 32bpp Pix wpl == width, so no
+    // separate per-row advance is needed.
+    for (int y = 0; y < height; ++y, imagedata += bytes_per_line) {
+      for (int x = 0; x < width; ++x, ++data) {
+        SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
+        SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
+        SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
+      }
+    }
+    break;
+
+  case 32:
+    // Maintain byte order consistency across different endianness.
+    for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) {
+      for (int x = 0; x < width; ++x) {
+        data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |
+                  (imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];
+      }
+    }
+    break;
+
+  default:
+    tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp);
+  }
+  // Delegate to the Pix overload, which copies/normalizes; the temporary
+  // Pix is then destroyed here.
+  SetImage(pix);
+  pixDestroy(&pix);
+}
+
+// Store the coordinates of the rectangle to process for later use.
+// Doesn't actually do any thresholding.
+// Coordinates are top-down, relative to the full image.
+void ImageThresholder::SetRectangle(int left, int top, int width, int height) {
+  rect_left_ = left;
+  rect_top_ = top;
+  rect_width_ = width;
+  rect_height_ = height;
+}
+
+// Get enough parameters to be able to rebuild bounding boxes in the
+// original image (not just within the rectangle).
+// Left and top are enough with top-down coordinates, but
+// the height of the rectangle and the image are needed for bottom-up.
+// All six output pointers must be non-null.
+void ImageThresholder::GetImageSizes(int* left, int* top,
+                                     int* width, int* height,
+                                     int* imagewidth, int* imageheight) {
+  *left = rect_left_;
+  *top = rect_top_;
+  *width = rect_width_;
+  *height = rect_height_;
+  *imagewidth = image_width_;
+  *imageheight = image_height_;
+}
+
+// Pix vs raw, which to use? Pix is the preferred input for efficiency,
+// since raw buffers are copied.
+// SetImage for Pix clones its input, so the source pix may be pixDestroyed
+// immediately after, but may not go away until after the Thresholder has
+// finished with it.
+void ImageThresholder::SetImage(const Pix* pix) {
+  // Release any previously held image first.
+  if (pix_ != nullptr)
+    pixDestroy(&pix_);
+  // Leptonica's API is not const-correct, hence the const_cast; src is
+  // never modified below, only read/copied.
+  Pix* src = const_cast<Pix*>(pix);
+  int depth;
+  pixGetDimensions(src, &image_width_, &image_height_, &depth);
+  // Convert the image as necessary so it is one of binary, plain RGB, or
+  // 8 bit with no colormap. Guarantee that we always end up with our own copy,
+  // not just a clone of the input.
+  if (pixGetColormap(src)) {
+    // pixRemoveColormap returns a new Pix, so ownership is ours either way.
+    Pix* tmp = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC);
+    depth = pixGetDepth(tmp);
+    if (depth > 1 && depth < 8) {
+      pix_ = pixConvertTo8(tmp, false);
+      pixDestroy(&tmp);
+    } else {
+      pix_ = tmp;
+    }
+  } else if (depth > 1 && depth < 8) {
+    pix_ = pixConvertTo8(src, false);
+  } else {
+    pix_ = pixCopy(nullptr, src);
+  }
+  // Cache the derived layout info used by the thresholding loops.
+  depth = pixGetDepth(pix_);
+  pix_channels_ = depth / 8;  // 0 for binary, 1 for grey, 4 for RGB(A).
+  pix_wpl_ = pixGetWpl(pix_);
+  scale_ = 1;
+  estimated_res_ = yres_ = pixGetYRes(pix_);
+  Init();
+}
+
+// Threshold the source image as efficiently as possible to the output Pix.
+// Creates a Pix and sets pix to point to the resulting pointer.
+// Caller must use pixDestroy to free the created Pix.
+/// Returns false on error.
+bool ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Pix** pix) {
+  // Downstream coordinates are stored in 16-bit fields, so reject images
+  // that would overflow them.
+  if (image_width_ > INT16_MAX || image_height_ > INT16_MAX) {
+    tprintf("Image too large: (%d, %d)\n", image_width_, image_height_);
+    return false;
+  }
+  if (pix_channels_ == 0) {
+    // We have a binary image, but it still has to be copied, as this API
+    // allows the caller to modify the output.
+    Pix* original = GetPixRect();
+    *pix = pixCopy(nullptr, original);
+    pixDestroy(&original);
+  } else {
+    // Grey or color: compute per-channel Otsu thresholds and binarize.
+    OtsuThresholdRectToPix(pix_, pix);
+  }
+  return true;
+}
+
+// Gets a pix that contains an 8 bit threshold value at each pixel. The
+// returned pix may be an integer reduction of the binary image such that
+// the scale factor may be inferred from the ratio of the sizes, even down
+// to the extreme of a 1x1 pixel thresholds image.
+// Ideally the 8 bit threshold should be the exact threshold used to generate
+// the binary image in ThresholdToPix, but this is not a hard constraint.
+// Returns nullptr if the input is binary. PixDestroy after use.
+Pix* ImageThresholder::GetPixRectThresholds() {
+  if (IsBinary()) return nullptr;
+  Pix* pix_grey = GetPixRectGrey();
+  int width = pixGetWidth(pix_grey);
+  int height = pixGetHeight(pix_grey);
+  // OtsuThreshold allocates the thresholds/hi_values arrays with new[];
+  // they are freed below with delete[].
+  int* thresholds;
+  int* hi_values;
+  OtsuThreshold(pix_grey, 0, 0, width, height, &thresholds, &hi_values);
+  pixDestroy(&pix_grey);
+  // Here a single global threshold is used for the whole rectangle,
+  // falling back to 128 if Otsu produced no usable value.
+  Pix* pix_thresholds = pixCreate(width, height, 8);
+  int threshold = thresholds[0] > 0 ? thresholds[0] : 128;
+  pixSetAllArbitrary(pix_thresholds, threshold);
+  delete [] thresholds;
+  delete [] hi_values;
+  return pix_thresholds;
+}
+
+// Common initialization shared between SetImage methods.
+// Resets the processing rectangle to cover the entire image.
+void ImageThresholder::Init() {
+  SetRectangle(0, 0, image_width_, image_height_);
+}
+
+// Get a clone/copy of the source image rectangle.
+// The returned Pix must be pixDestroyed.
+// This function will be used in the future by the page layout analysis, and
+// the layout analysis that uses it will only be available with Leptonica,
+// so there is no raw equivalent.
+Pix* ImageThresholder::GetPixRect() {
+  // A full-image rectangle needs no cropping - a cheap clone suffices.
+  if (IsFullImage()) {
+    return pixClone(pix_);
+  }
+  // Otherwise clip the stored rectangle out of the source image.
+  Box* clip_box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);
+  Pix* rect_pix = pixClipRectangle(pix_, clip_box, nullptr);
+  boxDestroy(&clip_box);
+  return rect_pix;
+}
+
+// Get a clone/copy of the source image rectangle, reduced to greyscale,
+// and at the same resolution as the output binary.
+// The returned Pix must be pixDestroyed.
+// Provided to the classifier to extract features from the greyscale image.
+Pix* ImageThresholder::GetPixRectGrey() {
+  auto pix = GetPixRect();  // May have to be reduced to grey.
+  int depth = pixGetDepth(pix);
+  if (depth != 8) {
+    // 24bpp must go through 32bpp before an 8-bit conversion is possible.
+    if (depth == 24) {
+      auto tmp = pixConvert24To32(pix);
+      pixDestroy(&pix);
+      pix = tmp;
+    }
+    auto result = pixConvertTo8(pix, false);
+    pixDestroy(&pix);
+    return result;
+  }
+  // Already 8-bit grey: return the rectangle as-is.
+  return pix;
+}
+
+// Otsu thresholds the rectangle, taking the rectangle from *this.
+// Computes per-channel thresholds and hi_values and then binarizes either
+// on the OpenCL device (when compiled in and selected) or on the CPU.
+void ImageThresholder::OtsuThresholdRectToPix(Pix* src_pix,
+                                              Pix** out_pix) const {
+  // OtsuThreshold allocates these with new[]; freed below with delete[].
+  int* thresholds;
+  int* hi_values;
+
+  int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_,
+                                   rect_height_, &thresholds, &hi_values);
+  // only use opencl if compiled w/ OpenCL and selected device is opencl
+#ifdef USE_OPENCL
+  // The OpenCL path only supports 4-channel full-left/top rectangles.
+  OpenclDevice od;
+  if (num_channels == 4 &&
+      od.selectedDeviceIsOpenCL() && rect_top_ == 0 && rect_left_ == 0) {
+    od.ThresholdRectToPixOCL((unsigned char*)pixGetData(src_pix), num_channels,
+                             pixGetWpl(src_pix) * 4, thresholds, hi_values,
+                             out_pix /*pix_OCL*/, rect_height_, rect_width_,
+                             rect_top_, rect_left_);
+  } else {
+#endif
+    ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix);
+#ifdef USE_OPENCL
+  }
+#endif
+  delete [] thresholds;
+  delete [] hi_values;
+}
+
+/// Threshold the rectangle, taking everything except the src_pix
+/// from the class, using thresholds/hi_values to the output pix.
+/// NOTE that num_channels is the size of the thresholds and hi_values
+// arrays and also the bytes per pixel in src_pix.
+/// A pixel is white only if every enabled channel agrees it is white;
+/// hi_values[ch] < 0 disables channel ch, hi_values[ch] == 0 inverts it.
+void ImageThresholder::ThresholdRectToPix(Pix* src_pix,
+                                          int num_channels,
+                                          const int* thresholds,
+                                          const int* hi_values,
+                                          Pix** pix) const {
+  *pix = pixCreate(rect_width_, rect_height_, 1);
+  uint32_t* pixdata = pixGetData(*pix);
+  int wpl = pixGetWpl(*pix);
+  int src_wpl = pixGetWpl(src_pix);
+  uint32_t* srcdata = pixGetData(src_pix);
+  // Propagate resolution so downstream consumers see the source dpi.
+  pixSetXRes(*pix, pixGetXRes(src_pix));
+  pixSetYRes(*pix, pixGetYRes(src_pix));
+  for (int y = 0; y < rect_height_; ++y) {
+    const uint32_t* linedata = srcdata + (y + rect_top_) * src_wpl;
+    uint32_t* pixline = pixdata + y * wpl;
+    for (int x = 0; x < rect_width_; ++x) {
+      bool white_result = true;
+      for (int ch = 0; ch < num_channels; ++ch) {
+        // Channel bytes are packed num_channels per pixel within the line.
+        int pixel =
+            GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch);
+        if (hi_values[ch] >= 0 &&
+            (pixel > thresholds[ch]) == (hi_values[ch] == 0)) {
+          white_result = false;
+          break;
+        }
+      }
+      // Pix convention: cleared bit = white, set bit = black.
+      if (white_result)
+        CLEAR_DATA_BIT(pixline, x);
+      else
+        SET_DATA_BIT(pixline, x);
+    }
+  }
+}
+
+} // namespace tesseract.
diff --git a/tesseract/src/ccmain/werdit.cpp b/tesseract/src/ccmain/werdit.cpp
new file mode 100644
index 00000000..17834023
--- /dev/null
+++ b/tesseract/src/ccmain/werdit.cpp
@@ -0,0 +1,68 @@
+/**********************************************************************
+ * File: werdit.cpp (Formerly wordit.c)
+ * Description: An iterator for passing over all the words in a document.
+ * Author: Ray Smith
+ * Created: Mon Apr 27 08:51:22 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "werdit.h"
+
+#include "errcode.h" // for ASSERT_HOST
+#include "pageres.h" // for PAGE_RES_IT, PAGE_RES (ptr only), WERD_RES
+#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
+#include "werd.h" // for WERD
+
+namespace tesseract {
+
+/**********************************************************************
+ * make_pseudo_word
+ *
+ * Make all the blobs inside a selection into a single word.
+ * The returned PAGE_RES_IT* it points to the new word. After use, call
+ * it->DeleteCurrentWord() to delete the fake word, and then
+ * delete it to get rid of the iterator itself.
+ **********************************************************************/
+
+PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
+  PAGE_RES_IT pr_it(page_res);
+  C_BLOB_LIST new_blobs;               // list of gathered blobs
+  C_BLOB_IT new_blob_it = &new_blobs;  // iterator
+
+  // Walk every word on the page, collecting deep copies of all blobs that
+  // overlap the selection box.
+  for (WERD_RES* word_res = pr_it.word(); word_res != nullptr;
+       word_res = pr_it.forward()) {
+    WERD* word = word_res->word;
+    if (word->bounding_box().overlap(selection_box)) {
+      C_BLOB_IT blob_it(word->cblob_list());
+      for (blob_it.mark_cycle_pt();
+           !blob_it.cycled_list(); blob_it.forward()) {
+        C_BLOB* blob = blob_it.data();
+        if (blob->bounding_box().overlap(selection_box)) {
+          new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
+        }
+      }
+      // As soon as any blobs have been gathered, insert the fake word next
+      // to the current word and return an iterator positioned at it.
+      // NOTE(review): blobs from words later on the page are thus never
+      // collected once the first overlapping word yields blobs.
+      if (!new_blobs.empty()) {
+        WERD* pseudo_word = new WERD(&new_blobs, 1, nullptr);
+        word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
+        // Build a fresh iterator and advance it to the inserted word so the
+        // caller receives an iterator it fully owns.
+        auto* it = new PAGE_RES_IT(page_res);
+        while (it->word() != word_res && it->word() != nullptr) it->forward();
+        ASSERT_HOST(it->word() == word_res);
+        return it;
+      }
+    }
+  }
+  // Nothing overlapped the selection.
+  return nullptr;
+}
+
+} // namespace tesseract
diff --git a/tesseract/src/ccmain/werdit.h b/tesseract/src/ccmain/werdit.h
new file mode 100644
index 00000000..b49bda29
--- /dev/null
+++ b/tesseract/src/ccmain/werdit.h
@@ -0,0 +1,34 @@
+/**********************************************************************
+ * File: wordit.h
+ * Description: An iterator for passing over all the words in a document.
+ * Author: Ray Smith
+ * Created: Mon Apr 27 08:51:22 BST 1992
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef WERDIT_H
+#define WERDIT_H
+
+#include "rect.h" // for TBOX
+
+namespace tesseract {
+
+class PAGE_RES;
+class PAGE_RES_IT;
+
+// Gathers all blobs overlapping selection_box into a single fake word and
+// returns a heap-allocated iterator positioned at it, or nullptr if nothing
+// overlaps. The caller must DeleteCurrentWord() on the iterator to remove
+// the fake word, then delete the iterator itself.
+PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box);
+
+} // namespace tesseract
+
+#endif