summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'extract/src/join.c')
-rw-r--r-- extract/src/join.c 215
1 files changed, 111 insertions, 104 deletions
diff --git a/extract/src/join.c b/extract/src/join.c
index 4425de3d..110d3901 100644
--- a/extract/src/join.c
+++ b/extract/src/join.c
@@ -356,7 +356,7 @@ On entry:
On exit:
If we succeed, we return 0, with *o_lines pointing to array of *o_lines_num
line_t*'s, each pointing to a line_t.
-
+
If <rects_num> is zero, each of these line_t's will contain pointers to
items in <spans>; otherwise each of the line_t's will contain new spans
which should be freed by the caller (spans are not necessarily wholly inside
@@ -385,7 +385,7 @@ static int make_lines(
int num_compatible;
int num_joins;
span_t* span = NULL;
-
+
if (rects_num)
{
/* Make <lines> contain new span_t's and char_t's that are inside rects[]. */
@@ -412,7 +412,7 @@ static int make_lines(
{
extract_span_free(alloc, &span);
}
-
+
if (!spans[a]->chars_num)
{
/* All characters in this span are inside table, so remove
@@ -446,7 +446,7 @@ static int make_lines(
outfx("initial line a=%i: %s", a, line_string(lines[a]));
}
}
-
+
num_compatible = 0;
/* For each line, look for nearest aligned line, and append if found. */
@@ -459,7 +459,7 @@ static int make_lines(
line_t* nearest_line = NULL;
span_t* span_a;
double angle_a;
-
+
line_t* line_a = lines[a];
if (!line_a) {
continue;
@@ -580,7 +580,7 @@ static int make_lines(
{
continue;
}
-
+
if (1
&& extract_span_char_last(span_a)->ucs != ' '
&& span_char_first(span_b)->ucs != ' '
@@ -903,7 +903,7 @@ On exit:
are undefined.
*/
static int make_paragraphs(
- extract_alloc_t* alloc,
+ extract_alloc_t* alloc,
line_t** lines,
int lines_num,
paragraph_t*** o_paragraphs,
@@ -941,7 +941,7 @@ static int make_paragraphs(
double angle_a;
int verbose;
int b;
-
+
paragraph_t* paragraph_a = paragraphs[a];
if (!paragraph_a) {
/* This paragraph is empty - already been appended to a different
@@ -1183,9 +1183,9 @@ static int make_paragraphs(
return ret;
}
-static int s_join_page_rects(
+static int s_join_subpage_rects(
extract_alloc_t* alloc,
- extract_page_t* page,
+ subpage_t* subpage,
rect_t* rects,
int rects_num,
line_t*** lines,
@@ -1198,8 +1198,8 @@ rects_num is zero. */
{
if (make_lines(
alloc,
- page->spans,
- &page->spans_num,
+ subpage->spans,
+ &subpage->spans_num,
rects,
rects_num,
lines,
@@ -1212,7 +1212,7 @@ rects_num is zero. */
paragraphs,
paragraphs_num
)) return -1;
-
+
return 0;
}
@@ -1304,17 +1304,17 @@ void extract_cell_init(cell_t* cell)
static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y)
-{
+{
/* Find cell extensions to right and down by looking at cells' .left and
.above flags.
-
+
For example for adjacent cells ABC..., we extend A to include cells BC..
until we reach a cell with .left set to one.
-
+
ABCDE
FGHIJ
KLMNO
-
+
When looking to extend cell A, we only look at cells in the same column or
same row, (i.e. in the above example we look at BCDE and FK, and not at
GHIJ and LMNO).
@@ -1349,7 +1349,7 @@ static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y)
}
cell->extend_down = yy - y;
cell->rect.max.y = cells[(yy-1) * cells_num_x + x]->rect.max.y;
-
+
/* Clear .above and .left in enclosed cells. */
for (xx = x; xx < x + cell->extend_right; ++xx)
{
@@ -1384,7 +1384,7 @@ static int table_find_extend(cell_t** cells, int cells_num_x, int cells_num_y)
}
-static int table_find_cells_text(extract_alloc_t* alloc, extract_page_t* page,
+static int table_find_cells_text(extract_alloc_t* alloc, subpage_t* subpage,
cell_t** cells, int cells_num_x, int cells_num_y)
/* Sets each cell to contain the text that is within the cell's boundary. We
remove any found text from the page. */
@@ -1398,9 +1398,9 @@ remove any found text from the page. */
{
cell_t* cell = cells[i];
if (!cell->above || !cell->left) continue;
- if (s_join_page_rects(
+ if (s_join_subpage_rects(
alloc,
- page,
+ subpage,
&cell->rect,
1 /*rects_num*/,
&cell->lines,
@@ -1409,17 +1409,17 @@ remove any found text from the page. */
&cell->paragraphs_num
)) return -1;
}
-
+
/* Append the table we have found to page->tables[]. */
- if (extract_realloc(alloc, &page->tables, sizeof(*page->tables) * (page->tables_num + 1))) goto end;
- if (extract_malloc(alloc, &page->tables[page->tables_num], sizeof(*page->tables[page->tables_num]))) goto end;
- page->tables[page->tables_num]->pos.x = cells[0]->rect.min.x;
- page->tables[page->tables_num]->pos.y = cells[0]->rect.min.y;
- page->tables[page->tables_num]->cells = cells;
- page->tables[page->tables_num]->cells_num_x = cells_num_x;
- page->tables[page->tables_num]->cells_num_y = cells_num_y;
- page->tables_num += 1;
-
+ if (extract_realloc(alloc, &subpage->tables, sizeof(*subpage->tables) * (subpage->tables_num + 1))) goto end;
+ if (extract_malloc(alloc, &subpage->tables[subpage->tables_num], sizeof(*subpage->tables[subpage->tables_num]))) goto end;
+ subpage->tables[subpage->tables_num]->pos.x = cells[0]->rect.min.x;
+ subpage->tables[subpage->tables_num]->pos.y = cells[0]->rect.min.y;
+ subpage->tables[subpage->tables_num]->cells = cells;
+ subpage->tables[subpage->tables_num]->cells_num_x = cells_num_x;
+ subpage->tables[subpage->tables_num]->cells_num_y = cells_num_y;
+ subpage->tables_num += 1;
+
if (0)
{
/* For debugging. */
@@ -1442,24 +1442,24 @@ remove any found text from the page. */
}
fprintf(stderr, "\n");
}
-
+
}
-
+
e = 0;
end:
return e;
}
-static int table_find(extract_alloc_t* alloc, extract_page_t* page, double y_min, double y_max)
+static int table_find(extract_alloc_t* alloc, subpage_t* subpage, double y_min, double y_max)
/* Finds single table made from lines whose y coordinates are in the range
y_min..y_max. */
{
- tablelines_t* all_h = &page->tablelines_horizontal;
- tablelines_t* all_v = &page->tablelines_vertical;
+ tablelines_t* all_h = &subpage->tablelines_horizontal;
+ tablelines_t* all_v = &subpage->tablelines_vertical;
int e = -1;
int i;
-
+
/* Find subset of vertical and horizontal lines that are within range
y_min..y_max, and sort by y coordinate. */
tablelines_t tl_h = {NULL, 0};
@@ -1472,14 +1472,14 @@ y_min..y_max. */
int y;
outf("y=(%f %f)", y_min, y_max);
-
+
if (table_find_y_range(alloc, all_h, y_min, y_max, &tl_h)) goto end;
if (table_find_y_range(alloc, all_v, y_min, y_max, &tl_v)) goto end;
/* Suppress false coverity warning - qsort() does not dereference null
pointer if nmemb is zero. */
/* coverity[var_deref_model] */
qsort(tl_v.tablelines, tl_v.tablelines_num, sizeof(*tl_v.tablelines), tablelines_compare_x);
-
+
if (0)
{
/* Show raw lines info. */
@@ -1519,28 +1519,28 @@ y_min..y_max. */
break;
}
cells_num_y += 1;
-
+
for (j=0; j<tl_v.tablelines_num; )
{
int j_next;
int ii;
int jj;
cell_t* cell;
-
+
for (j_next = j+1; j_next<tl_v.tablelines_num; ++j_next)
{
if (tl_v.tablelines[j_next].rect.min.x - tl_v.tablelines[j].rect.min.x > 0.5) break;
}
outf("i=%i j=%i tl_v.tablelines[j].rect=%s", i, j, extract_rect_string(&tl_v.tablelines[j].rect));
-
+
if (j_next == tl_v.tablelines_num) break;
-
+
if (extract_realloc(alloc, &cells, sizeof(*cells) * (cells_num+1))) goto end;
if (extract_malloc(alloc, &cells[cells_num], sizeof(*cells[cells_num]))) goto end;
cell = cells[cells_num];
cells_num += 1;
if (i==0) cells_num_x += 1;
-
+
cell->rect.min.x = tl_v.tablelines[j].rect.min.x;
cell->rect.min.y = tl_h.tablelines[i].rect.min.y;
cell->rect.max.x = (j_next < tl_v.tablelines_num) ? tl_v.tablelines[j_next].rect.min.x : cell->rect.min.x;
@@ -1553,7 +1553,7 @@ y_min..y_max. */
cell->lines_num = 0;
cell->paragraphs = NULL;
cell->paragraphs_num = 0;
-
+
/* Set cell->above if there is a horizontal line above the cell. */
outf("Looking to set above for i=%i j=%i rect=%s", i, j, extract_rect_string(&cell->rect));
for (ii = i; ii < i_next; ++ii)
@@ -1570,7 +1570,7 @@ y_min..y_max. */
break;
}
}
-
+
/* Set cell->left if there is a vertical line to the left of the cell. */
for (jj = j; jj < j_next; ++jj)
{
@@ -1586,15 +1586,15 @@ y_min..y_max. */
break;
}
}
-
+
j = j_next;
}
-
+
i = i_next;
}
-
+
assert(cells_num == cells_num_x * cells_num_y);
-
+
/* Remove cols and rows where no cells have .above and .left - these
will not appear. It also avoids spurious empty columns when table uses
closely-spaced double lines as separators. */
@@ -1629,7 +1629,7 @@ y_min..y_max. */
cells_num_x -= 1;
}
}
-
+
if (cells_num == 0)
{
e = 0;
@@ -1637,9 +1637,9 @@ y_min..y_max. */
}
if (table_find_extend(cells, cells_num_x, cells_num_y)) goto end;
-
- if (table_find_cells_text(alloc, page, cells, cells_num_x, cells_num_y)) goto end;
-
+
+ if (table_find_cells_text(alloc, subpage, cells, cells_num_x, cells_num_y)) goto end;
+
e = 0;
end:
extract_free(alloc, &tl_h.tablelines);
@@ -1656,9 +1656,9 @@ y_min..y_max. */
}
-static int extract_page_tables_find_lines(
+static int extract_subpage_tables_find_lines(
extract_alloc_t* alloc,
- extract_page_t* page
+ subpage_t* subpage
)
/* Finds tables in <page> by looking for lines in page->tablelines_horizontal
and page->tablelines_vertical that look like table dividers.
@@ -1671,45 +1671,45 @@ Any text found inside tables is removed from page->spans[].
double margin = 1;
int iv;
int ih;
- outf("page->tablelines_horizontal.tablelines_num=%i", page->tablelines_horizontal.tablelines_num);
- outf("page->tablelines_vertical.tablelines_num=%i", page->tablelines_vertical.tablelines_num);
-
+ outf("page->tablelines_horizontal.tablelines_num=%i", subpage->tablelines_horizontal.tablelines_num);
+ outf("page->tablelines_vertical.tablelines_num=%i", subpage->tablelines_vertical.tablelines_num);
+
/* Sort all lines by y coordinate. */
qsort(
- page->tablelines_horizontal.tablelines,
- page->tablelines_horizontal.tablelines_num,
- sizeof(*page->tablelines_horizontal.tablelines),
+ subpage->tablelines_horizontal.tablelines,
+ subpage->tablelines_horizontal.tablelines_num,
+ sizeof(*subpage->tablelines_horizontal.tablelines),
tablelines_compare_y
);
qsort(
- page->tablelines_vertical.tablelines,
- page->tablelines_vertical.tablelines_num,
- sizeof(*page->tablelines_vertical.tablelines),
+ subpage->tablelines_vertical.tablelines,
+ subpage->tablelines_vertical.tablelines_num,
+ sizeof(*subpage->tablelines_vertical.tablelines),
tablelines_compare_y
);
-
+
if (0)
{
/* Show info about lines. */
int i;
outf0("tablelines_horizontal:");
- for (i=0; i<page->tablelines_horizontal.tablelines_num; ++i)
+ for (i=0; i<subpage->tablelines_horizontal.tablelines_num; ++i)
{
outf0(" color=%f: %s",
- page->tablelines_horizontal.tablelines[i].color,
- extract_rect_string(&page->tablelines_horizontal.tablelines[i].rect)
+ subpage->tablelines_horizontal.tablelines[i].color,
+ extract_rect_string(&subpage->tablelines_horizontal.tablelines[i].rect)
);
}
outf0("tablelines_vertical:");
- for (i=0; i<page->tablelines_vertical.tablelines_num; ++i)
+ for (i=0; i<subpage->tablelines_vertical.tablelines_num; ++i)
{
outf0(" color=%f: %s",
- page->tablelines_vertical.tablelines[i].color,
- extract_rect_string(&page->tablelines_vertical.tablelines[i].rect)
+ subpage->tablelines_vertical.tablelines[i].color,
+ extract_rect_string(&subpage->tablelines_vertical.tablelines[i].rect)
);
}
}
-
+
/* Look for completely separate vertical regions that define different
tables, by looking for vertical gaps between the rects of each
horizontal/vertical line. */
@@ -1722,22 +1722,22 @@ Any text found inside tables is removed from page->spans[].
tableline_t* tlv = NULL;
tableline_t* tlh = NULL;
tableline_t* tl;
- if (iv < page->tablelines_vertical.tablelines_num)
+ if (iv < subpage->tablelines_vertical.tablelines_num)
{
- tlv = &page->tablelines_vertical.tablelines[iv];
+ tlv = &subpage->tablelines_vertical.tablelines[iv];
}
/* We only consider horizontal lines that are not white. This is a bit
of a cheat to get the right behaviour with twotables_2.pdf. */
- while (ih < page->tablelines_horizontal.tablelines_num)
+ while (ih < subpage->tablelines_horizontal.tablelines_num)
{
- if (page->tablelines_horizontal.tablelines[ih].color == 1)
+ if (subpage->tablelines_horizontal.tablelines[ih].color == 1)
{
/* Ignore white horizontal lines. */
++ih;
}
else
{
- tlh = &page->tablelines_horizontal.tablelines[ih];
+ tlh = &subpage->tablelines_horizontal.tablelines[ih];
break;
}
}
@@ -1756,16 +1756,16 @@ Any text found inside tables is removed from page->spans[].
{
outf("New table. maxy=%f miny=%f", maxy, miny);
/* Find table. */
- table_find(alloc, page, miny - margin, maxy + margin);
+ table_find(alloc, subpage, miny - margin, maxy + margin);
}
miny = tl->rect.min.y;
}
if (tl->rect.max.y > maxy) maxy = tl->rect.max.y;
}
-
+
/* Find last table. */
- table_find(alloc, page, miny - margin, maxy + margin);
-
+ table_find(alloc, subpage, miny - margin, maxy + margin);
+
return 0;
}
@@ -1793,9 +1793,9 @@ static void show_tables(table_t** tables, int tables_num)
}
}
-static int extract_page_tables_find(
+static int extract_subpage_tables_find(
extract_alloc_t* alloc,
- extract_page_t* page
+ subpage_t* subpage
)
/* Find tables in <page>.
@@ -1804,53 +1804,53 @@ will call other functions that find tables in different ways, e.g. by analysing
an image of a page, or looking for blocks of whitespace in between chunks of
text. */
{
- if (extract_page_tables_find_lines(alloc, page)) return -1;
+ if (extract_subpage_tables_find_lines(alloc, subpage)) return -1;
if (0)
{
outf0("=== tables from extract_page_tables_find_lines():");
- show_tables(page->tables, page->tables_num);
+ show_tables(subpage->tables, subpage->tables_num);
}
return 0;
}
-static int extract_document_join_page(
+static int extract_join_subpage(
extract_alloc_t* alloc,
- extract_page_t* page
+ subpage_t* subpage
)
/* Finds tables and paragraphs on <page>. */
{
/* Find tables on this page first. This will remove text that is within
tables from page->spans, so that text doesn't appear more than once in
the final output. */
- if (extract_page_tables_find(alloc, page)) return -1;
+ if (extract_subpage_tables_find(alloc, subpage)) return -1;
/* Now join remaining spans into lines and paragraphs. */
- if (s_join_page_rects(
+ if (s_join_subpage_rects(
alloc,
- page,
+ subpage,
NULL /*rects*/,
0 /*rects_num*/,
- &page->lines,
- &page->lines_num,
- &page->paragraphs,
- &page->paragraphs_num
+ &subpage->lines,
+ &subpage->lines_num,
+ &subpage->paragraphs,
+ &subpage->paragraphs_num
))
{
- outf0("s_join_page_rects failed. page->spans_num=%i page->lines_num=%i page->paragraphs_num=%i",
- page->spans_num,
- page->lines_num,
- page->paragraphs_num
+ outf0("s_join_subpage_rects failed. subpage->spans_num=%i subpage->lines_num=%i subpage->paragraphs_num=%i",
+ subpage->spans_num,
+ subpage->lines_num,
+ subpage->paragraphs_num
);
return -1;
}
-
+
return 0;
}
-int extract_document_join(extract_alloc_t* alloc, document_t* document)
+int extract_document_join(extract_alloc_t* alloc, document_t* document, int layout_analysis)
{
/* For each page in <document> we find tables and join spans into lines and paragraphs.
@@ -1861,9 +1861,16 @@ int extract_document_join(extract_alloc_t* alloc, document_t* document)
int p;
for (p=0; p<document->pages_num; ++p) {
extract_page_t* page = document->pages[p];
-
- outf("processing page %i: num_spans=%i", p, page->spans_num);
- if (extract_document_join_page(alloc, page)) return -1;
+ int c;
+
+ if (layout_analysis && extract_page_analyse(alloc, page)) return -1;
+
+ for (c=0; c<page->subpages_num; ++c) {
+ subpage_t* subpage = page->subpages[c];
+
+ outf("processing page %i, subpage %i: num_spans=%i", p, c, subpage->spans_num);
+ if (extract_join_subpage(alloc, subpage)) return -1;
+ }
}
return 0;