\n")) goto end;
for (x=0; xcells_num_x; ++x)
cell_t* cell = table->cells[y*table->cells_num_x + x];
if (!cell->above || !cell->left)
/* HTML does not require anything for cells that are subsumed
by other cells that extend horizontally and vertically. */
if (extract_astring_cat(alloc, content, " ")) goto end;
if (extract_astring_cat(alloc, content, "extend_right > 1)
if (extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)) goto end;
if (cell->extend_down > 1)
if (extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)) goto end;
if (extract_astring_cat(alloc, content, ">")) goto end;
if (paragraphs_to_html_content(alloc, state, cell->paragraphs, cell->paragraphs_num, 1 /* single_line*/, content)) goto end;
if (extract_astring_cat(alloc, content, " | ")) goto end;
if (extract_astring_cat(alloc, content, "\n")) goto end;
if (content_state_reset(alloc, state, content)) goto end;
if (extract_astring_cat(alloc, content, "
\n")) goto end;
if (extract_astring_cat(alloc, content, "\n\n")) goto end;
e = 0;
return e;
static char_t* paragraph_first_char(const paragraph_t* paragraph)
line_t* line = paragraph->lines[paragraph->lines_num - 1];
span_t* span = line->spans[line->spans_num - 1];
return &span->chars[0];
/* qsort() comparison callback for an array of paragraph_t*. Orders elements
by ascending y coordinate of the char returned by paragraph_first_char(). */
static int compare_paragraph_y(const void* a, const void* b)
{
    const paragraph_t* const* lhs = a;
    const paragraph_t* const* rhs = b;
    const double lhs_y = paragraph_first_char(*lhs)->y;
    const double rhs_y = paragraph_first_char(*rhs)->y;

    /* Evaluates to +1, -1 or 0. Comparing rather than subtracting avoids
    truncating a double difference to int. */
    return (lhs_y > rhs_y) - (lhs_y < rhs_y);
}
/* Appends HTML for <split> to <output>, recursing into any child splits.

alloc:     allocator handle passed through to every helper.
split:     node to render. NULL is handled as a leaf, as is any type other
           than SPLIT_HORIZONTAL / SPLIT_VERTICAL.
ppsubpage: points at a cursor into an array of subpage_t*. Each leaf reads the
           entry under the cursor and then advances the cursor by one, so
           leaves consume subpages in the order they are visited.
output:    string that receives the generated markup.

Returns 0 when a leaf completes, -1 on the leaf failure path, or the non-zero
result of a recursive call.

NOTE(review): this excerpt appears to have lost brace-only lines and some
short statements (for example the target label of 'goto end' and the loop
header around the paragraph/table merge), and several string literals are
truncated. The comments here describe only what is visible. */
static int
split_to_html(extract_alloc_t *alloc, split_t* split, subpage_t*** ppsubpage, extract_astring_t *output)
/* p indexes the sorted paragraphs, t indexes subpage->tables, s indexes child splits. */
int p;
int s;
int t;
subpage_t* subpage;
paragraph_t** paragraphs = NULL;
/* NOTE(review): <state> is handed to the helpers below by address, but no
initialisation is visible in this excerpt - confirm. */
content_state_t state;
if (split == NULL) {
/* fall through to below - SPLIT_NONE */
} else if (split->type == SPLIT_HORIZONTAL) {
int ret = 0;
/* Sum of the children's weights; used below to compute each child's share. */
double total = 0;
for (s = 0; s < split->count; s++) {
total += split->split[s]->weight;
/* Wrapper markup is only written when there is more than one child.
NOTE(review): the results of these extract_astring_cat()/catf() calls are not
checked in this branch. */
if (split->count > 1)
extract_astring_cat(alloc, output, "\n");
/* Render each child. A zero total gets its own call, presumably so that the
100.0*weight/total share is never computed with a zero divisor - confirm. */
for (s = 0; s < split->count; s++) {
if (split->count > 1)
if (total == 0)
extract_astring_catf(alloc, output, "
extract_astring_catf(alloc, output, "
\n", 100.0*split->split[s]->weight/total);
ret = split_to_html(alloc, split->split[s], ppsubpage, output);
if (ret)
if (split->count > 1)
extract_astring_cat(alloc, output, "
if (split->count > 1)
extract_astring_cat(alloc, output, "
return ret;
} else if (split->type == SPLIT_VERTICAL) {
int ret = 0;
/* Children are rendered one after another with no wrapper markup. */
for (s = 0; s < split->count; s++) {
ret = split_to_html(alloc, split->split[s], ppsubpage, output);
if (ret)
return ret;
/* We'll deal with the next subpage entry. Increment the pointer for the
* next caller. */
subpage = **ppsubpage;
*ppsubpage = (*ppsubpage)+1;
/* Output paragraphs and tables in order of increasing y coordinate.
Unfortunately the paragraph ordering we do in page->paragraphs[]
isn't quite right and results in bad ordering if ctm/trm matrices are
inconsistent. So we create our own list of paragraphs sorted strictly
by y coordinate of the first char of each paragraph. */
if (extract_malloc(alloc, ¶graphs, sizeof(*paragraphs) * subpage->paragraphs_num)) goto end;
/* Copy the pointers only; the paragraphs themselves still belong to <subpage>. */
for (p = 0; p < subpage->paragraphs_num; ++p)
paragraphs[p] = subpage->paragraphs[p];
qsort(paragraphs, subpage->paragraphs_num, sizeof(*paragraphs), compare_paragraph_y);
/* Disabled debug dump.
NOTE(review): it reads subpage->paragraphs[], not the sorted copy. */
if (0)
int p;
outf0("paragraphs are:");
for (p=0; pparagraphs_num; ++p)
paragraph_t* paragraph = subpage->paragraphs[p];
line_t* line = paragraph->lines[0];
span_t* span = line->spans[0];
outf0(" p=%i: %s", p, extract_span_string(NULL, span));
/* Merge the sorted paragraphs with subpage->tables[], emitting whichever of
the two current candidates has the smaller y. */
p = 0;
t = 0;
double y_paragraph;
double y_table;
/* A candidate is NULL once its list has been exhausted. */
paragraph_t* paragraph = (p == subpage->paragraphs_num) ? NULL : paragraphs[p];
table_t* table = (t == subpage->tables_num) ? NULL : subpage->tables[t];
if (!paragraph && !table) break;
/* A missing candidate gets DBL_MAX so that it never compares as smaller. */
y_paragraph = (paragraph) ? paragraph->lines[0]->spans[0]->chars[0].y : DBL_MAX;
y_table = (table) ? table->pos.y : DBL_MAX;
outf("p=%i y_paragraph=%f", p, y_paragraph);
outf("t=%i y_table=%f", t, y_table);
/* The comparison is strict, so on equal y the table is emitted first. */
if (paragraph && y_paragraph < y_table)
//extract_astring_catf(alloc, output, "@@@ paragraph %i y=%f @@@)
\n", p, y_paragraph);
if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, output)) goto end;
if (content_state_reset(alloc, &state, output)) goto end;
p += 1;
else if (table)
//extract_astring_catf(alloc, output, "@@@ table %t y=%f @@@)
\n", p, y_table);
if (append_table(alloc, &state, table, output)) goto end;
t += 1;
/* Success path: release the temporary pointer array. */
extract_free(alloc, ¶graphs);
return 0;
/* Failure path, presumably the target of the 'goto end' statements above; the
label itself is not visible in this excerpt. */
extract_free(alloc, ¶graphs);
return -1;
/* Appends HTML content for every page of <document> to <content>.

alloc:    allocator handle passed through to every helper.
document: source of the pages; each page's split and subpages are rendered
          via split_to_html().
rotation: unused (cast to void below).
images:   unused (cast to void below).
content:  string that receives the generated markup.

Returns 0 on success, or the non-zero value returned by split_to_html() for
the first page that fails.

NOTE(review): this excerpt appears to have lost brace-only lines and the
target label of 'goto end', and the string literals look truncated (several
now contain only a newline). The comments here describe only what is visible. */
int extract_document_to_html_content(
extract_alloc_t* alloc,
document_t* document,
int rotation,
int images,
extract_astring_t* content
int ret = -1;
int n;
/* NOTE(review): <paragraphs> is never assigned in the visible code; it is
only passed to extract_free() at the end. */
paragraph_t** paragraphs = NULL;
(void) rotation;
(void) images;
/* NOTE(review): the results of the extract_astring_cat() calls in this
function are not checked. */
extract_astring_cat(alloc, content, "\n");
extract_astring_cat(alloc, content, "\n");
/* Write paragraphs into content. */
for (n=0; npages_num; ++n)
extract_page_t* page = document->pages[n];
/* Cursor over this page's subpages; split_to_html() advances it once per
leaf that it renders. */
subpage_t **psubpage = page->subpages;
/* Every page gets its own div. */
extract_astring_cat(alloc, content, "\n");
ret = split_to_html(alloc, page->split, &psubpage, content);
if (ret)
goto end;
extract_astring_cat(alloc, content, "
extract_astring_cat(alloc, content, "\n");
extract_astring_cat(alloc, content, "\n");
ret = 0;
extract_free(alloc, ¶graphs);
return ret;