/* These functions generate HTML content for an extracted document: each
paragraph becomes a <p> element, each table a <table> element, and each page
is wrapped in its own <div>.

NOTE(review): this file was restored from a copy in which every '<...>' run
had been stripped (system include names, HTML tags inside string literals and
some '<' comparisons). The restored tokens, and the exact whitespace inside
string literals, should be confirmed against a pristine copy. */

#include "../include/extract.h"

#include "astring.h"
#include "document.h"
#include "html.h"
#include "mem.h"
#include "memento.h"
#include "outf.h"
#include "sys.h"
#include "text.h"
#include "zip.h"

#include <assert.h>
#include <errno.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/stat.h>


/* Puts *content_state into the "no font attributes active" state. */
static void content_state_init(content_state_t* content_state)
{
    content_state->font.name = NULL;
    content_state->font.size = 0;
    content_state->font.bold = 0;
    content_state->font.italic = 0;
    content_state->ctm_prev = NULL;
}

/* Appends closing tags for whichever of bold/italic is currently active in
*content_state and clears the corresponding flags. Returns 0 on success, -1 if
appending to <content> failed. */
static int content_state_reset(extract_alloc_t* alloc, content_state_t* content_state, extract_astring_t* content)
{
    int e = -1;

    if (content_state->font.bold)
    {
        if (extract_astring_cat(alloc, content, "</b>")) goto end;
        content_state->font.bold = 0;
    }
    if (content_state->font.italic)
    {
        if (extract_astring_cat(alloc, content, "</i>")) goto end;
        content_state->font.italic = 0;
    }
    e = 0;

    end:
    return e;
}

/* Appends one paragraph to <content> as a <p> element.

Bold/italic tags are emitted whenever a span's flags differ from
*content_state, which is updated to match. Between consecutive lines a trailing
'-' is removed (joining a hyphenated word); otherwise a single space is
inserted unless the text already ends with one.

If <single_line> is non-zero no newlines are emitted around the element.

Returns 0 on success, -1 if any append failed. */
static int paragraph_to_html_content(
        extract_alloc_t* alloc,
        content_state_t* content_state,
        paragraph_t* paragraph,
        int single_line,
        extract_astring_t* content
        )
{
    int e = -1;
    const char* endl = (single_line) ? "" : "\n";
    int l;

    if (extract_astring_catf(alloc, content, "%s%s<p>", endl, endl)) goto end;

    for (l=0; l<paragraph->lines_num; ++l)
    {
        line_t* line = paragraph->lines[l];
        int s;

        for (s=0; s<line->spans_num; ++s)
        {
            span_t* span = line->spans[s];
            int c;

            content_state->ctm_prev = &span->ctm;

            if (span->flags.font_bold != content_state->font.bold)
            {
                if (extract_astring_cat(alloc, content,
                        span->flags.font_bold ? "<b>" : "</b>"
                        )) goto end;
                content_state->font.bold = span->flags.font_bold;
            }
            if (span->flags.font_italic != content_state->font.italic)
            {
                if (extract_astring_cat(alloc, content,
                        span->flags.font_italic ? "<i>" : "</i>"
                        )) goto end;
                content_state->font.italic = span->flags.font_italic;
            }

            for (c=0; c<span->chars_num; ++c)
            {
                char_t* char_ = &span->chars[c];
                if (extract_astring_catc_unicode_xml(alloc, content, char_->ucs)) goto end;
            }
        }

        /* Join this line to the next one, unless it is the last line. */
        if (content->chars_num && l+1 < paragraph->lines_num)
        {
            char last = content->chars[content->chars_num-1];
            if (last == '-')
            {
                /* Drop the hyphen so the two halves of the word are joined. */
                content->chars_num -= 1;
            }
            else if (last != ' ')
            {
                /* The result of this append used to be ignored. */
                if (extract_astring_catc(alloc, content, ' ')) goto end;
            }
        }
    }

    if (extract_astring_catf(alloc, content, "%s</p>", endl)) goto end;
    e = 0;

    end:
    return e;
}

/* Appends html for paragraphs[0..paragraphs_num) to <content>, then closes any
bold/italic tags left open. Updates *state if we change font etc. Returns 0 on
success, -1 on failure. */
static int paragraphs_to_html_content(
        extract_alloc_t* alloc,
        content_state_t* state,
        paragraph_t** paragraphs,
        int paragraphs_num,
        int single_line,
        extract_astring_t* content
        )
{
    int e = -1;
    int p;

    for (p=0; p<paragraphs_num; ++p)
    {
        if (paragraph_to_html_content(alloc, state, paragraphs[p], single_line, content)) goto end;
    }
    if (content_state_reset(alloc, state, content)) goto end;
    e = 0;

    end:
    return e;
}

/* Appends <table> to <content> as an HTML table with one <tr> per row of
cells and one <td> per emitted cell; colspan/rowspan attributes are written for
cells whose extend_right/extend_down is greater than 1. Returns 0 on success,
-1 on failure. */
static int append_table(extract_alloc_t* alloc, content_state_t* state, table_t* table, extract_astring_t* content)
{
    int e = -1;
    int y;

    if (extract_astring_cat(alloc, content,
            "\n\n<table border=\"1\" style=\"border-collapse:collapse\">\n"
            )) goto end;

    for (y=0; y<table->cells_num_y; ++y)
    {
        int x;

        if (extract_astring_cat(alloc, content, "    <tr>\n")) goto end;

        for (x=0; x<table->cells_num_x; ++x)
        {
            cell_t* cell = table->cells[y*table->cells_num_x + x];

            if (!cell->above || !cell->left)
            {
                /* HTML does not require anything for cells that are subsumed
                by other cells that extend horizontally and vertically. */
                continue;
            }

            if (extract_astring_cat(alloc, content, "        ")) goto end;
            if (extract_astring_cat(alloc, content, "<td")) goto end;
            if (cell->extend_right > 1)
            {
                if (extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)) goto end;
            }
            if (cell->extend_down > 1)
            {
                if (extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)) goto end;
            }
            if (extract_astring_cat(alloc, content, ">")) goto end;

            if (paragraphs_to_html_content(
                    alloc,
                    state,
                    cell->paragraphs,
                    cell->paragraphs_num,
                    1 /*single_line*/,
                    content
                    )) goto end;

            if (extract_astring_cat(alloc, content, "</td>")) goto end;
            if (extract_astring_cat(alloc, content, "\n")) goto end;

            /* Does nothing when paragraphs_to_html_content() has already
            closed bold/italic; kept so font state never leaks across cells. */
            if (content_state_reset(alloc, state, content)) goto end;
        }

        if (extract_astring_cat(alloc, content, "    </tr>\n")) goto end;
    }

    if (extract_astring_cat(alloc, content, "</table>\n\n")) goto end;
    e = 0;

    end:
    return e;
}

/* Returns the first char of the first span of the first line of <paragraph>.

This previously indexed the last line and the last span, which disagreed with
the function's name and with the key used when interleaving paragraphs with
tables. Assumes the paragraph has at least one line, span and char. */
static char_t* paragraph_first_char(const paragraph_t* paragraph)
{
    line_t* line = paragraph->lines[0];
    span_t* span = line->spans[0];
    return &span->chars[0];
}

/* qsort() callback: orders paragraph_t* entries by increasing y coordinate of
their first char. */
static int compare_paragraph_y(const void* a, const void* b)
{
    const paragraph_t* const* a_paragraph = a;
    const paragraph_t* const* b_paragraph = b;
    double a_y = paragraph_first_char(*a_paragraph)->y;
    double b_y = paragraph_first_char(*b_paragraph)->y;

    if (a_y > b_y) return +1;
    if (a_y < b_y) return -1;
    return 0;
}

/* Appends the paragraphs and tables of one subpage to <output>, interleaved in
order of increasing y coordinate. Returns 0 on success, -1 on failure. */
static int subpage_to_html(extract_alloc_t* alloc, subpage_t* subpage, extract_astring_t* output)
{
    int e = -1;
    int p;
    int t;
    paragraph_t** paragraphs = NULL;
    content_state_t state;

    content_state_init(&state);

    /* Output paragraphs and tables in order of increasing <y> coordinate.

    Unfortunately the paragraph ordering we do in page->paragraphs[] isn't
    quite right and results in bad ordering if ctm/trm matrices are
    inconsistent. So we create our own list of paragraphs sorted strictly by y
    coordinate of the first char of each paragraph. */
    if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * subpage->paragraphs_num)) goto end;
    for (p=0; p<subpage->paragraphs_num; ++p)
    {
        paragraphs[p] = subpage->paragraphs[p];
    }
    if (subpage->paragraphs_num > 1)
    {
        /* Nothing to sort otherwise; also avoids handing qsort() an empty
        array. */
        qsort(paragraphs, subpage->paragraphs_num, sizeof(*paragraphs), compare_paragraph_y);
    }

    p = 0;
    t = 0;
    for (;;)
    {
        double y_paragraph;
        double y_table;
        paragraph_t* paragraph = (p == subpage->paragraphs_num) ? NULL : paragraphs[p];
        table_t* table = (t == subpage->tables_num) ? NULL : subpage->tables[t];

        if (!paragraph && !table) break;

        y_paragraph = (paragraph) ? paragraph_first_char(paragraph)->y : DBL_MAX;
        y_table = (table) ? table->pos.y : DBL_MAX;
        outf("p=%i y_paragraph=%f", p, y_paragraph);
        outf("t=%i y_table=%f", t, y_table);

        /* The explicit !table test guarantees progress when only paragraphs
        remain, even if y_paragraph does not compare below DBL_MAX. */
        if (paragraph && (!table || y_paragraph < y_table))
        {
            if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, output)) goto end;
            if (content_state_reset(alloc, &state, output)) goto end;
            p += 1;
        }
        else
        {
            /* <table> is non-NULL here. */
            if (append_table(alloc, &state, table, output)) goto end;
            t += 1;
        }
    }
    e = 0;

    end:
    extract_free(alloc, &paragraphs);
    return e;
}

/* Appends html for <split> and everything below it to <output>.

A horizontal split with more than one child wraps its children in a flex
container, giving each child a width proportional to its weight (no width is
given if the weights sum to zero). A vertical split emits its children one
after another. Anything else - including a NULL <split> - is a leaf: the next
subpage is taken from **ppsubpage and *ppsubpage is advanced for the next
caller.

Returns 0 on success, -1 on failure; append failures are now reported instead
of being ignored. */
static int split_to_html(extract_alloc_t* alloc, split_t* split, subpage_t*** ppsubpage, extract_astring_t* output)
{
    int s;
    subpage_t* subpage;

    if (split && split->type == SPLIT_HORIZONTAL)
    {
        int wrap = (split->count > 1);
        double total = 0;

        for (s=0; s<split->count; ++s)
        {
            total += split->split[s]->weight;
        }

        if (wrap)
        {
            if (extract_astring_cat(alloc, output, "<div style=\"display:flex;\">\n")) return -1;
        }
        for (s=0; s<split->count; ++s)
        {
            if (wrap)
            {
                if (total == 0)
                {
                    if (extract_astring_cat(alloc, output, "<div>\n")) return -1;
                }
                else
                {
                    if (extract_astring_catf(alloc, output, "<div style=\"width:%g%%;\">\n",
                            100.0*split->split[s]->weight/total
                            )) return -1;
                }
            }
            if (split_to_html(alloc, split->split[s], ppsubpage, output)) return -1;
            if (wrap)
            {
                if (extract_astring_cat(alloc, output, "</div>\n")) return -1;
            }
        }
        if (wrap)
        {
            if (extract_astring_cat(alloc, output, "</div>\n")) return -1;
        }
        return 0;
    }

    if (split && split->type == SPLIT_VERTICAL)
    {
        for (s=0; s<split->count; ++s)
        {
            if (split_to_html(alloc, split->split[s], ppsubpage, output)) return -1;
        }
        return 0;
    }

    /* We'll deal with the next subpage entry. Increment the pointer for the
    next caller. */
    subpage = **ppsubpage;
    *ppsubpage = (*ppsubpage) + 1;

    return subpage_to_html(alloc, subpage, output);
}

/* Appends a complete html document for <document> to <content>: an <html> and
<body> wrapper containing one <div> per page, each filled from the page's
split tree and subpages.

<rotation> and <images> are currently ignored.

Returns 0 on success, -1 on failure; <content> may hold partial output after a
failure. */
int extract_document_to_html_content(
        extract_alloc_t* alloc,
        document_t* document,
        int rotation,
        int images,
        extract_astring_t* content
        )
{
    int e = -1;
    int n;

    (void) rotation;
    (void) images;

    if (extract_astring_cat(alloc, content, "<html>\n")) goto end;
    if (extract_astring_cat(alloc, content, "<body>\n")) goto end;

    /* Write paragraphs into <content>. */
    for (n=0; n<document->pages_num; ++n)
    {
        extract_page_t* page = document->pages[n];
        subpage_t** psubpage = page->subpages;

        /* Every page gets its own div. */
        if (extract_astring_cat(alloc, content, "<div>\n")) goto end;
        if (split_to_html(alloc, page->split, &psubpage, content)) goto end;
        if (extract_astring_cat(alloc, content, "</div>\n")) goto end;
    }

    if (extract_astring_cat(alloc, content, "</body>\n")) goto end;
    if (extract_astring_cat(alloc, content, "</html>\n")) goto end;
    e = 0;

    end:
    return e;
}