diff options
Diffstat (limited to 'pdf/pdf_deref.c')
-rw-r--r-- | pdf/pdf_deref.c | 1028 |
1 files changed, 1028 insertions, 0 deletions
diff --git a/pdf/pdf_deref.c b/pdf/pdf_deref.c new file mode 100644 index 00000000..fb3ce67b --- /dev/null +++ b/pdf/pdf_deref.c @@ -0,0 +1,1028 @@ +/* Copyright (C) 2020-2021 Artifex Software, Inc. + All Rights Reserved. + + This software is provided AS-IS with no warranty, either express or + implied. + + This software is distributed under license and may not be copied, + modified or distributed except as expressly authorized under the terms + of the license contained in the file LICENSE in this distribution. + + Refer to licensing information at http://www.artifex.com or contact + Artifex Software, Inc., 1305 Grant Avenue - Suite 200, Novato, + CA 94945, U.S.A., +1(415)492-9861, for further information. +*/ + +/* Functions to deal with dereferencing indirect objects + * for the PDF interpreter. In here we also keep the code + * for dealing with the object cache, because the dereferencing + * functions are currently the only place that deals with it. + */ + +#include "pdf_int.h" +#include "pdf_stack.h" +#include "pdf_loop_detect.h" +#include "strmio.h" +#include "stream.h" +#include "pdf_file.h" +#include "pdf_misc.h" +#include "pdf_dict.h" +#include "pdf_array.h" +#include "pdf_deref.h" +#include "pdf_repair.h" + +/* Start with the object caching functions */ + +/* given an object, create a cache entry for it. If we have too many entries + * then delete the leat-recently-used cache entry. Make the new entry be the + * most-recently-used entry. The actual entries are attached to the xref table + * (as well as being a double-linked list), because we detect an existing + * cache entry by seeing that the xref table for the object number has a non-NULL + * 'cache' member. + * So we need to update the xref as well if we add or delete cache entries. + */ +static int pdfi_add_to_cache(pdf_context *ctx, pdf_obj *o) +{ + pdf_obj_cache_entry *entry; + + if (ctx->xref_table->xref[o->object_num].cache != NULL) { +#if DEBUG_CACHE + dmprintf1(ctx->memory, "Attempting to add object %d to cache when the object is already cached!\n", o->object_num); +#endif + return_error(gs_error_unknownerror); + } + + if (o->object_num > ctx->xref_table->xref_size) + return_error(gs_error_rangecheck); + + if (ctx->cache_entries == MAX_OBJECT_CACHE_SIZE) + { +#if DEBUG_CACHE + dbgmprintf(ctx->memory, "Cache full, evicting LRU\n"); +#endif + if (ctx->cache_LRU) { + entry = ctx->cache_LRU; + ctx->cache_LRU = entry->next; + if (entry->next) + ((pdf_obj_cache_entry *)entry->next)->previous = NULL; + ctx->xref_table->xref[entry->o->object_num].cache = NULL; + pdfi_countdown(entry->o); + ctx->cache_entries--; + gs_free_object(ctx->memory, entry, "pdfi_add_to_cache, free LRU"); + } else + return_error(gs_error_unknownerror); + } + entry = (pdf_obj_cache_entry *)gs_alloc_bytes(ctx->memory, sizeof(pdf_obj_cache_entry), "pdfi_add_to_cache"); + if (entry == NULL) + return_error(gs_error_VMerror); + + memset(entry, 0x00, sizeof(pdf_obj_cache_entry)); + + entry->o = o; + pdfi_countup(o); + if (ctx->cache_MRU) { + entry->previous = ctx->cache_MRU; + ctx->cache_MRU->next = entry; + } + ctx->cache_MRU = entry; + if (ctx->cache_LRU == NULL) + ctx->cache_LRU = entry; + + ctx->cache_entries++; + ctx->xref_table->xref[o->object_num].cache = entry; + return 0; +} + +/* Given an existing cache entry, promote it to be the most-recently-used + * cache entry. + */ +static void pdfi_promote_cache_entry(pdf_context *ctx, pdf_obj_cache_entry *cache_entry) +{ + if (ctx->cache_MRU && cache_entry != ctx->cache_MRU) { + if ((pdf_obj_cache_entry *)cache_entry->next != NULL) + ((pdf_obj_cache_entry *)cache_entry->next)->previous = cache_entry->previous; + if ((pdf_obj_cache_entry *)cache_entry->previous != NULL) + ((pdf_obj_cache_entry *)cache_entry->previous)->next = cache_entry->next; + else { + /* the existing entry is the current least recently used, we need to make the 'next' + * cache entry into the LRU. + */ + ctx->cache_LRU = cache_entry->next; + } + cache_entry->next = NULL; + cache_entry->previous = ctx->cache_MRU; + ctx->cache_MRU->next = cache_entry; + ctx->cache_MRU = cache_entry; + } + return; +} + +/* This one's a bit of an oddity, its used for fonts. When we build a PDF font object + * we want the object cache to reference *that* object, not the dictionary which was + * read out of the PDF file, so this allows us to replace the font dictionary in the + * cache with the actual font object, so that later dereferences will get this font + * object. + */ +int replace_cache_entry(pdf_context *ctx, pdf_obj *o) +{ + xref_entry *entry; + pdf_obj_cache_entry *cache_entry; + pdf_obj *old_cached_obj = NULL; + + /* Limited error checking here, we assume that things like the + * validity of the object (eg not a free oobject) have already been handled. + */ + + entry = &ctx->xref_table->xref[o->object_num]; + cache_entry = entry->cache; + + if (cache_entry == NULL) { + return(pdfi_add_to_cache(ctx, o)); + } else { + /* NOTE: We grab the object without decrementing, to avoid triggering + * a warning message for freeing an object that's in the cache + */ + if (cache_entry->o != NULL) + old_cached_obj = cache_entry->o; + + /* Put new entry in the cache */ + cache_entry->o = o; + pdfi_countup(o); + pdfi_promote_cache_entry(ctx, cache_entry); + + /* Now decrement the old cache entry, if any */ + pdfi_countdown(old_cached_obj); + } + return 0; +} + +/* Now the dereferencing functions */ + +/* + * Technically we can accept a stream other than the main PDF file stream here. This is + * really for the case of compressed objects where we read tokens from the compressed + * stream, but it also (with some judicious tinkering) allows us to layer a SubFileDecode + * on top of the main file stream, which may be useful. Note that this cannot work with + * objects in compressed object streams! They should always pass a value of 0 for the stream_offset. + * The stream_offset is the offset from the start of the underlying uncompressed PDF file of + * the stream we are using. See the comments below when keyword is PDF_STREAM. + */ + +/* Determine if a PDF object is in a compressed ObjStm. Returns < 0 + * for an error, 0 if it is not in a compressed ObjStm and 1 if it is. + * Currently errors are inmpossible. This is only used by the decryption code + * to determine if a string is in a compressed object stream, if it is then + * it can't be used for decryption. + */ +int is_compressed_object(pdf_context *ctx, uint32_t obj, uint32_t gen) +{ + xref_entry *entry; + + /* Can't possibly be a compressed object before we have finished reading + * the xref. + */ + if (ctx->xref_table == NULL) + return 0; + + entry = &ctx->xref_table->xref[obj]; + + if (entry->compressed) + return 1; + + return 0; +} + +/* We should never read a 'stream' keyword from a compressed object stream + * so this case should never end up here. + */ +static int pdfi_read_stream_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_t stream_offset, + uint32_t objnum, uint32_t gen) +{ + int code = 0; + int64_t i; + pdf_keyword *keyword = NULL; + pdf_dict *dict = NULL; + gs_offset_t offset; + pdf_stream *stream_obj = NULL; + + /* Strange code time.... + * If we are using a stream which is *not* the PDF uncompressed main file stream + * then doing stell on it will only tell us how many bytes have been read from + * that stream, it won't tell us the underlying file position. So we add on the + * 'unread' bytes, *and* we add on the position of the start of the stream in + * the actual main file. This is all done so that we can check the /Length + * of the object. Note that this will *only* work for regular objects it can + * not be used for compressed object streams, but those don't need checking anyway + * they have a different mechanism altogether and should never get here. + */ + offset = stell(s->s) - s->unread_size + stream_offset; + code = pdfi_seek(ctx, ctx->main_stream, offset, SEEK_SET); + + if (pdfi_count_stack(ctx) < 1) + return_error(gs_error_stackunderflow); + + dict = (pdf_dict *)ctx->stack_top[-1]; + dict->indirect_num = dict->object_num = objnum; + dict->indirect_gen = dict->generation_num = gen; + + if (dict->type != PDF_DICT) { + pdfi_pop(ctx, 1); + return_error(gs_error_syntaxerror); + } + + /* Convert the dict into a stream */ + code = pdfi_obj_dict_to_stream(ctx, dict, &stream_obj, true); + if (code < 0) { + pdfi_pop(ctx, 1); + return code; + } + /* Pop off the dict and push the stream */ + pdfi_pop(ctx, 1); + dict = NULL; + pdfi_push(ctx, (pdf_obj *)stream_obj); + pdfi_countdown(stream_obj); /* get rid of extra ref */ + + stream_obj->stream_dict->indirect_num = stream_obj->stream_dict->object_num = objnum; + stream_obj->stream_dict->indirect_gen = stream_obj->stream_dict->generation_num = gen; + stream_obj->stream_offset = offset; + + /* This code may be a performance overhead, it simply skips over the stream contents + * and checks that the stream ends with a 'endstream endobj' pair. We could add a + * 'go faster' flag for users who are certain their PDF files are well-formed. This + * could also allow us to skip all kinds of other checking..... + */ + + code = pdfi_dict_get_int(ctx, (pdf_dict *)stream_obj->stream_dict, "Length", &i); + if (code < 0) { + char extra_info[gp_file_name_sizeof]; + + gs_sprintf(extra_info, "Stream object %u missing mandatory keyword /Length, unable to verify the stream length.\n", objnum); + pdfi_set_error(ctx, 0, NULL, E_PDF_BADSTREAM, "pdfi_read_stream_object", extra_info); + return 0; + } + + if (i < 0 || (i + offset)> ctx->main_stream_length) { + char extra_info[gp_file_name_sizeof]; + + gs_sprintf(extra_info, "Stream object %u has /Length which, when added to offset of object, exceeds file size.\n", objnum); + pdfi_set_error(ctx, 0, NULL, E_PDF_BADSTREAM, "pdfi_read_stream_object", extra_info); + } else { + code = pdfi_seek(ctx, ctx->main_stream, i, SEEK_CUR); + if (code < 0) { + pdfi_pop(ctx, 1); + return code; + } + + stream_obj->Length = 0; + stream_obj->length_valid = false; + + code = pdfi_read_token(ctx, ctx->main_stream, objnum, gen); + if (code < 0 || pdfi_count_stack(ctx) < 2) { + char extra_info[gp_file_name_sizeof]; + + gs_sprintf(extra_info, "Failed to find a valid object at end of stream object %u.\n", objnum); + pdfi_log_info(ctx, "pdfi_read_stream_object", extra_info); + } + else { + if (((pdf_obj *)ctx->stack_top[-1])->type != PDF_KEYWORD) { + char extra_info[gp_file_name_sizeof]; + + gs_sprintf(extra_info, "Failed to find 'endstream' keyword at end of stream object %u.\n", objnum); + pdfi_set_error(ctx, 0, NULL, E_PDF_MISSINGENDOBJ, "pdfi_read_stream_object", extra_info); + } else { + keyword = ((pdf_keyword *)ctx->stack_top[-1]); + if (keyword->key != TOKEN_ENDSTREAM) { + char extra_info[gp_file_name_sizeof]; + + gs_sprintf(extra_info, "Stream object %u has an incorrect /Length of %"PRIu64"\n", objnum, i); + pdfi_log_info(ctx, "pdfi_read_stream_object", extra_info); + } else { + /* Cache the Length in the stream object and mark it valid */ + stream_obj->Length = i; + stream_obj->length_valid = true; + } + } + pdfi_pop(ctx, 1); + } + } + + /* If we failed to find a valid object, or the object wasn't a keyword, or the + * keywrod wasn't 'endstream' then the Length is wrong. We need to have the correct + * Length for streams if we have encrypted files, because we must install a + * SubFileDecode filter iwth a Length (EODString is incompatible with AES encryption) + * Rather than mess about checking for encryption, we'll choose to just correctly + * calculate the Length of all streams. Although this takes time, it will only + * happen for files which are invalid. + */ + if (stream_obj->length_valid != true) { + char Buffer[10]; + unsigned int loop, bytes, total = 0; + + code = pdfi_seek(ctx, ctx->main_stream, stream_obj->stream_offset, SEEK_SET); + if (code < 0) { + pdfi_pop(ctx, 1); + return code; + } + memset(Buffer, 0x00, 10); + bytes = pdfi_read_bytes(ctx, (byte *)Buffer, 1, 9, ctx->main_stream); + if (bytes < 9) + return_error(gs_error_ioerror); + + total = bytes; + do { + if (memcmp(Buffer, "endstream", 9) == 0) { + stream_obj->Length = total - 9; + stream_obj->length_valid = true; + break; + } + if (memcmp(Buffer, "endobj", 6) == 0) { + stream_obj->Length = total - 6; + stream_obj->length_valid = true; + break; + } + for (loop = 0;loop < 9;loop++){ + Buffer[loop] = Buffer[loop + 1]; + } + bytes = pdfi_read_bytes(ctx, (byte *)&Buffer[9], 1, 1, ctx->main_stream); + total += bytes; + } while(bytes); + if (bytes <= 0) + return_error(gs_error_ioerror); + return 0; + } + + code = pdfi_read_token(ctx, ctx->main_stream, objnum, gen); + if (code < 0) { + if (ctx->args.pdfstoponerror) + return code; + else + /* Something went wrong looking for endobj, but we found endstream, so assume + * for now that will suffice. + */ + pdfi_set_error(ctx, 0, NULL, E_PDF_MISSINGENDOBJ, "pdfi_read_stream_object", NULL); + return 0; + } + + if (pdfi_count_stack(ctx) < 2) + return_error(gs_error_stackunderflow); + + if (((pdf_obj *)ctx->stack_top[-1])->type != PDF_KEYWORD) { + pdfi_pop(ctx, 1); + if (ctx->args.pdfstoponerror) + return_error(gs_error_typecheck); + pdfi_set_error(ctx, 0, NULL, E_PDF_MISSINGENDOBJ, "pdfi_read_stream_object", NULL); + /* Didn't find an endobj, but we have an endstream, so assume + * for now that will suffice + */ + return 0; + } + keyword = ((pdf_keyword *)ctx->stack_top[-1]); + if (keyword->key != TOKEN_ENDOBJ) { + pdfi_pop(ctx, 2); + return_error(gs_error_typecheck); + } + pdfi_pop(ctx, 1); + return 0; +} + +/* This reads an object *after* the x y obj keyword has been found. Its broken out + * separately for the benefit of the repair code when reading the dictionary following + * the 'trailer' keyword, which does not have a 'obj' keyword. Note that it also does + * not have an 'endobj', we rely on the error handling to take care of that for us. + */ +int pdfi_read_bare_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_t stream_offset, uint32_t objnum, uint32_t gen) +{ + int code = 0; + pdf_keyword *keyword = NULL; + gs_offset_t saved_offset[3]; + + saved_offset[0] = saved_offset[1] = saved_offset[2] = 0; + + code = pdfi_read_token(ctx, s, objnum, gen); + if (code < 0) + return code; + + do { + /* move all the saved offsets up by one */ + saved_offset[0] = saved_offset[1]; + saved_offset[1] = saved_offset[2]; + saved_offset[2] = pdfi_unread_tell(ctx);; + + code = pdfi_read_token(ctx, s, objnum, gen); + if (code < 0) { + pdfi_clearstack(ctx); + return code; + } + if (s->eof) + return_error(gs_error_syntaxerror); + }while (ctx->stack_top[-1]->type != PDF_KEYWORD); + + keyword = ((pdf_keyword *)ctx->stack_top[-1]); + if (keyword->key == TOKEN_ENDOBJ) { + pdf_obj *o; + + if (pdfi_count_stack(ctx) < 2) { + pdfi_clearstack(ctx); + return_error(gs_error_stackunderflow); + } + + o = ctx->stack_top[-2]; + + pdfi_pop(ctx, 1); + + o->indirect_num = o->object_num = objnum; + o->indirect_gen = o->generation_num = gen; + return code; + } + if (keyword->key == TOKEN_STREAM) { + pdfi_pop(ctx, 1); + return pdfi_read_stream_object(ctx, s, stream_offset, objnum, gen); + } + if (keyword->key == TOKEN_OBJ) { + pdf_obj *o; + + pdfi_set_error(ctx, 0, NULL, E_PDF_MISSINGENDOBJ, "pdfi_read_bare_object", NULL); + + /* 4 for; the object we want, the object number, generation number and 'obj' keyword */ + if (pdfi_count_stack(ctx) < 4) + return_error(gs_error_stackunderflow); + + /* If we have that many objects, assume that we can throw away the x y obj and just use the remaining object */ + o = ctx->stack_top[-4]; + + pdfi_pop(ctx, 3); + + o->indirect_num = o->object_num = objnum; + o->indirect_gen = o->generation_num = gen; + if (saved_offset[0] > 0) + (void)pdfi_seek(ctx, s, saved_offset[0], SEEK_SET); + return 0; + } + + /* Assume that any other keyword means a missing 'endobj' */ + if (!ctx->args.pdfstoponerror) { + pdf_obj *o; + + pdfi_set_error(ctx, 0, NULL, E_PDF_MISSINGENDOBJ, "pdfi_read_bare_object", NULL); + + if (pdfi_count_stack(ctx) < 2) + return_error(gs_error_stackunderflow); + + o = ctx->stack_top[-2]; + + pdfi_pop(ctx, 1); + + o->indirect_num = o->object_num = objnum; + o->indirect_gen = o->generation_num = gen; + return code; + } + pdfi_pop(ctx, 2); + return_error(gs_error_syntaxerror); +} + +static int pdfi_read_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_t stream_offset) +{ + int code = 0, stack_size = pdfi_count_stack(ctx); + uint64_t objnum = 0, gen = 0; + pdf_keyword *keyword = NULL; + + /* An object consists of 'num gen obj' followed by a token, follwed by an endobj + * A stream dictionary might have a 'stream' instead of an 'endobj', in which case we + * want to deal with it specially by getting the Length, jumping to the end and checking + * for an endobj. Or not, possibly, because it would be slow. + */ + code = pdfi_read_token(ctx, s, 0, 0); + if (code < 0) + return code; + if (stack_size >= pdfi_count_stack(ctx)) + return gs_note_error(gs_error_ioerror); + if (((pdf_obj *)ctx->stack_top[-1])->type != PDF_INT) { + pdfi_pop(ctx, 1); + return_error(gs_error_typecheck); + } + objnum = ((pdf_num *)ctx->stack_top[-1])->value.i; + pdfi_pop(ctx, 1); + + code = pdfi_read_token(ctx, s, 0, 0); + if (code < 0) + return code; + if (stack_size >= pdfi_count_stack(ctx)) + return gs_note_error(gs_error_ioerror); + if (((pdf_obj *)ctx->stack_top[-1])->type != PDF_INT) { + pdfi_pop(ctx, 1); + return_error(gs_error_typecheck); + } + gen = ((pdf_num *)ctx->stack_top[-1])->value.i; + pdfi_pop(ctx, 1); + + code = pdfi_read_token(ctx, s, 0, 0); + if (code < 0) + return code; + if (stack_size >= pdfi_count_stack(ctx)) + return gs_note_error(gs_error_ioerror); + if (((pdf_obj *)ctx->stack_top[-1])->type != PDF_KEYWORD) { + pdfi_pop(ctx, 1); + return_error(gs_error_typecheck); + } + keyword = ((pdf_keyword *)ctx->stack_top[-1]); + if (keyword->key != TOKEN_OBJ) { + pdfi_pop(ctx, 1); + return_error(gs_error_syntaxerror); + } + pdfi_pop(ctx, 1); + + return pdfi_read_bare_object(ctx, s, stream_offset, objnum, gen); +} + +static int pdfi_deref_compressed(pdf_context *ctx, uint64_t obj, uint64_t gen, pdf_obj **object, + const xref_entry *entry) +{ + int code = 0; + xref_entry *compressed_entry = &ctx->xref_table->xref[entry->u.compressed.compressed_stream_num]; + pdf_c_stream *compressed_stream = NULL; + pdf_c_stream *SubFile_stream = NULL; + pdf_c_stream *Object_stream = NULL; + char Buffer[256]; + int i = 0, object_length = 0; + int64_t num_entries, found_object; + int64_t Length; + gs_offset_t offset = 0; + pdf_stream *compressed_object = NULL; + pdf_dict *compressed_sdict = NULL; /* alias */ + pdf_name *Type = NULL; + pdf_obj *temp_obj; + + if (ctx->args.pdfdebug) { + dmprintf1(ctx->memory, "%% Reading compressed object (%"PRIi64" 0 obj)", obj); + dmprintf1(ctx->memory, " from ObjStm with object number %"PRIi64"\n", compressed_entry->object_num); + } + + if (compressed_entry->cache == NULL) { +#if CACHE_STATISTICS + ctx->compressed_misses++; +#endif + code = pdfi_seek(ctx, ctx->main_stream, compressed_entry->u.uncompressed.offset, SEEK_SET); + if (code < 0) + goto exit; + + code = pdfi_read_object(ctx, ctx->main_stream, 0); + if (code < 0) + goto exit; + + if ((ctx->stack_top[-1])->type != PDF_STREAM) { + pdfi_pop(ctx, 1); + code = gs_note_error(gs_error_typecheck); + goto exit; + } + if (ctx->stack_top[-1]->object_num != compressed_entry->object_num) { + pdfi_pop(ctx, 1); + /* Same error (undefined) as when we read an uncompressed object with the wrong number */ + code = gs_note_error(gs_error_undefined); + goto exit; + } + compressed_object = (pdf_stream *)ctx->stack_top[-1]; + pdfi_countup(compressed_object); + pdfi_pop(ctx, 1); + code = pdfi_add_to_cache(ctx, (pdf_obj *)compressed_object); + if (code < 0) + goto exit; + } else { +#if CACHE_STATISTICS + ctx->compressed_hits++; +#endif + compressed_object = (pdf_stream *)compressed_entry->cache->o; + pdfi_countup(compressed_object); + pdfi_promote_cache_entry(ctx, compressed_entry->cache); + } + code = pdfi_dict_from_obj(ctx, (pdf_obj *)compressed_object, &compressed_sdict); + if (code < 0) + return code; + + /* Check its an ObjStm ! */ + code = pdfi_dict_get_type(ctx, compressed_sdict, "Type", PDF_NAME, (pdf_obj **)&Type); + if (code < 0) + goto exit; + + if (!pdfi_name_is(Type, "ObjStm")){ + code = gs_note_error(gs_error_syntaxerror); + goto exit; + } + + /* Need to check the /N entry to see if the object is actually in this stream! */ + code = pdfi_dict_get_int(ctx, compressed_sdict, "N", &num_entries); + if (code < 0) + goto exit; + + if (num_entries < 0 || num_entries > ctx->xref_table->xref_size) { + code = gs_note_error(gs_error_rangecheck); + goto exit; + } + + code = pdfi_seek(ctx, ctx->main_stream, pdfi_stream_offset(ctx, compressed_object), SEEK_SET); + if (code < 0) + goto exit; + + code = pdfi_dict_get_int(ctx, compressed_sdict, "Length", &Length); + if (code < 0) + goto exit; + + code = pdfi_apply_SubFileDecode_filter(ctx, Length, NULL, ctx->main_stream, &SubFile_stream, false); + if (code < 0) + goto exit; + + code = pdfi_filter(ctx, compressed_object, SubFile_stream, &compressed_stream, false); + if (code < 0) + goto exit; + + for (i=0;i < num_entries;i++) + { + code = pdfi_read_token(ctx, compressed_stream, obj, gen); + if (code < 0) + goto exit; + temp_obj = ctx->stack_top[-1]; + if (temp_obj->type != PDF_INT) { + code = gs_note_error(gs_error_typecheck); + pdfi_pop(ctx, 1); + goto exit; + } + found_object = ((pdf_num *)temp_obj)->value.i; + pdfi_pop(ctx, 1); + code = pdfi_read_token(ctx, compressed_stream, obj, gen); + if (code < 0) + goto exit; + temp_obj = ctx->stack_top[-1]; + if (temp_obj->type != PDF_INT) { + pdfi_pop(ctx, 1); + goto exit; + } + if (i == entry->u.compressed.object_index) { + if (found_object != obj) { + pdfi_pop(ctx, 1); + code = gs_note_error(gs_error_undefined); + goto exit; + } + offset = ((pdf_num *)temp_obj)->value.i; + } + if (i == entry->u.compressed.object_index + 1) + object_length = ((pdf_num *)temp_obj)->value.i - offset; + pdfi_pop(ctx, 1); + } + + /* Skip to the offset of the object we want to read */ + for (i=0;i < offset;i++) + { + code = pdfi_read_bytes(ctx, (byte *)&Buffer[0], 1, 1, compressed_stream); + if (code <= 0) { + code = gs_note_error(gs_error_ioerror); + goto exit; + } + } + + /* If object_length is not 0, then we want to apply a SubFileDecode filter to limit + * the number of bytes we read to the declared size of the object (difference between + * the offsets of the object we want to read, and the next object). If it is 0 then + * we're reading the last object in the stream, so we just rely on the SubFileDecode + * we set up when we created compressed_stream to limit the bytes to the length of + * that stream. + */ + if (object_length > 0) { + code = pdfi_apply_SubFileDecode_filter(ctx, object_length, NULL, compressed_stream, &Object_stream, false); + if (code < 0) + goto exit; + } else { + Object_stream = compressed_stream; + } + + code = pdfi_read_token(ctx, Object_stream, obj, gen); + if (code < 0) + goto exit; + if (ctx->stack_top[-1]->type == PDF_ARRAY_MARK || ctx->stack_top[-1]->type == PDF_DICT_MARK) { + int start_depth = pdfi_count_stack(ctx); + + /* Need to read all the elements from COS objects */ + do { + code = pdfi_read_token(ctx, Object_stream, obj, gen); + if (code < 0) + goto exit; + if (compressed_stream->eof == true) { + code = gs_note_error(gs_error_ioerror); + goto exit; + } + }while ((ctx->stack_top[-1]->type != PDF_ARRAY && ctx->stack_top[-1]->type != PDF_DICT) || pdfi_count_stack(ctx) > start_depth); + } + + *object = ctx->stack_top[-1]; + /* For compressed objects we don't get a 'obj gen obj' sequence which is what sets + * the object number for uncompressed objects. So we need to do that here. + */ + (*object)->indirect_num = (*object)->object_num = obj; + (*object)->indirect_gen = (*object)->generation_num = gen; + pdfi_countup(*object); + pdfi_pop(ctx, 1); + + code = pdfi_add_to_cache(ctx, *object); + if (code < 0) { + pdfi_countdown(*object); + goto exit; + } + + exit: + if (Object_stream) + pdfi_close_file(ctx, Object_stream); + if (Object_stream != compressed_stream) + if (compressed_stream) + pdfi_close_file(ctx, compressed_stream); + if (SubFile_stream) + pdfi_close_file(ctx, SubFile_stream); + pdfi_countdown(compressed_object); + pdfi_countdown(Type); + return code; +} + +/* pdf_dereference returns an object with a reference count of at least 1, this represents the + * reference being held by the caller (in **object) when we return from this function. + */ +int pdfi_dereference(pdf_context *ctx, uint64_t obj, uint64_t gen, pdf_obj **object) +{ + xref_entry *entry; + int code, stack_depth = pdfi_count_stack(ctx); + gs_offset_t saved_stream_offset; + bool saved_decrypt_strings = ctx->encryption.decrypt_strings; + + *object = NULL; + + if (ctx->xref_table == NULL) + return_error(gs_error_typecheck); + + if (obj >= ctx->xref_table->xref_size) { + char extra_info[gp_file_name_sizeof]; + + gs_sprintf(extra_info, "Error, attempted to dereference object %"PRIu64", which is not present in the xref table\n", obj); + pdfi_set_error(ctx, 0, NULL, E_PDF_BADOBJNUMBER, "pdfi_dereference", extra_info); + + if(ctx->args.pdfstoponerror) + return_error(gs_error_rangecheck); + + code = pdfi_object_alloc(ctx, PDF_NULL, 0, object); + if (code == 0) + pdfi_countup(*object); + return code; + } + + entry = &ctx->xref_table->xref[obj]; + + if(entry->object_num == 0) + return_error(gs_error_undefined); + + if (entry->free) { + char extra_info[gp_file_name_sizeof]; + + gs_sprintf(extra_info, "Attempt to dereference free object %"PRIu64", trying next object number as offset.\n", entry->object_num); + pdfi_set_error(ctx, 0, NULL, E_PDF_DEREF_FREE_OBJ, "pdfi_dereference", extra_info); + } + + if (ctx->loop_detection) { + if (pdfi_loop_detector_check_object(ctx, obj) == true) + return_error(gs_error_circular_reference); + } + if (entry->cache != NULL){ + pdf_obj_cache_entry *cache_entry = entry->cache; + +#if CACHE_STATISTICS + ctx->hits++; +#endif + *object = cache_entry->o; + pdfi_countup(*object); + + pdfi_promote_cache_entry(ctx, cache_entry); + } else { + saved_stream_offset = pdfi_unread_tell(ctx); + + if (entry->compressed) { + /* This is an object in a compressed object stream */ + ctx->encryption.decrypt_strings = false; + + code = pdfi_deref_compressed(ctx, obj, gen, object, entry); + if (code < 0 || *object == NULL) + goto error; + } else { + pdf_c_stream *SubFile_stream = NULL; + pdf_string *EODString; +#if CACHE_STATISTICS + ctx->misses++; +#endif + ctx->encryption.decrypt_strings = true; + + code = pdfi_seek(ctx, ctx->main_stream, entry->u.uncompressed.offset, SEEK_SET); + if (code < 0) + goto error; + + code = pdfi_name_alloc(ctx, (byte *)"trailer", 6, (pdf_obj **)&EODString); + if (code < 0) + goto error; + pdfi_countup(EODString); + + code = pdfi_apply_SubFileDecode_filter(ctx, 0, EODString, ctx->main_stream, &SubFile_stream, false); + if (code < 0) { + pdfi_countdown(EODString); + goto error; + } + + code = pdfi_read_object(ctx, SubFile_stream, entry->u.uncompressed.offset); + + pdfi_countdown(EODString); + pdfi_close_file(ctx, SubFile_stream); + if (code < 0) { + int code1 = 0; + if (entry->free) { + dmprintf2(ctx->memory, "Dereference of free object %"PRIu64", next object number as offset failed (code = %d), returning NULL object.\n", entry->object_num, code); + code = pdfi_object_alloc(ctx, PDF_NULL, 1, object); + if (code >= 0) { + pdfi_countup(*object); + goto free_obj; + } + } + ctx->encryption.decrypt_strings = saved_decrypt_strings; + (void)pdfi_seek(ctx, ctx->main_stream, saved_stream_offset, SEEK_SET); + pdfi_pop(ctx, pdfi_count_stack(ctx) - stack_depth); + + code1 = pdfi_repair_file(ctx); + if (code1 == 0) + return pdfi_dereference(ctx, obj, gen, object); + /* Repair failed, just give up and return an error */ + return code; + } + + if (pdfi_count_stack(ctx) > 0 && (ctx->stack_top[-1])->object_num == obj) { + *object = ctx->stack_top[-1]; + pdfi_countup(*object); + pdfi_pop(ctx, 1); + code = pdfi_add_to_cache(ctx, *object); + if (code < 0) { + pdfi_countdown(*object); + goto error; + } + } else { + pdfi_pop(ctx, 1); + if (entry->free) { + dmprintf1(ctx->memory, "Dereference of free object %"PRIu64", next object number as offset failed, returning NULL object.\n", entry->object_num); + code = pdfi_object_alloc(ctx, PDF_NULL, 1, object); + if (code >= 0) + pdfi_countup(*object); + return code; + } + code = gs_note_error(gs_error_undefined); + goto error; + } + } +free_obj: + (void)pdfi_seek(ctx, ctx->main_stream, saved_stream_offset, SEEK_SET); + } + + if (ctx->loop_detection && (*object)->object_num != 0) { + code = pdfi_loop_detector_add_object(ctx, (*object)->object_num); + if (code < 0) { + ctx->encryption.decrypt_strings = saved_decrypt_strings; + return code; + } + } + ctx->encryption.decrypt_strings = saved_decrypt_strings; + return 0; + +error: + ctx->encryption.decrypt_strings = saved_decrypt_strings; + (void)pdfi_seek(ctx, ctx->main_stream, saved_stream_offset, SEEK_SET); + /* Return the stack to the state at entry */ + pdfi_pop(ctx, pdfi_count_stack(ctx) - stack_depth); + return code; +} + +/* do a derefence with loop detection */ +int pdfi_deref_loop_detect(pdf_context *ctx, uint64_t obj, uint64_t gen, pdf_obj **object) +{ + int code; + + code = pdfi_loop_detector_mark(ctx); + if (code < 0) + return code; + + code = pdfi_dereference(ctx, obj, gen, object); + (void)pdfi_loop_detector_cleartomark(ctx); + return code; +} + + +static int pdfi_resolve_indirect_array(pdf_context *ctx, pdf_obj *obj, bool recurse) +{ + int code = 0; + uint64_t index, arraysize; + pdf_obj *object = NULL; + pdf_array *array = (pdf_array *)obj; + + arraysize = pdfi_array_size(array); + for (index = 0; index < arraysize; index++) { + code = pdfi_array_get_no_store_R(ctx, array, index, &object); + if (code == gs_error_circular_reference) { + /* Just leave as an indirect ref */ + code = 0; + } else { + if (code < 0) goto exit; + /* don't store the object if it's a stream (leave as a ref) */ + if (object->type != PDF_STREAM) + code = pdfi_array_put(ctx, array, index, object); + if (recurse) + code = pdfi_resolve_indirect(ctx, object, recurse); + } + if (code < 0) goto exit; + + pdfi_countdown(object); + object = NULL; + } + + exit: + pdfi_countdown(object); + return code; +} + +static int pdfi_resolve_indirect_dict(pdf_context *ctx, pdf_obj *obj, bool recurse) +{ + int code = 0; + pdf_dict *dict = (pdf_dict *)obj; + pdf_name *Key = NULL; + pdf_obj *Value = NULL; + uint64_t index, dictsize; + + dictsize = pdfi_dict_entries(dict); + + /* Note: I am not using pdfi_dict_first/next because of needing to handle + * circular references. + */ + for (index=0; index<dictsize; index ++) { + Key = (pdf_name *)dict->keys[index]; + code = pdfi_dict_get_no_store_R_key(ctx, dict, Key, &Value); + if (code == gs_error_circular_reference) { + /* Just leave as an indirect ref */ + code = 0; + } else { + if (code < 0) goto exit; + /* don't store the object if it's a stream (leave as a ref) */ + if (Value->type != PDF_STREAM) + pdfi_dict_put_obj(ctx, dict, (pdf_obj *)Key, Value); + if (recurse) + code = pdfi_resolve_indirect(ctx, Value, recurse); + } + if (code < 0) goto exit; + + pdfi_countdown(Value); + Value = NULL; + } + + exit: + pdfi_countdown(Value); + return code; +} + +/* Resolve all the indirect references for an object + * Note: This can be recursive + */ +int pdfi_resolve_indirect(pdf_context *ctx, pdf_obj *value, bool recurse) +{ + int code = 0; + + switch(value->type) { + case PDF_ARRAY: + code = pdfi_resolve_indirect_array(ctx, value, recurse); + break; + case PDF_DICT: + code = pdfi_resolve_indirect_dict(ctx, value, recurse); + break; + default: + break; + } + return code; +} + +/* Resolve all the indirect references for an object + * Resolve indirect references, either one level or recursively, with loop detect on + * the parent (can by NULL) and the value. + */ +int pdfi_resolve_indirect_loop_detect(pdf_context *ctx, pdf_obj *parent, pdf_obj *value, bool recurse) +{ + int code = 0; + + code = pdfi_loop_detector_mark(ctx); + if (code < 0) goto exit; + if (parent && parent->object_num != 0) { + code = pdfi_loop_detector_add_object(ctx, parent->object_num); + if (code < 0) goto exit; + } + if (value->object_num != 0) { + code = pdfi_loop_detector_add_object(ctx, value->object_num); + if (code < 0) goto exit; + } + code = pdfi_resolve_indirect(ctx, value, false); + + exit: + (void)pdfi_loop_detector_cleartomark(ctx); /* Clear to the mark for the current loop */ + return code; +} |