diff options
Diffstat (limited to 'cvs2svn_lib/record_table.py')
-rw-r--r-- | cvs2svn_lib/record_table.py | 399 |
1 files changed, 0 insertions, 399 deletions
diff --git a/cvs2svn_lib/record_table.py b/cvs2svn_lib/record_table.py deleted file mode 100644 index 41ab84a..0000000 --- a/cvs2svn_lib/record_table.py +++ /dev/null @@ -1,399 +0,0 @@ -# (Be in -*- python -*- mode.) -# -# ==================================================================== -# Copyright (c) 2000-2008 CollabNet. All rights reserved. -# -# This software is licensed as described in the file COPYING, which -# you should have received as part of this distribution. The terms -# are also available at http://subversion.tigris.org/license-1.html. -# If newer versions of this license are posted there, you may use a -# newer version instead, at your option. -# -# This software consists of voluntary contributions made by many -# individuals. For exact contribution history, see the revision -# history and logs, available at http://cvs2svn.tigris.org/. -# ==================================================================== - -"""Classes to manage Databases of fixed-length records. - -The databases map small, non-negative integers to fixed-size records. -The records are written in index order to a disk file. Gaps in the -index sequence leave gaps in the data file, so for best space -efficiency the indexes of existing records should be approximately -continuous. - -To use a RecordTable, you need a class derived from Packer which can -serialize/deserialize your records into fixed-size strings. Deriving -classes have to specify how to pack records into strings and unpack -strings into records by overwriting the pack() and unpack() methods -respectively. - -Note that these classes keep track of gaps in the records that have -been written by filling them with packer.empty_value. If a record is -read which contains packer.empty_value, then a KeyError is raised.""" - - -import os -import types -import struct -import mmap - -from cvs2svn_lib.common import DB_OPEN_READ -from cvs2svn_lib.common import DB_OPEN_WRITE -from cvs2svn_lib.common import DB_OPEN_NEW -from cvs2svn_lib.log import Log - - -# A unique value that can be used to stand for "unset" without -# preventing the use of None. -_unset = object() - - -class Packer(object): - def __init__(self, record_len, empty_value=None): - self.record_len = record_len - if empty_value is None: - self.empty_value = '\0' * self.record_len - else: - assert type(empty_value) is types.StringType - assert len(empty_value) == self.record_len - self.empty_value = empty_value - - def pack(self, v): - """Pack record V into a string of length self.record_len.""" - - raise NotImplementedError() - - def unpack(self, s): - """Unpack string S into a record.""" - - raise NotImplementedError() - - -class StructPacker(Packer): - def __init__(self, format, empty_value=_unset): - self.format = format - if empty_value is not _unset: - empty_value = self.pack(empty_value) - else: - empty_value = None - - Packer.__init__(self, struct.calcsize(self.format), - empty_value=empty_value) - - def pack(self, v): - return struct.pack(self.format, v) - - def unpack(self, v): - return struct.unpack(self.format, v)[0] - - -class UnsignedIntegerPacker(StructPacker): - def __init__(self, empty_value=0): - StructPacker.__init__(self, '=I', empty_value) - - -class SignedIntegerPacker(StructPacker): - def __init__(self, empty_value=0): - StructPacker.__init__(self, '=i', empty_value) - - -class FileOffsetPacker(Packer): - """A packer suitable for file offsets. - - We store the 5 least significant bytes of the file offset. This is - enough bits to represent 1 TiB. Of course if the computer - doesn't have large file support, only the lowest 31 bits can be - nonzero, and the offsets are limited to 2 GiB.""" - - # Convert file offsets to 8-bit little-endian unsigned longs... - INDEX_FORMAT = '<Q' - # ...but then truncate to 5 bytes. - INDEX_FORMAT_LEN = 5 - - PAD = '\0' * (struct.calcsize(INDEX_FORMAT) - INDEX_FORMAT_LEN) - - def __init__(self): - Packer.__init__(self, self.INDEX_FORMAT_LEN) - - def pack(self, v): - return struct.pack(self.INDEX_FORMAT, v)[:self.INDEX_FORMAT_LEN] - - def unpack(self, s): - return struct.unpack(self.INDEX_FORMAT, s + self.PAD)[0] - - -class RecordTableAccessError(RuntimeError): - pass - - -class AbstractRecordTable: - def __init__(self, filename, mode, packer): - self.filename = filename - self.mode = mode - self.packer = packer - # Simplify and speed access to this oft-needed quantity: - self._record_len = self.packer.record_len - - def __str__(self): - return '%s(%r)' % (self.__class__.__name__, self.filename,) - - def _set_packed_record(self, i, s): - """Set the value for index I to the packed value S.""" - - raise NotImplementedError() - - def __setitem__(self, i, v): - self._set_packed_record(i, self.packer.pack(v)) - - def _get_packed_record(self, i): - """Return the packed record for index I. - - Raise KeyError if it is not present.""" - - raise NotImplementedError() - - def __getitem__(self, i): - """Return the item for index I. - - Raise KeyError if that item has never been set (or if it was set - to self.packer.empty_value).""" - - s = self._get_packed_record(i) - - if s == self.packer.empty_value: - raise KeyError(i) - - return self.packer.unpack(s) - - def get_many(self, indexes, default=None): - """Yield (index, item) typles for INDEXES in arbitrary order. - - Yield (index,default) for indices for which not item is defined.""" - - indexes = list(indexes) - # Sort the indexes to reduce disk seeking: - indexes.sort() - for i in indexes: - yield (i, self.get(i, default)) - - def get(self, i, default=None): - try: - return self[i] - except KeyError: - return default - - def __delitem__(self, i): - """Delete the item for index I. - - Raise KeyError if that item has never been set (or if it was set - to self.packer.empty_value).""" - - if self.mode == DB_OPEN_READ: - raise RecordTableAccessError() - - # Check that the value was set (otherwise raise KeyError): - self[i] - self._set_packed_record(i, self.packer.empty_value) - - def iterkeys(self): - """Yield the keys in the map in key order.""" - - for i in xrange(0, self._limit): - try: - self[i] - yield i - except KeyError: - pass - - def itervalues(self): - """Yield the values in the map in key order. - - Skip over values that haven't been defined.""" - - for i in xrange(0, self._limit): - try: - yield self[i] - except KeyError: - pass - - -class RecordTable(AbstractRecordTable): - # The approximate amount of memory that should be used for the cache - # for each instance of this class: - CACHE_MEMORY = 4 * 1024 * 1024 - - # Empirically, each entry in the cache table has an overhead of - # about 96 bytes on a 32-bit computer. - CACHE_OVERHEAD_PER_ENTRY = 96 - - def __init__(self, filename, mode, packer, cache_memory=CACHE_MEMORY): - AbstractRecordTable.__init__(self, filename, mode, packer) - if self.mode == DB_OPEN_NEW: - self.f = open(self.filename, 'wb+') - elif self.mode == DB_OPEN_WRITE: - self.f = open(self.filename, 'rb+') - elif self.mode == DB_OPEN_READ: - self.f = open(self.filename, 'rb') - else: - raise RuntimeError('Invalid mode %r' % self.mode) - self.cache_memory = cache_memory - - # Number of items that can be stored in the write cache. - self._max_memory_cache = ( - self.cache_memory - / (self.CACHE_OVERHEAD_PER_ENTRY + self._record_len)) - - # Read and write cache; a map {i : (dirty, s)}, where i is an - # index, dirty indicates whether the value has to be written to - # disk, and s is the packed value for the index. Up to - # self._max_memory_cache items can be stored here. When the cache - # fills up, it is written to disk in one go and then cleared. - self._cache = {} - - # The index just beyond the last record ever written: - self._limit = os.path.getsize(self.filename) // self._record_len - - # The index just beyond the last record ever written to disk: - self._limit_written = self._limit - - def flush(self): - Log().debug('Flushing cache for %s' % (self,)) - - pairs = [(i, s) for (i, (dirty, s)) in self._cache.items() if dirty] - - if pairs: - pairs.sort() - old_i = None - f = self.f - for (i, s) in pairs: - if i == old_i: - # No seeking needed - pass - elif i <= self._limit_written: - # Just jump there: - f.seek(i * self._record_len) - else: - # Jump to the end of the file then write _empty_values until - # we reach the correct location: - f.seek(self._limit_written * self._record_len) - while self._limit_written < i: - f.write(self.packer.empty_value) - self._limit_written += 1 - f.write(s) - old_i = i + 1 - self._limit_written = max(self._limit_written, old_i) - - self.f.flush() - - self._cache.clear() - - def _set_packed_record(self, i, s): - if self.mode == DB_OPEN_READ: - raise RecordTableAccessError() - if i < 0: - raise KeyError() - self._cache[i] = (True, s) - if len(self._cache) >= self._max_memory_cache: - self.flush() - self._limit = max(self._limit, i + 1) - - def _get_packed_record(self, i): - try: - return self._cache[i][1] - except KeyError: - if not 0 <= i < self._limit_written: - raise KeyError(i) - self.f.seek(i * self._record_len) - s = self.f.read(self._record_len) - self._cache[i] = (False, s) - if len(self._cache) >= self._max_memory_cache: - self.flush() - - return s - - def close(self): - self.flush() - self._cache = None - self.f.close() - self.f = None - - -class MmapRecordTable(AbstractRecordTable): - GROWTH_INCREMENT = 65536 - - def __init__(self, filename, mode, packer): - AbstractRecordTable.__init__(self, filename, mode, packer) - if self.mode == DB_OPEN_NEW: - self.python_file = open(self.filename, 'wb+') - self.python_file.write('\0' * self.GROWTH_INCREMENT) - self.python_file.flush() - self._filesize = self.GROWTH_INCREMENT - self.f = mmap.mmap( - self.python_file.fileno(), self._filesize, - access=mmap.ACCESS_WRITE - ) - - # The index just beyond the last record ever written: - self._limit = 0 - elif self.mode == DB_OPEN_WRITE: - self.python_file = open(self.filename, 'rb+') - self._filesize = os.path.getsize(self.filename) - self.f = mmap.mmap( - self.python_file.fileno(), self._filesize, - access=mmap.ACCESS_WRITE - ) - - # The index just beyond the last record ever written: - self._limit = os.path.getsize(self.filename) // self._record_len - elif self.mode == DB_OPEN_READ: - self.python_file = open(self.filename, 'rb') - self._filesize = os.path.getsize(self.filename) - self.f = mmap.mmap( - self.python_file.fileno(), self._filesize, - access=mmap.ACCESS_READ - ) - - # The index just beyond the last record ever written: - self._limit = os.path.getsize(self.filename) // self._record_len - else: - raise RuntimeError('Invalid mode %r' % self.mode) - - def flush(self): - self.f.flush() - - def _set_packed_record(self, i, s): - if self.mode == DB_OPEN_READ: - raise RecordTableAccessError() - if i < 0: - raise KeyError() - if i >= self._limit: - # This write extends the range of valid indices. First check - # whether the file has to be enlarged: - new_size = (i + 1) * self._record_len - if new_size > self._filesize: - self._filesize = ( - (new_size + self.GROWTH_INCREMENT - 1) - // self.GROWTH_INCREMENT - * self.GROWTH_INCREMENT - ) - self.f.resize(self._filesize) - if i > self._limit: - # Pad up to the new record with empty_value: - self.f[self._limit * self._record_len:i * self._record_len] = \ - self.packer.empty_value * (i - self._limit) - self._limit = i + 1 - - self.f[i * self._record_len:(i + 1) * self._record_len] = s - - def _get_packed_record(self, i): - if not 0 <= i < self._limit: - raise KeyError(i) - return self.f[i * self._record_len:(i + 1) * self._record_len] - - def close(self): - self.flush() - self.f.close() - self.python_file.close() - - |