Diffstat (limited to 'cvs2svn_lib/collect_data.py')
-rw-r--r--  cvs2svn_lib/collect_data.py  1431
1 file changed, 0 insertions, 1431 deletions
diff --git a/cvs2svn_lib/collect_data.py b/cvs2svn_lib/collect_data.py
deleted file mode 100644
index 160d7b9..0000000
--- a/cvs2svn_lib/collect_data.py
+++ /dev/null
@@ -1,1431 +0,0 @@
-# (Be in -*- python -*- mode.)
-#
-# ====================================================================
-# Copyright (c) 2000-2009 CollabNet. All rights reserved.
-#
-# This software is licensed as described in the file COPYING, which
-# you should have received as part of this distribution. The terms
-# are also available at http://subversion.tigris.org/license-1.html.
-# If newer versions of this license are posted there, you may use a
-# newer version instead, at your option.
-#
-# This software consists of voluntary contributions made by many
-# individuals. For exact contribution history, see the revision
-# history and logs, available at http://cvs2svn.tigris.org/.
-# ====================================================================
-
-"""Data collection classes.
-
-This module contains the code used to collect data from the CVS
-repository. It parses *,v files, recording all useful information
-except for the actual file contents (though even the file contents
-might be recorded by the RevisionRecorder if one is configured).
-
-As a *,v file is parsed, the information pertaining to the file is
-accumulated in memory, mostly in _RevisionData, _BranchData, and
-_TagData objects. When parsing is complete, a final pass is made over
-the data to create some final dependency links, collect statistics,
-etc., then the _*Data objects are converted into CVSItem objects
-(CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
-dumped into databases.
-
-During the data collection, persistent unique ids are allocated to
-many types of objects: CVSFile, Symbol, and CVSItems. CVSItems are a
-special case. CVSItem ids are unique across all CVSItem types, and
-the ids are carried over from the corresponding data collection
-objects:
-
- _RevisionData -> CVSRevision
-
- _BranchData -> CVSBranch
-
- _TagData -> CVSTag
-
-In a later pass it is possible to convert tags <-> branches. But even
-if this occurs, the new branch or tag uses the same id as the old tag
-or branch.
-
-"""
-
-
-import os
-import stat
-import re
-
-from cvs2svn_lib import config
-from cvs2svn_lib.common import DB_OPEN_NEW
-from cvs2svn_lib.common import FatalError
-from cvs2svn_lib.common import warning_prefix
-from cvs2svn_lib.common import error_prefix
-from cvs2svn_lib.common import IllegalSVNPathError
-from cvs2svn_lib.common import verify_svn_filename_legal
-from cvs2svn_lib.log import Log
-from cvs2svn_lib.context import Ctx
-from cvs2svn_lib.artifact_manager import artifact_manager
-from cvs2svn_lib.project import FileInAndOutOfAtticException
-from cvs2svn_lib.cvs_file import CVSPath
-from cvs2svn_lib.cvs_file import CVSDirectory
-from cvs2svn_lib.cvs_file import CVSFile
-from cvs2svn_lib.symbol import Symbol
-from cvs2svn_lib.symbol import Trunk
-from cvs2svn_lib.cvs_item import CVSRevision
-from cvs2svn_lib.cvs_item import CVSBranch
-from cvs2svn_lib.cvs_item import CVSTag
-from cvs2svn_lib.cvs_item import cvs_revision_type_map
-from cvs2svn_lib.cvs_file_items import VendorBranchError
-from cvs2svn_lib.cvs_file_items import CVSFileItems
-from cvs2svn_lib.key_generator import KeyGenerator
-from cvs2svn_lib.cvs_item_database import NewCVSItemStore
-from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
-from cvs2svn_lib.metadata_database import MetadataDatabase
-from cvs2svn_lib.metadata_database import MetadataLogger
-
-import cvs2svn_rcsparse
-
-
-# A regular expression defining "valid" revision numbers (used to
-# check that symbol definitions are reasonable).
-_valid_revision_re = re.compile(r'''
- ^
- (?:\d+\.)+ # Digit groups with trailing dots
- \d+ # And the last digit group.
- $
- ''', re.VERBOSE)
-
-_branch_revision_re = re.compile(r'''
- ^
- ((?:\d+\.\d+\.)+) # A nonzero even number of digit groups w/trailing dot
- (?:0\.)? # CVS sticks an extra 0 here; RCS does not
- (\d+) # And the last digit group
- $
- ''', re.VERBOSE)
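For illustration only (this snippet is not part of the deleted file): define_symbol() further below applies _branch_revision_re.sub(r'\1\2', ...) to canonicalize the "magic" branch numbers that CVS writes into the RCS symbols header, dropping the extra 0 component while leaving already-canonical numbers untouched. A minimal, self-contained sketch:

    import re

    # Same pattern as above: CVS records branch '1.7.2' as '1.7.0.2' in the
    # symbols header, and the substitution removes the bogus 0 component.
    _branch_revision_re = re.compile(r'''
        ^
        ((?:\d+\.\d+\.)+)    # A nonzero even number of digit groups w/trailing dot
        (?:0\.)?             # CVS sticks an extra 0 here; RCS does not
        (\d+)                # And the last digit group
        $
        ''', re.VERBOSE)

    assert _branch_revision_re.sub(r'\1\2', '1.7.0.2') == '1.7.2'     # CVS magic branch number
    assert _branch_revision_re.sub(r'\1\2', '1.7.2') == '1.7.2'       # already-canonical branch number
    assert _branch_revision_re.sub(r'\1\2', '1.7.2.5') == '1.7.2.5'   # ordinary revision, unchanged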
-
-
-def rev_tuple(rev):
- """Return a tuple of integers corresponding to revision number REV.
-
- For example, if REV is '1.2.3.4', then return (1,2,3,4)."""
-
- return tuple([int(x) for x in rev.split('.')])
-
-
-def is_trunk_revision(rev):
- """Return True iff REV is a trunk revision.
-
- REV is a revision number corresponding to a specific revision (i.e.,
- not a whole branch)."""
-
- return rev.count('.') == 1
-
-
-def is_branch_revision_number(rev):
- """Return True iff REV is a branch revision number.
-
- REV is a CVS revision number in canonical form (i.e., with zeros
- removed). Return True iff it refers to a whole branch, as opposed
- to a single revision."""
-
- return rev.count('.') % 2 == 0
-
-
-def is_same_line_of_development(rev1, rev2):
- """Return True if rev1 and rev2 are on the same line of
- development (i.e., both on trunk, or both on the same branch);
- return False otherwise. Either rev1 or rev2 can be None, in
- which case automatically return False."""
-
- if rev1 is None or rev2 is None:
- return False
- if rev1.count('.') == 1 and rev2.count('.') == 1:
- return True
- if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
- return True
- return False
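The helpers above encode the CVS revision-numbering conventions used throughout this module: a trunk revision contains exactly one dot, a branch number has an even number of dots, and two revisions lie on the same line of development when they differ only in their final component. A few illustrative calls (not part of the deleted file; they assume the module is importable as it existed before this commit):

    from cvs2svn_lib.collect_data import (
        rev_tuple,
        is_trunk_revision,
        is_branch_revision_number,
        is_same_line_of_development,
        )

    assert rev_tuple('1.2.3.4') == (1, 2, 3, 4)

    assert is_trunk_revision('1.3')                    # one dot: trunk revision
    assert not is_trunk_revision('1.3.2.4')            # branch revision

    assert is_branch_revision_number('1.3.2')          # even number of dots: a whole branch
    assert not is_branch_revision_number('1.3.2.4')    # odd number of dots: a single revision

    assert is_same_line_of_development('1.2', '1.5')           # both on trunk
    assert is_same_line_of_development('1.1.2.1', '1.1.2.7')   # both on branch 1.1.2
    assert not is_same_line_of_development('1.2', '1.1.2.1')
    assert not is_same_line_of_development('1.2', None)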
-
-
-class _RevisionData:
- """We track the state of each revision so that in set_revision_info,
- we can determine if our op is an add/change/delete. We can do this
- because in set_revision_info, we'll have all of the _RevisionData
- for a file at our fingertips, and we need to examine the state of
- our prev_rev to determine if we're an add or a change. Without the
- state of the prev_rev, we are unable to distinguish between an add
- and a change."""
-
- def __init__(self, cvs_rev_id, rev, timestamp, author, state):
- # The id of this revision:
- self.cvs_rev_id = cvs_rev_id
- self.rev = rev
- self.timestamp = timestamp
- self.author = author
- self.original_timestamp = timestamp
- self.state = state
-
- # If this is the first revision on a branch, then this is the
- # branch_data of that branch; otherwise it is None.
- self.parent_branch_data = None
-
- # The revision number of the parent of this revision along the
- # same line of development, if any. For the first revision R on a
- # branch, we consider the revision from which R sprouted to be the
- # 'parent'. If this is the root revision in the file's revision
- # tree, then this field is None.
- #
- # Note that this revision can't be determined arithmetically (due
- # to cvsadmin -o), which is why this field is necessary.
- self.parent = None
-
- # The revision number of the primary child of this revision (the
- # child along the same line of development), if any; otherwise,
- # None.
- self.child = None
-
- # The _BranchData instances of branches that sprout from this
- # revision, sorted in ascending order by branch number. It would
- # be inconvenient to initialize it here because we would have to
- # scan through all branches known by the _SymbolDataCollector to
- # find the ones having us as the parent. Instead, this
- # information is filled in by
- # _FileDataCollector._resolve_dependencies() and sorted by
- # _FileDataCollector._sort_branches().
- self.branches_data = []
-
- # The revision numbers of the first commits on any branches on
- # which commits occurred. This dependency is kept explicitly
- # because otherwise a revision-only topological sort would miss
- # the dependency that exists via branches_data.
- self.branches_revs_data = []
-
- # The _TagData instances of tags that are connected to this
- # revision.
- self.tags_data = []
-
- # A token that may be returned from
- # RevisionRecorder.record_text(). It can be used by
- # RevisionReader to obtain the text again.
- self.revision_recorder_token = None
-
- def get_first_on_branch_id(self):
- return self.parent_branch_data and self.parent_branch_data.id
-
-
-class _SymbolData:
- """Collection area for information about a symbol in a single CVSFile.
-
- SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
- Tag regardless of whether self is a _BranchData or a _TagData."""
-
- def __init__(self, id, symbol):
- """Initialize an object for SYMBOL."""
-
- # The unique id that will be used for this particular symbol in
- # this particular file. This same id will be used for the CVSItem
- # that is derived from this instance.
- self.id = id
-
- # An instance of Symbol.
- self.symbol = symbol
-
-
-class _BranchData(_SymbolData):
- """Collection area for information about a Branch in a single CVSFile."""
-
- def __init__(self, id, symbol, branch_number):
- _SymbolData.__init__(self, id, symbol)
-
- # The branch number (e.g., '1.5.2') of this branch.
- self.branch_number = branch_number
-
- # The revision number of the revision from which this branch
- # sprouts (e.g., '1.5').
- self.parent = self.branch_number[:self.branch_number.rindex(".")]
-
- # The revision number of the first commit on this branch, if any
- # (e.g., '1.5.2.1'); otherwise, None.
- self.child = None
-
-
-class _TagData(_SymbolData):
- """Collection area for information about a Tag in a single CVSFile."""
-
- def __init__(self, id, symbol, rev):
- _SymbolData.__init__(self, id, symbol)
-
- # The revision number being tagged (e.g., '1.5.2.3').
- self.rev = rev
-
-
-class _SymbolDataCollector(object):
- """Collect information about symbols in a single CVSFile."""
-
- def __init__(self, fdc, cvs_file):
- self.fdc = fdc
- self.cvs_file = cvs_file
-
- self.pdc = self.fdc.pdc
- self.collect_data = self.fdc.collect_data
-
- # A list [(name, revision), ...] of symbols defined in the header
- # of the file. The name has already been transformed using the
- # symbol transform rules. If the symbol transform rules indicate
- # that the symbol should be ignored, then it is never added to
- # this list. This list is processed then deleted in
- # process_symbols().
- self._symbol_defs = []
-
- # A set containing the transformed names of symbols in this file
- # (used to detect duplicates during processing of unlabeled
- # branches):
- self._defined_symbols = set()
-
- # Map { branch_number : _BranchData }, where branch_number has an
- # odd number of digits.
- self.branches_data = { }
-
- # Map { revision : [ tag_data ] }, where revision has an even
- # number of digits, and the value is a list of _TagData objects
- # for tags that apply to that revision.
- self.tags_data = { }
-
- def _add_branch(self, name, branch_number):
- """Record that BRANCH_NUMBER is the branch number for branch NAME,
- and derive and record the revision from which NAME sprouts.
- BRANCH_NUMBER is an RCS branch number with an odd number of
- components, for example '1.7.2' (never '1.7.0.2'). Return the
- _BranchData instance (which is usually newly-created)."""
-
- branch_data = self.branches_data.get(branch_number)
-
- if branch_data is not None:
- Log().warn(
- "%s: in '%s':\n"
- " branch '%s' already has name '%s',\n"
- " cannot also have name '%s', ignoring the latter\n"
- % (warning_prefix,
- self.cvs_file.filename, branch_number,
- branch_data.symbol.name, name)
- )
- return branch_data
-
- symbol = self.pdc.get_symbol(name)
- branch_data = _BranchData(
- self.collect_data.item_key_generator.gen_id(), symbol, branch_number
- )
- self.branches_data[branch_number] = branch_data
- return branch_data
-
- def _construct_distinct_name(self, name, original_name):
- """Construct a distinct symbol name from NAME.
-
- If NAME is distinct, return it. If it is already used in this
- file (as determined from its presence in self._defined_symbols),
- construct and return a new name that is not already used."""
-
- if name not in self._defined_symbols:
- return name
- else:
- index = 1
- while True:
- dup_name = '%s-DUPLICATE-%d' % (name, index,)
- if dup_name not in self._defined_symbols:
- self.collect_data.record_fatal_error(
- "Symbol name '%s' is already used in '%s'.\n"
- "The unlabeled branch '%s' must be renamed using "
- "--symbol-transform."
- % (name, self.cvs_file.filename, original_name,)
- )
- return dup_name
-
- def _add_unlabeled_branch(self, branch_number):
- original_name = "unlabeled-" + branch_number
- name = self.transform_symbol(original_name, branch_number)
- if name is None:
- self.collect_data.record_fatal_error(
- "The unlabeled branch '%s' in '%s' contains commits.\n"
- "It may not be ignored via a symbol transform. (Use --exclude "
- "instead.)"
- % (original_name, self.cvs_file.filename,)
- )
- # Retain the original name to allow the conversion to continue:
- name = original_name
-
- distinct_name = self._construct_distinct_name(name, original_name)
- self._defined_symbols.add(distinct_name)
- return self._add_branch(distinct_name, branch_number)
-
- def _add_tag(self, name, revision):
- """Record that tag NAME refers to the specified REVISION."""
-
- symbol = self.pdc.get_symbol(name)
- tag_data = _TagData(
- self.collect_data.item_key_generator.gen_id(), symbol, revision
- )
- self.tags_data.setdefault(revision, []).append(tag_data)
- return tag_data
-
- def transform_symbol(self, name, revision):
- """Transform a symbol according to the project's symbol transforms.
-
- Transform the symbol with the original name NAME and canonicalized
- revision number REVISION. Return the new symbol name or None if
- the symbol should be ignored entirely.
-
- Log the results of the symbol transform if necessary."""
-
- old_name = name
- # Apply any user-defined symbol transforms to the symbol name:
- name = self.cvs_file.project.transform_symbol(
- self.cvs_file, name, revision
- )
-
- if name is None:
- # Ignore symbol:
- self.pdc.log_symbol_transform(old_name, None)
- Log().verbose(
- " symbol '%s'=%s ignored in %s"
- % (old_name, revision, self.cvs_file.filename,)
- )
- else:
- if name != old_name:
- self.pdc.log_symbol_transform(old_name, name)
- Log().verbose(
- " symbol '%s'=%s transformed to '%s' in %s"
- % (old_name, revision, name, self.cvs_file.filename,)
- )
-
- return name
-
- def define_symbol(self, name, revision):
- """Record a symbol definition for later processing."""
-
- # Canonicalize the revision number:
- revision = _branch_revision_re.sub(r'\1\2', revision)
-
- # Apply any user-defined symbol transforms to the symbol name:
- name = self.transform_symbol(name, revision)
-
- if name is not None:
- # Verify that the revision number is valid:
- if _valid_revision_re.match(revision):
- # The revision number is valid; record it for later processing:
- self._symbol_defs.append( (name, revision) )
- else:
- Log().warn(
- 'In %r:\n'
- ' branch %r references invalid revision %s\n'
- ' and will be ignored.'
- % (self.cvs_file.filename, name, revision,)
- )
-
- def _eliminate_trivial_duplicate_defs(self, symbol_defs):
- """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions.
-
- Duplicate definitions of symbol names have been seen in the wild,
- and they can also happen when --symbol-transform is used. If a
- symbol is defined to the same revision number repeatedly, then
- ignore all but the last definition."""
-
- # Make a copy, since we have to iterate through the definitions
- # twice:
- symbol_defs = list(symbol_defs)
-
- # A map { (name, revision) : [index,...] } of the indexes where
- # symbol definitions name=revision were found:
- known_definitions = {}
- for (i, symbol_def) in enumerate(symbol_defs):
- known_definitions.setdefault(symbol_def, []).append(i)
-
- # A set of the indexes of entries that have to be removed from
- # symbol_defs:
- dup_indexes = set()
- for ((name, revision), indexes) in known_definitions.iteritems():
- if len(indexes) > 1:
- Log().verbose(
- "in %r:\n"
- " symbol %s:%s defined multiple times; ignoring duplicates\n"
- % (self.cvs_file.filename, name, revision,)
- )
- dup_indexes.update(indexes[:-1])
-
- for (i, symbol_def) in enumerate(symbol_defs):
- if i not in dup_indexes:
- yield symbol_def
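To make the rule concrete, here is a hypothetical symbol list (not from the original file) run through the generator above; identical (name, revision) pairs collapse to a single definition, keeping only the last occurrence while preserving the order of the survivors:

    # symbol_defs as they might arrive from the RCS symbols header:
    #   [('RELEASE_1', '1.3'), ('BETA', '1.2'), ('RELEASE_1', '1.3')]
    #
    # _eliminate_trivial_duplicate_defs() yields, in order:
    #   [('BETA', '1.2'), ('RELEASE_1', '1.3')]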
-
- def _process_duplicate_defs(self, symbol_defs):
- """Iterate through SYMBOL_DEFS, processing duplicate names.
-
- Duplicate definitions of symbol names have been seen in the wild,
- and they can also happen when --symbol-transform is used. If a
- symbol is defined multiple times, then it is a fatal error. This
- method should be called after _eliminate_trivial_duplicate_defs()."""
-
- # Make a copy, since we have to access multiple times:
- symbol_defs = list(symbol_defs)
-
- # A map {name : [index,...]} mapping the names of symbols to a
- # list of their definitions' indexes in symbol_defs:
- known_symbols = {}
- for (i, (name, revision)) in enumerate(symbol_defs):
- known_symbols.setdefault(name, []).append(i)
-
- known_symbols = known_symbols.items()
- known_symbols.sort()
- dup_indexes = set()
- for (name, indexes) in known_symbols:
- if len(indexes) > 1:
- # This symbol was defined multiple times.
- self.collect_data.record_fatal_error(
- "Multiple definitions of the symbol '%s' in '%s': %s" % (
- name, self.cvs_file.filename,
- ' '.join([symbol_defs[i][1] for i in indexes]),
- )
- )
- # Ignore all but the last definition for now, to allow the
- # conversion to proceed:
- dup_indexes.update(indexes[:-1])
-
- for (i, symbol_def) in enumerate(symbol_defs):
- if i not in dup_indexes:
- yield symbol_def
-
- def _process_symbol(self, name, revision):
- """Process a symbol called NAME, which is associated with REVISON.
-
- REVISION is a canonical revision number with zeros removed, for
- example: '1.7', '1.7.2', '1.1.1', or '1.1.1.1'. NAME is a
- transformed branch or tag name."""
-
- # Add symbol to our records:
- if is_branch_revision_number(revision):
- self._add_branch(name, revision)
- else:
- self._add_tag(name, revision)
-
- def process_symbols(self):
- """Process the symbol definitions from SELF._symbol_defs."""
-
- symbol_defs = self._symbol_defs
- del self._symbol_defs
-
- symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
- symbol_defs = self._process_duplicate_defs(symbol_defs)
-
- for (name, revision) in symbol_defs:
- self._defined_symbols.add(name)
- self._process_symbol(name, revision)
-
- @staticmethod
- def rev_to_branch_number(revision):
- """Return the branch_number of the branch on which REVISION lies.
-
- REVISION is a branch revision number with an even number of
- components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
- The return value is the branch number (for example, '1.7.2').
- Return None iff REVISION is a trunk revision such as '1.2'."""
-
- if is_trunk_revision(revision):
- return None
- return revision[:revision.rindex(".")]
-
- def rev_to_branch_data(self, revision):
- """Return the branch_data of the branch on which REVISION lies.
-
- REVISION must be a branch revision number with an even number of
- components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
- Raise KeyError iff REVISION is unknown."""
-
- assert not is_trunk_revision(revision)
-
- return self.branches_data[self.rev_to_branch_number(revision)]
-
- def rev_to_lod(self, revision):
- """Return the line of development on which REVISION lies.
-
- REVISION must be a revision number with an even number of
- components. Raise KeyError iff REVISION is unknown."""
-
- if is_trunk_revision(revision):
- return self.pdc.trunk
- else:
- return self.rev_to_branch_data(revision).symbol
-
-
-class _FileDataCollector(cvs2svn_rcsparse.Sink):
- """Class responsible for collecting RCS data for a particular file.
-
- Any collected data that need to be remembered are stored into the
- referenced CollectData instance."""
-
- def __init__(self, pdc, cvs_file):
- """Create an object that is prepared to receive data for CVS_FILE.
- CVS_FILE is a CVSFile instance. COLLECT_DATA is used to store the
- information collected about the file."""
-
- self.pdc = pdc
- self.cvs_file = cvs_file
-
- self.collect_data = self.pdc.collect_data
- self.project = self.cvs_file.project
-
- # A place to store information about the symbols in this file:
- self.sdc = _SymbolDataCollector(self, self.cvs_file)
-
- # { revision : _RevisionData instance }
- self._rev_data = { }
-
- # Lists [ (parent, child) ] of revision number pairs indicating
- # that revision child depends on revision parent along the main
- # line of development.
- self._primary_dependencies = []
-
- # If set, this is an RCS branch number -- rcsparse calls this the
- # "principal branch", but CVS and RCS refer to it as the "default
- # branch", so that's what we call it, even though the rcsparse API
- # setter method is still 'set_principal_branch'.
- self.default_branch = None
-
- # True iff revision 1.1 of the file appears to have been imported
- # (as opposed to added normally).
- self._file_imported = False
-
- def _get_rev_id(self, revision):
- if revision is None:
- return None
- return self._rev_data[revision].cvs_rev_id
-
- def set_principal_branch(self, branch):
- """This is a callback method declared in Sink."""
-
- if branch.find('.') == -1:
- # This just sets the default branch to trunk. Normally this
- # shouldn't occur, but it has been seen in at least one CVS
- # repository. Just ignore it.
- pass
- else:
- self.default_branch = branch
-
- def set_expansion(self, mode):
- """This is a callback method declared in Sink."""
-
- self.cvs_file.mode = mode
-
- def define_tag(self, name, revision):
- """Remember the symbol name and revision, but don't process them yet.
-
- This is a callback method declared in Sink."""
-
- self.sdc.define_symbol(name, revision)
-
- def admin_completed(self):
- """This is a callback method declared in Sink."""
-
- self.sdc.process_symbols()
-
- def define_revision(self, revision, timestamp, author, state,
- branches, next):
- """This is a callback method declared in Sink."""
-
- for branch in branches:
- try:
- branch_data = self.sdc.rev_to_branch_data(branch)
- except KeyError:
- # Normally we learn about the branches from the branch names
- # and numbers parsed from the symbolic name header. But this
- # must have been an unlabeled branch that slipped through the
- # net. Generate a name for it and create a _BranchData record
- # for it now.
- branch_data = self.sdc._add_unlabeled_branch(
- self.sdc.rev_to_branch_number(branch))
-
- assert branch_data.child is None
- branch_data.child = branch
-
- if revision in self._rev_data:
- # This revision has already been seen.
- Log().error('File %r contains duplicate definitions of revision %s.'
- % (self.cvs_file.filename, revision,))
- raise RuntimeError
-
- # Record basic information about the revision:
- rev_data = _RevisionData(
- self.collect_data.item_key_generator.gen_id(),
- revision, int(timestamp), author, state)
- self._rev_data[revision] = rev_data
-
- # When on trunk, the RCS 'next' revision number points to what
- # humans might consider to be the 'previous' revision number. For
- # example, 1.3's RCS 'next' is 1.2.
- #
- # However, on a branch, the RCS 'next' revision number really does
- # point to what humans would consider to be the 'next' revision
- # number. For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
- #
- # In other words, in RCS, 'next' always means "where to find the next
- # deltatext that you need this revision to retrieve."
- #
- # That said, we don't *want* RCS's behavior here, so we determine
- # whether we're on trunk or a branch and set the dependencies
- # accordingly.
- if next:
- if is_trunk_revision(revision):
- self._primary_dependencies.append( (next, revision,) )
- else:
- self._primary_dependencies.append( (revision, next,) )
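A short sketch of the rule implemented just above (not part of the deleted file; the helper name _dependency_pair is hypothetical): RCS's 'next' field points to the older revision on trunk but to the newer revision on a branch, so the pair appended to _primary_dependencies is flipped accordingly so that it always reads (parent, child):

    # Assumes the module is importable as it existed before this commit.
    from cvs2svn_lib.collect_data import is_trunk_revision

    def _dependency_pair(revision, next):
        # Mirrors the trunk/branch logic at the end of define_revision() above.
        if is_trunk_revision(revision):
            return (next, revision)    # trunk: RCS 'next' is the older revision
        else:
            return (revision, next)    # branch: RCS 'next' is the newer revision

    assert _dependency_pair('1.3', '1.2') == ('1.2', '1.3')
    assert _dependency_pair('1.1.2.1', '1.1.2.2') == ('1.1.2.1', '1.1.2.2')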
-
- def _resolve_primary_dependencies(self):
- """Resolve the dependencies listed in self._primary_dependencies."""
-
- for (parent, child,) in self._primary_dependencies:
- parent_data = self._rev_data[parent]
- assert parent_data.child is None
- parent_data.child = child
-
- child_data = self._rev_data[child]
- assert child_data.parent is None
- child_data.parent = parent
-
- def _resolve_branch_dependencies(self):
- """Resolve dependencies involving branches."""
-
- for branch_data in self.sdc.branches_data.values():
- # The branch_data's parent has the branch as a child regardless
- # of whether the branch had any subsequent commits:
- try:
- parent_data = self._rev_data[branch_data.parent]
- except KeyError:
- Log().warn(
- 'In %r:\n'
- ' branch %r references non-existing revision %s\n'
- ' and will be ignored.'
- % (self.cvs_file.filename, branch_data.symbol.name,
- branch_data.parent,))
- del self.sdc.branches_data[branch_data.branch_number]
- else:
- parent_data.branches_data.append(branch_data)
-
- # If the branch has a child (i.e., something was committed on
- # the branch), then we store a reference to the branch_data
- # there, define the child's parent to be the branch's parent,
- # and list the child in the branch parent's branches_revs_data:
- if branch_data.child is not None:
- child_data = self._rev_data[branch_data.child]
- assert child_data.parent_branch_data is None
- child_data.parent_branch_data = branch_data
- assert child_data.parent is None
- child_data.parent = branch_data.parent
- parent_data.branches_revs_data.append(branch_data.child)
-
- def _sort_branches(self):
- """Sort the branches sprouting from each revision in creation order.
-
- Creation order is taken to be the reverse of the order that they
- are listed in the symbols part of the RCS file. (If a branch is
- created then deleted, a later branch can be assigned the recycled
- branch number; therefore branch numbers are not an indication of
- creation order.)"""
-
- for rev_data in self._rev_data.values():
- rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))
-
- def _resolve_tag_dependencies(self):
- """Resolve dependencies involving tags."""
-
- for (rev, tag_data_list) in self.sdc.tags_data.items():
- try:
- parent_data = self._rev_data[rev]
- except KeyError:
- Log().warn(
- 'In %r:\n'
- ' the following tag(s) reference non-existing revision %s\n'
- ' and will be ignored:\n'
- ' %s' % (
- self.cvs_file.filename, rev,
- ', '.join([repr(tag_data.symbol.name)
- for tag_data in tag_data_list]),))
- del self.sdc.tags_data[rev]
- else:
- for tag_data in tag_data_list:
- assert tag_data.rev == rev
- # The tag_data's rev has the tag as a child:
- parent_data.tags_data.append(tag_data)
-
- def _determine_operation(self, rev_data):
- prev_rev_data = self._rev_data.get(rev_data.parent)
- return cvs_revision_type_map[(
- rev_data.state != 'dead',
- prev_rev_data is not None and prev_rev_data.state != 'dead',
- )]
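_determine_operation() keys cvs_revision_type_map (imported from cvs2svn_lib.cvs_item) on two booleans: whether this revision is alive (state != 'dead') and whether its predecessor is alive. The intended meaning of the four combinations is summarized below; the concrete CVSRevision subclasses returned by the map are defined in cvs_item and are not reproduced here:

    # (this revision alive, previous revision alive) -> kind of operation
    #   (True,  False)   the revision brings the file into existence (an add)
    #   (True,  True)    the revision modifies an existing file (a change)
    #   (False, True)    the revision removes the file (a delete)
    #   (False, False)   the file was dead and stays dead (no effective operation)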
-
- def _get_cvs_revision(self, rev_data):
- """Create and return a CVSRevision for REV_DATA."""
-
- branch_ids = [
- branch_data.id
- for branch_data in rev_data.branches_data
- ]
-
- branch_commit_ids = [
- self._get_rev_id(rev)
- for rev in rev_data.branches_revs_data
- ]
-
- tag_ids = [
- tag_data.id
- for tag_data in rev_data.tags_data
- ]
-
- revision_type = self._determine_operation(rev_data)
-
- return revision_type(
- self._get_rev_id(rev_data.rev), self.cvs_file,
- rev_data.timestamp, None,
- self._get_rev_id(rev_data.parent),
- self._get_rev_id(rev_data.child),
- rev_data.rev,
- True,
- self.sdc.rev_to_lod(rev_data.rev),
- rev_data.get_first_on_branch_id(),
- False, None, None,
- tag_ids, branch_ids, branch_commit_ids,
- rev_data.revision_recorder_token)
-
- def _get_cvs_revisions(self):
- """Generate the CVSRevisions present in this file."""
-
- for rev_data in self._rev_data.itervalues():
- yield self._get_cvs_revision(rev_data)
-
- def _get_cvs_branches(self):
- """Generate the CVSBranches present in this file."""
-
- for branch_data in self.sdc.branches_data.values():
- yield CVSBranch(
- branch_data.id, self.cvs_file, branch_data.symbol,
- branch_data.branch_number,
- self.sdc.rev_to_lod(branch_data.parent),
- self._get_rev_id(branch_data.parent),
- self._get_rev_id(branch_data.child),
- None,
- )
-
- def _get_cvs_tags(self):
- """Generate the CVSTags present in this file."""
-
- for tags_data in self.sdc.tags_data.values():
- for tag_data in tags_data:
- yield CVSTag(
- tag_data.id, self.cvs_file, tag_data.symbol,
- self.sdc.rev_to_lod(tag_data.rev),
- self._get_rev_id(tag_data.rev),
- None,
- )
-
- def tree_completed(self):
- """The revision tree has been parsed.
-
- Analyze it for consistency and connect some loose ends.
-
- This is a callback method declared in Sink."""
-
- self._resolve_primary_dependencies()
- self._resolve_branch_dependencies()
- self._sort_branches()
- self._resolve_tag_dependencies()
-
- # Compute the preliminary CVSFileItems for this file:
- cvs_items = []
- cvs_items.extend(self._get_cvs_revisions())
- cvs_items.extend(self._get_cvs_branches())
- cvs_items.extend(self._get_cvs_tags())
- self._cvs_file_items = CVSFileItems(
- self.cvs_file, self.pdc.trunk, cvs_items
- )
-
- self._cvs_file_items.check_link_consistency()
-
- # Tell the revision recorder about the file dependency tree.
- self.collect_data.revision_recorder.start_file(self._cvs_file_items)
-
- def set_revision_info(self, revision, log, text):
- """This is a callback method declared in Sink."""
-
- rev_data = self._rev_data[revision]
- cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]
-
- if cvs_rev.metadata_id is not None:
- # Users have reported problems with repositories in which the
- # deltatext block for revision 1.1 appears twice. It is not
- # known whether this results from a CVS/RCS bug, or from botched
- # hand-editing of the repository. In any case, empirically, cvs
- # and rcs both use the first version when checking out data, so
- # that's what we will do. (For the record: "cvs log" fails on
- # such a file; "rlog" prints the log message from the first
- # block and ignores the second one.)
- Log().warn(
- "%s: in '%s':\n"
- " Deltatext block for revision %s appeared twice;\n"
- " ignoring the second occurrence.\n"
- % (warning_prefix, self.cvs_file.filename, revision,)
- )
- return
-
- if is_trunk_revision(revision):
- branch_name = None
- else:
- branch_name = self.sdc.rev_to_branch_data(revision).symbol.name
-
- cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
- self.project, branch_name, rev_data.author, log
- )
- cvs_rev.deltatext_exists = bool(text)
-
- # If this is revision 1.1, determine whether the file appears to
- # have been created via 'cvs add' instead of 'cvs import'. The
- # test is that the log message CVS uses for 1.1 in imports is
- # "Initial revision\n" with no period. (This fact helps determine
- # whether this file might have had a default branch in the past.)
- if revision == '1.1':
- self._file_imported = (log == 'Initial revision\n')
-
- cvs_rev.revision_recorder_token = \
- self.collect_data.revision_recorder.record_text(cvs_rev, log, text)
-
- def parse_completed(self):
- """Finish the processing of this file.
-
- This is a callback method declared in Sink."""
-
- # Make sure that there was an info section for each revision:
- for cvs_item in self._cvs_file_items.values():
- if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
- self.collect_data.record_fatal_error(
- '%r has no deltatext section for revision %s'
- % (self.cvs_file.filename, cvs_item.rev,)
- )
-
- def _process_ntdbrs(self):
- """Fix up any non-trunk default branch revisions (if present).
-
- If a non-trunk default branch is determined to have existed, yield
- the _RevisionData.ids for all revisions that were once non-trunk
- default revisions, in dependency order.
-
- There are two cases to handle:
-
- One case is simple. The RCS file lists a default branch
- explicitly in its header, such as '1.1.1'. In this case, we know
- that every revision on the vendor branch is to be treated as head
- of trunk at that point in time.
-
- But there's also a degenerate case. The RCS file does not
- currently have a default branch, yet we can deduce that for some
- period in the past it probably *did* have one. For example, the
- file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
- dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
- after 1.2. In this case, we should record 1.1.1.96 as the last
- vendor revision to have been the head of the default branch.
-
- If any non-trunk default branch revisions are found:
-
- - Set their ntdbr members to True.
-
- - Connect the last one with revision 1.2.
-
- - Remove revision 1.1 if it is not needed.
-
- """
-
- try:
- if self.default_branch:
- vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
- vendor_lod_items = self._cvs_file_items.get_lod_items(
- self._cvs_file_items[vendor_cvs_branch_id]
- )
- if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
- return
- elif self._file_imported:
- vendor_branch_data = self.sdc.branches_data.get('1.1.1')
- if vendor_branch_data is None:
- return
- else:
- vendor_lod_items = self._cvs_file_items.get_lod_items(
- self._cvs_file_items[vendor_branch_data.id]
- )
- if not self._cvs_file_items.process_historical_ntdb(
- vendor_lod_items
- ):
- return
- else:
- return
- except VendorBranchError, e:
- self.collect_data.record_fatal_error(str(e))
- return
-
- if self._file_imported:
- self._cvs_file_items.imported_remove_1_1(vendor_lod_items)
-
- self._cvs_file_items.check_link_consistency()
-
- def get_cvs_file_items(self):
- """Finish up and return a CVSFileItems instance for this file.
-
- This method must only be called once."""
-
- self._process_ntdbrs()
-
- # Break a circular reference loop, allowing the memory for self
- # and sdc to be freed.
- del self.sdc
-
- return self._cvs_file_items
-
-
-class _ProjectDataCollector:
- def __init__(self, collect_data, project):
- self.collect_data = collect_data
- self.project = project
- self.num_files = 0
-
- # The Trunk LineOfDevelopment object for this project:
- self.trunk = Trunk(
- self.collect_data.symbol_key_generator.gen_id(), self.project
- )
- self.project.trunk_id = self.trunk.id
-
- # This causes a record for self.trunk to spring into existence:
- self.collect_data.symbol_stats[self.trunk]
-
- # A map { name -> Symbol } for all known symbols in this project.
- # The symbols listed here are undifferentiated into Branches and
- # Tags because the same name might appear as a branch in one file
- # and a tag in another.
- self.symbols = {}
-
- # A map { (old_name, new_name) : count } indicating how many files
- # were affected by each symbol name transformation:
- self.symbol_transform_counts = {}
-
- def get_symbol(self, name):
- """Return the Symbol object for the symbol named NAME in this project.
-
- If such a symbol does not yet exist, allocate a new symbol_id,
- create a Symbol instance, store it in self.symbols, and return it."""
-
- symbol = self.symbols.get(name)
- if symbol is None:
- symbol = Symbol(
- self.collect_data.symbol_key_generator.gen_id(),
- self.project, name)
- self.symbols[name] = symbol
- return symbol
-
- def log_symbol_transform(self, old_name, new_name):
- """Record that OLD_NAME was transformed to NEW_NAME in one file.
-
- This information is used to generate a statistical summary of
- symbol transforms."""
-
- try:
- self.symbol_transform_counts[old_name, new_name] += 1
- except KeyError:
- self.symbol_transform_counts[old_name, new_name] = 1
-
- def summarize_symbol_transforms(self):
- if self.symbol_transform_counts and Log().is_on(Log.NORMAL):
- log = Log()
- log.normal('Summary of symbol transforms:')
- transforms = self.symbol_transform_counts.items()
- transforms.sort()
- for ((old_name, new_name), count) in transforms:
- if new_name is None:
- log.normal(' "%s" ignored in %d files' % (old_name, count,))
- else:
- log.normal(
- ' "%s" transformed to "%s" in %d files'
- % (old_name, new_name, count,)
- )
-
- def _process_cvs_file_items(self, cvs_file_items):
- """Process the CVSFileItems from one CVSFile."""
-
- # Remove CVSRevisionDeletes that are not needed:
- cvs_file_items.remove_unneeded_deletes(self.collect_data.metadata_db)
-
- # Remove initial branch deletes that are not needed:
- cvs_file_items.remove_initial_branch_deletes(
- self.collect_data.metadata_db
- )
-
- # If this is a --trunk-only conversion, discard all branches and
- # tags, then draft any non-trunk default branch revisions to
- # trunk:
- if Ctx().trunk_only:
- cvs_file_items.exclude_non_trunk()
-
- self.collect_data.revision_recorder.finish_file(cvs_file_items)
- self.collect_data.add_cvs_file_items(cvs_file_items)
- self.collect_data.symbol_stats.register(cvs_file_items)
-
- def process_file(self, cvs_file):
- Log().normal(cvs_file.filename)
- fdc = _FileDataCollector(self, cvs_file)
- try:
- cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc)
- except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
- self.collect_data.record_fatal_error(
- "%r is not a valid ,v file" % (cvs_file.filename,)
- )
- # Abort the processing of this file, but let the pass continue
- # with other files:
- return
- except:
- Log().warn("Exception occurred while parsing %s" % cvs_file.filename)
- raise
- else:
- self.num_files += 1
-
- cvs_file_items = fdc.get_cvs_file_items()
-
- del fdc
-
- self._process_cvs_file_items(cvs_file_items)
-
-
-class CollectData:
- """Repository for data collected by parsing the CVS repository files.
-
- This class manages the databases into which information collected
- from the CVS repository is stored. The data are stored into this
- class by _FileDataCollector instances, one of which is created for
- each file to be parsed."""
-
- def __init__(self, revision_recorder, stats_keeper):
- self.revision_recorder = revision_recorder
- self._cvs_item_store = NewCVSItemStore(
- artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
- self.metadata_db = MetadataDatabase(
- artifact_manager.get_temp_file(config.METADATA_STORE),
- artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
- DB_OPEN_NEW,
- )
- self.metadata_logger = MetadataLogger(self.metadata_db)
- self.fatal_errors = []
- self.num_files = 0
- self.symbol_stats = SymbolStatisticsCollector()
- self.stats_keeper = stats_keeper
-
- # Key generator for CVSFiles:
- self.file_key_generator = KeyGenerator()
-
- # Key generator for CVSItems:
- self.item_key_generator = KeyGenerator()
-
- # Key generator for Symbols:
- self.symbol_key_generator = KeyGenerator()
-
- self.revision_recorder.start()
-
- def record_fatal_error(self, err):
- """Record that fatal error ERR was found.
-
- ERR is a string (without trailing newline) describing the error.
- Output the error to stderr immediately, and record a copy to be
- output again in a summary at the end of CollectRevsPass."""
-
- err = '%s: %s' % (error_prefix, err,)
- Log().error(err + '\n')
- self.fatal_errors.append(err)
-
- def add_cvs_directory(self, cvs_directory):
- """Record CVS_DIRECTORY."""
-
- Ctx()._cvs_file_db.log_file(cvs_directory)
-
- def add_cvs_file_items(self, cvs_file_items):
- """Record the information from CVS_FILE_ITEMS.
-
- Store the CVSFile to _cvs_file_db under its persistent id, store
- the CVSItems, and record the CVSItems to self.stats_keeper."""
-
- Ctx()._cvs_file_db.log_file(cvs_file_items.cvs_file)
- self._cvs_item_store.add(cvs_file_items)
-
- self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
- for cvs_item in cvs_file_items.values():
- self.stats_keeper.record_cvs_item(cvs_item)
-
- def _get_cvs_file(
- self, parent_directory, basename, file_in_attic, leave_in_attic=False
- ):
- """Return a CVSFile describing the file with name BASENAME.
-
- PARENT_DIRECTORY is the CVSDirectory instance describing the
- directory that physically holds this file in the filesystem.
- BASENAME must be the base name of a *,v file within
- PARENT_DIRECTORY.
-
- FILE_IN_ATTIC is a boolean telling whether the specified file is
- in an Attic subdirectory. If FILE_IN_ATTIC is True, then:
-
- - If LEAVE_IN_ATTIC is True, then leave the 'Attic' component in
- the filename.
-
- - Otherwise, raise FileInAndOutOfAtticException if a file with the
- same filename appears outside of Attic.
-
- The CVSFile is assigned a new unique id. All of the CVSFile
- information is filled in except mode (which can only be determined
- by parsing the file).
-
- Raise FatalError if the resulting filename would not be legal in
- SVN."""
-
- filename = os.path.join(parent_directory.filename, basename)
- try:
- verify_svn_filename_legal(basename[:-2])
- except IllegalSVNPathError, e:
- raise FatalError(
- 'File %r would result in an illegal SVN filename: %s'
- % (filename, e,)
- )
-
- if file_in_attic and not leave_in_attic:
- in_attic = True
- logical_parent_directory = parent_directory.parent_directory
-
- # If this file also exists outside of the attic, it's a fatal
- # error:
- non_attic_filename = os.path.join(
- logical_parent_directory.filename, basename,
- )
- if os.path.exists(non_attic_filename):
- raise FileInAndOutOfAtticException(non_attic_filename, filename)
- else:
- in_attic = False
- logical_parent_directory = parent_directory
-
- file_stat = os.stat(filename)
-
- # The size of the file in bytes:
- file_size = file_stat[stat.ST_SIZE]
-
- # Whether or not the executable bit is set:
- file_executable = bool(file_stat[0] & stat.S_IXUSR)
-
- # mode is not known, so we temporarily set it to None.
- return CVSFile(
- self.file_key_generator.gen_id(),
- parent_directory.project, logical_parent_directory, basename[:-2],
- in_attic, file_executable, file_size, None
- )
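A small aside on the os.stat() indexing used above (not part of the deleted file): index 0 of the stat tuple is st_mode, so masking it with stat.S_IXUSR tests the owner-execute bit, while stat.ST_SIZE picks out the size in bytes. A runnable sketch:

    import os
    import stat

    st = os.stat(__file__)    # any existing path; the original passes the *,v file's path

    file_size = st[stat.ST_SIZE]                    # size in bytes
    file_executable = bool(st[0] & stat.S_IXUSR)    # st[0] is st_mode; owner-execute bit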
-
- def _get_attic_file(self, parent_directory, basename):
- """Return a CVSFile object for the Attic file at BASENAME.
-
- PARENT_DIRECTORY is the CVSDirectory that physically contains the
- file on the filesystem (i.e., the Attic directory). It is not
- necessarily the parent_directory of the CVSFile that will be
- returned.
-
- Return a CVSFile whose parent directory is usually
- PARENT_DIRECTORY.parent_directory, but is PARENT_DIRECTORY itself
- iff the CVSFile will remain in the Attic directory."""
-
- try:
- return self._get_cvs_file(parent_directory, basename, True)
- except FileInAndOutOfAtticException, e:
- if Ctx().retain_conflicting_attic_files:
- Log().warn(
- "%s: %s;\n"
- " storing the latter into 'Attic' subdirectory.\n"
- % (warning_prefix, e)
- )
- else:
- self.record_fatal_error(str(e))
-
- # Either way, return a CVSFile object so that the rest of the
- # file processing can proceed:
- return self._get_cvs_file(
- parent_directory, basename, True, leave_in_attic=True
- )
-
- def _generate_attic_cvs_files(self, cvs_directory):
- """Generate CVSFiles for the files in Attic directory CVS_DIRECTORY.
-
- Also add CVS_DIRECTORY to self if any files are being retained in
- that directory."""
-
- retained_attic_file = False
-
- fnames = os.listdir(cvs_directory.filename)
- fnames.sort()
- for fname in fnames:
- pathname = os.path.join(cvs_directory.filename, fname)
- if os.path.isdir(pathname):
- Log().warn("Directory %s found within Attic; ignoring" % (pathname,))
- elif fname.endswith(',v'):
- cvs_file = self._get_attic_file(cvs_directory, fname)
- if cvs_file.parent_directory == cvs_directory:
- # This file will be retained in the Attic directory.
- retained_attic_file = True
- yield cvs_file
-
- if retained_attic_file:
- # If any files were retained in the Attic directory, then write
- # the Attic directory to CVSFileDatabase:
- self.add_cvs_directory(cvs_directory)
-
- def _get_non_attic_file(self, parent_directory, basename):
- """Return a CVSFile object for the non-Attic file at BASENAME."""
-
- return self._get_cvs_file(parent_directory, basename, False)
-
- def _generate_cvs_files(self, cvs_directory):
- """Generate the CVSFiles under non-Attic directory CVS_DIRECTORY.
-
- Process directories recursively, including Attic directories.
- Also create and register CVSDirectories as they are found, and
- look for conflicts between the filenames that will result from
- files, attic files, and subdirectories."""
-
- self.add_cvs_directory(cvs_directory)
-
- # Map {cvs_file.basename : cvs_file.filename} for files directly
- # in cvs_directory:
- rcsfiles = {}
-
- attic_dir = None
-
- # Non-Attic subdirectories of cvs_directory (to be recursed into):
- dirs = []
-
- fnames = os.listdir(cvs_directory.filename)
- fnames.sort()
- for fname in fnames:
- pathname = os.path.join(cvs_directory.filename, fname)
- if os.path.isdir(pathname):
- if fname == 'Attic':
- attic_dir = fname
- else:
- dirs.append(fname)
- elif fname.endswith(',v'):
- cvs_file = self._get_non_attic_file(cvs_directory, fname)
- rcsfiles[cvs_file.basename] = cvs_file.filename
- yield cvs_file
- else:
- # Silently ignore other files:
- pass
-
- # Map {cvs_file.basename : cvs_file.filename} for files in an
- # Attic directory within cvs_directory:
- attic_rcsfiles = {}
-
- if attic_dir is not None:
- attic_directory = CVSDirectory(
- self.file_key_generator.gen_id(),
- cvs_directory.project, cvs_directory, 'Attic',
- )
-
- for cvs_file in self._generate_attic_cvs_files(attic_directory):
- if cvs_file.parent_directory == cvs_directory:
- attic_rcsfiles[cvs_file.basename] = cvs_file.filename
- yield cvs_file
-
- alldirs = dirs + [attic_dir]
- else:
- alldirs = dirs
-
- # Check for conflicts between directory names and the filenames
- # that will result from the rcs files (both in this directory and
- # in attic). (We recurse into the subdirectories nevertheless, to
- # try to detect more problems.)
- for fname in alldirs:
- pathname = os.path.join(cvs_directory.filename, fname)
- for rcsfile_list in [rcsfiles, attic_rcsfiles]:
- if fname in rcsfile_list:
- self.record_fatal_error(
- 'Directory name conflicts with filename. Please remove or '
- 'rename one\n'
- 'of the following:\n'
- ' "%s"\n'
- ' "%s"'
- % (pathname, rcsfile_list[fname],)
- )
-
- # Now recurse into the other subdirectories:
- for fname in dirs:
- dirname = os.path.join(cvs_directory.filename, fname)
-
- # Verify that the directory name does not contain any illegal
- # characters:
- try:
- verify_svn_filename_legal(fname)
- except IllegalSVNPathError, e:
- raise FatalError(
- 'Directory %r would result in an illegal SVN path name: %s'
- % (dirname, e,)
- )
-
- sub_directory = CVSDirectory(
- self.file_key_generator.gen_id(),
- cvs_directory.project, cvs_directory, fname,
- )
-
- for cvs_file in self._generate_cvs_files(sub_directory):
- yield cvs_file
-
- def process_project(self, project):
- Ctx()._projects[project.id] = project
-
- root_cvs_directory = CVSDirectory(
- self.file_key_generator.gen_id(), project, None, ''
- )
- project.root_cvs_directory_id = root_cvs_directory.id
- pdc = _ProjectDataCollector(self, project)
-
- found_rcs_file = False
- for cvs_file in self._generate_cvs_files(root_cvs_directory):
- pdc.process_file(cvs_file)
- found_rcs_file = True
-
- if not found_rcs_file:
- self.record_fatal_error(
- 'No RCS files found under %r!\n'
- 'Are you absolutely certain you are pointing cvs2svn\n'
- 'at a CVS repository?\n'
- % (project.project_cvs_repos_path,)
- )
-
- pdc.summarize_symbol_transforms()
-
- self.num_files += pdc.num_files
- Log().verbose('Processed', self.num_files, 'files')
-
- def _set_cvs_path_ordinals(self):
- cvs_files = list(Ctx()._cvs_file_db.itervalues())
- cvs_files.sort(CVSPath.slow_compare)
- for (i, cvs_file) in enumerate(cvs_files):
- cvs_file.ordinal = i
-
- def close(self):
- """Close the data structures associated with this instance.
-
- Return a list of fatal errors encountered while processing input.
- Each list entry is a string describing one fatal error."""
-
- self.revision_recorder.finish()
- self.symbol_stats.purge_ghost_symbols()
- self.symbol_stats.close()
- self.symbol_stats = None
- self.metadata_logger = None
- self.metadata_db.close()
- self.metadata_db = None
- self._cvs_item_store.close()
- self._cvs_item_store = None
- self._set_cvs_path_ordinals()
- self.revision_recorder = None
- retval = self.fatal_errors
- self.fatal_errors = None
- return retval
-
-