diff options
-rw-r--r-- | README | 43 | ||||
-rw-r--r-- | config | 4 | ||||
-rwxr-xr-x | create-git.sh | 45 | ||||
-rwxr-xr-x | process_directory.sh | 20 | ||||
-rwxr-xr-x | rewrite-blob-data.py | 17 | ||||
-rwxr-xr-x | rewrite-commit-dump.py | 39 | ||||
-rwxr-xr-x | script.sh | 13 |
7 files changed, 162 insertions, 19 deletions
@@ -0,0 +1,43 @@ +Note: this is raw, and likes to dump things in cwd - namely, logs. + +To run it, first get yourself a copy of gentoo-x86 CVS; place that in +cvs-repo in this directory- this can be a partial copy of CVS, or full- +that said, it needs to conform to this layout: + +$(pwd)/cvs-repo/CVSROOT +$(pwd)/cvs-repo/gentoo-x86/* + +From there, ./script.sh is your main point of entry; it'll process that, +going parallel, using $(pwd)/output for temp space- it's suggested that +be tmpfs (much like cvs-repo). + +As each category/directory/component is finished, a git repo is generated, +some basic blob rewrites are done ($Header related). Two core directories +will exist in each: cvs2svn-tmp (which holds the fast-import data w/in), and +git, a recomposed bare git repository of that slice of gentoo-x86 history. + +Once that category/component is finished, it's moved into $(pwd)/final, and +another component is started; script.sh currently will run at `grep -c MHz /proc/cpuinfo` parallelism. + +Upon finishing the cvs->git conversion, the content needs to be reintegrated. + +create-git.sh exists for this. It looks in $(pwd)/final, and creates the new +repo in $(pwd)/git; this is a bare repo. + +Roughly, it does this via generating an empty repo, setting up alternates into each slice of +history, setting up refs/heads/source/* space for each slice of history, +then forcing a date-ordered fast-export- manipulating the resultant stream +(stripping resets, rewriting the commit field to point to refs/heads/master, rewriting +commit messages to convert some basic structured information into git footers), and +spitting that out. + +It creates two dumps of intermediate data as it's going: export-stream-raw and +export-stream-rewritten; the first is git fast-export raw output, the second is +the rewritten stream. 
Each is ~490MB (they're small due to the fact that +since we're exporting/importing w/in the same repo, we don't have to send blobs +through the stream- they can be directly referenced in the command stream). + +Now that that is done, we have a recomposed history in refs/heads/master. +From there, we do pruning/gc'ing, and force a git repack -Adf. + +That repo is ready to go at that point. @@ -171,7 +171,7 @@ ctx.sort_executable = r'sort' # Change the following line to True if the conversion should only # include the trunk of the repository (i.e., all branches and tags # should be omitted from the conversion): -ctx.trunk_only = False +ctx.trunk_only = True # How to convert CVS author names, log messages, and filenames to # Unicode. The first argument to CVSTextDecoder is a list of encoders @@ -539,7 +539,7 @@ run_options.set_project( # The filesystem path to the part of the CVS repository (*not* a # CVS working copy) that should be converted. This may be a # subdirectory (i.e., a module) within a larger CVS repository. - r'cvs-repo', + r'cvs-repo/gentoo-x86', # A list of symbol transformations that can be used to rename # symbols in this project. 
diff --git a/create-git.sh b/create-git.sh new file mode 100755 index 0000000..6389024 --- /dev/null +++ b/create-git.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +proc_count=$(grep -c MHz /proc/cpuinfo) +[ ${proc_count} -eq 0 ] && proc_count=1 +root="$(pwd)" +mkdir -p git +rm -rf git/* git/.git +set -f +mkdir -p git +cd git +git init --bare +git config core.logAllRefUpdates false +git config prune.expire now +mkdir -p objects/info +targets=( $(find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \ + xargs -n1 readlink -f | tee >(sed -e 's:$:/git/objects:' > objects/info/alternates) ) ) +for x in "${targets[@]}"; do + rev=$(git --git-dir $x/git rev-list -1 master 2> /dev/null) + [ -z "$rev" ] && { echo "no content: $x"; continue; } + x="refs/heads/source/$(basename $x)" + git update-ref "$x" $rev +done + +echo "linearizing history, and rewriting messages..." + +time ( + git fast-export --progress=1000 --all --reverse --date-order --no-data | \ + tee ../export-stream-raw | \ + "${root}/rewrite-commit-dump.py" | \ + tee ../export-stream-rewritten | \ + git fast-import +) 2>&1 | tee git-creation.log + +echo "recomposed; repacking and breaking alternate linkage..." +# Wipe the strong refs to the other repos... +git ls-remote . refs/heads/source/'*' | awk '{print $2;}' | xargs -n1 git update-ref -d +# Localize the content... +time git repack -Adf --window=100 --depth=100 +# Wipe the alternates. +rm objects/info/alternates +echo "doing cleanup..." +time git prune +echo "doing basic sanity check" +time git log -p refs/heads/master > /dev/null || echo "non zero exit code from git log run..." 
+echo "Done" diff --git a/process_directory.sh b/process_directory.sh index 54c51cf..c9ff6e6 100755 --- a/process_directory.sh +++ b/process_directory.sh @@ -1,25 +1,31 @@ #!/bin/bash +command=' + sed -re "s/^\(paludis (0.1.*)\)$/Package-manager: Paludis \1/" \ + -e "s/^\([Pp]ortage version: (.*)\)$/Package-manager: Portage \1/"' f() { set -x - mkdir -p "${output}"/{git,cvs-repo/gentoo-x86/Attic} + mkdir -p "${output}"/{git{,-work},cvs-repo/gentoo-x86/Attic} ln -s "${cvsroot}" "${output}/cvs-repo/CVSROOT" ln -s "${root}/gentoo-x86/$1" "${output}/cvs-repo/gentoo-x86/$1" #ln -s "${root}/gentoo-x86/Attic" "${output}/cvs-repo/gentoo-x86/Attic" ln -s "$(pwd)/config" "${output}/config" - cd "${output}" + # Note- this must be canonical path, else it screws up our $Header rewriting. + cd "$(readlink -f "${output}" )" time cvs2git --options config -vv cd git git init --bare - cat ../cvs2svn-tmp/git-{blob,dump}.dat | git fast-import - rm -rf "${final}" + { "${base}/rewrite-blob-data.py" ../cvs2svn-tmp/git-blob.dat; + cat ../cvs2svn-tmp/git-dump.dat; + } | git fast-import + rm -rf "${final}" git-work cd "$root" mv "$output" "${final}" - git --git-dir "${final}/git" log --pretty=tformat:"%at %H" > "${final}/git-hashes" set +x } [ $# -ne 1 ] && { echo "need an argument..."; exit 1; } +base="$(pwd)" root="$(pwd)/cvs-repo" cvsroot="${root}/CVSROOT" repo="${root}/gentoo-x86" @@ -29,6 +35,6 @@ mkdir -p "$(dirname "${final}")" rm -rf "${output}" mkdir -p "${output}" -echo "processing ${1%,v} ${1}" +echo "processing ${1%,v}" >&2 time f "$1" &> "${output}/"log || { echo "failed $1"; exit 1; } -echo "processed $1" +echo "processed $1" >&2 diff --git a/rewrite-blob-data.py b/rewrite-blob-data.py new file mode 100755 index 0000000..55115a7 --- /dev/null +++ b/rewrite-blob-data.py @@ -0,0 +1,17 @@ +#!/usr/bin/python +import functools +import os +import re +import sys + +# $Header: /usr/local/ssd/gentoo-x86/output/.*/.*/cvs-repo/ +# $Header: 
/usr/local/ssd/gentoo-x86/output/app-accessibility/cvs-repo/gentoo-x86/app-accessibility/SphinxTrain/ChangeLog,v +base = os.path.dirname(os.path.abspath(__file__)) +mangler = functools.partial( + re.compile(r"\$Header: %s/output/.*/cvs-repo/" % base).sub, + r"$Header: /var/cvsroot/") + +write = sys.stdout.write +source = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin +for x in source: + write(mangler(x)) diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py new file mode 100755 index 0000000..4784cb5 --- /dev/null +++ b/rewrite-commit-dump.py @@ -0,0 +1,39 @@ +#!/usr/bin/python +import functools +import re +import sys + +mangler = [] +mangler.append(functools.partial( + re.compile(r"^\(paludis (0.1.*)\)$", re.M|re.I).sub, + r"Package-Manager: paludis-\1/")) +mangler.append(functools.partial( + re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub, + r"Package-Manager: portage-\1")) + +write = sys.stdout.write +source = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin +write('reset refs/heads/master\n') +while True: + x = source.readline() + if not x: + break + chunked = x.split() + if not chunked: + write(x) + continue + elif chunked[0] in ('reset', 'from'): + continue + elif chunked[0] == 'commit': + write('commit refs/heads/master\n') + continue + elif chunked[0] != 'data': + write(x) + continue + assert len(chunked) == 2 + size = int(chunked[1]) + data = source.read(size) + assert len(data) == size + for func in mangler: + data = func(data) + write("data %i\n%s" % (len(data), data)) @@ -3,20 +3,13 @@ proc_count=$(grep -c MHz /proc/cpuinfo) [ $proc_count -eq 0 ] && proc_count=1 -rm -rf git -mkdir git +rm -rf git/* git/.git final/* +mkdir git -p # Prioritize the larger categories first; they typically will have # the most revs, thus start them first. 
time { \ find cvs-repo/gentoo-x86 -maxdepth 1 -mindepth 1 -printf '%P\n' | \ xargs -n1 -I{} -- du -cs "cvs-repo/gentoo-x86/{}" | grep -v 'total$' | \ sort -gr | awk '{print $2;}' | xargs -n1 basename | \ - xargs -n1 -P${proc_count} ./process_directory.sh | \ - { - cd git; - git init &> /dev/null - while read l; do - git fetch "$(readlink -f "../final/$l/git")" && git merge FETCH_HEAD -m "blah" -q - done - } + xargs -n1 -P${proc_count} ./process_directory.sh } |