diff options
author | Brian Harring <ferringb@google.com> | 2012-10-16 05:34:41 -0700 |
---|---|---|
committer | Brian Harring <ferringb@google.com> | 2012-10-16 13:28:49 -0700 |
commit | 9ab662882db48df135679083a2929c53d9cedc05 (patch) | |
tree | 2aa0f5d30d7cda540a099d3cde3d54fa133d1efa | |
parent | tweak the progress output a bit; specifically, percentile increments (diff) | |
download | git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.tar.gz git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.tar.bz2 git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.zip |
Add --full mode; via this, the two phases can work together, which is faster.
-rw-r--r-- | README | 4 | ||||
-rwxr-xr-x | create-git.sh | 58 | ||||
-rwxr-xr-x | process_directory.sh | 6 | ||||
-rwxr-xr-x | rewrite-commit-dump.py | 12 | ||||
-rwxr-xr-x | script.sh | 18 |
5 files changed, 75 insertions, 23 deletions
@@ -43,3 +43,7 @@ Now that that is done, we have a recomposed history in refs/heads/master. From there, we do prun'ing/gc'ing, and force a git repack -Adf. That repo is ready to go at that point. + + +For a full production run, ./script.sh --full # is the fastest form (basically allows the final linearization, +deduplication, etc, to run in parallel as work is available. not a huge gain, but shaves a minute off or so). diff --git a/create-git.sh b/create-git.sh index 667fed0..4fa47e5 100755 --- a/create-git.sh +++ b/create-git.sh @@ -8,32 +8,58 @@ rm -rf git/* git/.git set -f mkdir -p git cd git +git_root="$(pwd)" git init --bare git config core.logAllRefUpdates false git config prune.expire now mkdir -p objects/info -targets=( $(find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \ - xargs -n1 readlink -f | \ - while read l; do - [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || continue; - echo "$l/git/objects" >> objects/info/alternates - echo "$l" - done - ) -) -echo "loading all commits, linearizing, and rewriting history..." -time ( - "${root}/rewrite-commit-dump.py" "${targets[@]}" | \ - tee ../export-stream-rewritten | \ - git fast-import -) 2>&1 | tee git-creation.log +update_alternates() { + local alternates="$(readlink -f objects/info)/alternates" + cd "${root}" + while read l; do + l=$(readlink -f "$l") + [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || { echo "ignoring nonexistant alternates source $l" >&2; continue; } + echo "$l/git/objects" >> "${alternates}" + echo "$l" + done + echo "starting history linearizing/rewriting" >&2 +} + +standalone_mode() { + echo "loading all commits" >&2 + find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \ + xargs -n1 readlink -f | update_alternates +} + +if [ "$1" == --fast ]; then + command=update_alternates +else + command=standalone_mode + echo "loading all commits in parallel to their generation..." 
>&2 +fi + +# Roughly; since alternates are updated as we go- and since rewrite-commit-dump +# doesn't actually output anything till it's linearized the history, we have +# to delay fast-import's startup until we know we have data (meaning linearize +# has finished- thus the alternates are all in place). +# Bit tricky, but the gains have been worth it. +time { + ${command} | \ + "${root}/rewrite-commit-dump.py" | \ + ( read line; { echo "$line"; cat; } | \ + tee ../export-stream-rewritten |\ + git fast-import + ) +} 2>&1 > >(tee git-creation.log) +ret=$? +[ $ret -eq 0 ] || { echo "none zero exit... the hell? $ret"; exit 1; } echo "recomposed; repacking and breaking alternate linkage..." # Localize the content we actual use out of the alternates... time git repack -Adf --window=100 --depth=100 # Wipe the alternates. -rm objects/info/alternates +rm objects/info/alternates || { echo "no alternates means no sources..."; exit 2; } echo "doing cleanup..." time git prune echo "doing basic sanity check" diff --git a/process_directory.sh b/process_directory.sh index a7be6ed..4918628 100755 --- a/process_directory.sh +++ b/process_directory.sh @@ -27,7 +27,7 @@ f() { set +x } -[ $# -ne 1 ] && { echo "need an argument..."; exit 1; } +[ $# -lt 1 ] && { echo "need an argument..."; exit 1; } base="$(pwd)" root="$(pwd)/cvs-repo" @@ -42,3 +42,7 @@ mkdir -p "${output}" echo "processing ${1%,v}" >&2 time f "$1" &> "${output}/"log || { echo "failed $1"; exit 1; } echo "processed $1" >&2 + +# Echo the completed pathway if we're in fast mode; this allows +# create-git.sh to get a head start on this repo once we've finished. 
+[ $# -eq 2 ] && echo "$(readlink -f "$final")" >&$2 diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py index 8cf1f4c..11264f4 100755 --- a/rewrite-commit-dump.py +++ b/rewrite-commit-dump.py @@ -268,13 +268,23 @@ def main(argv): # Be careful here to just iterate over source; doing so allows this script # to do basic processing as it goes (specifically while it's being fed from # the mainline cvs2git parallelized repo creator). - source = argv if argv else sys.stdin + source = argv + if not argv: + # See python manpage for details; stdin buffers if you iterate over it; + # we want each line as they're available, thus use this form. + def source(): + line = sys.stdin.readline() + while line: + yield line + line = sys.stdin.readline() + source = source() for directory in source: directory = directory.strip() tmp = os.path.join(directory, 'cvs2svn-tmp') commits = os.path.join(tmp, 'git-dump.dat') if not os.path.exists(commits): sys.stderr.write("skipping %s; no commit data\n" % directory) + sys.stderr.flush() continue records.extend(manifest_dedup( deserialize_records( @@ -8,9 +8,17 @@ mkdir git -p ./create-mailmap.py userinfo.xml > gentoo_mailmap.py || { echo "failed to create mailmap."; exit 1; } # Prioritize the larger categories first; they typically will have # the most revs, thus start them first. -time { \ - find cvs-repo/gentoo-x86 -maxdepth 1 -mindepth 1 -printf '%P\n' | \ - xargs -n1 -I{} -- du -cs "cvs-repo/gentoo-x86/{}" | grep -v 'total$' | \ - sort -gr | awk '{print $2;}' | xargs -n1 basename | \ - xargs -n1 -P${proc_count} ./process_directory.sh +f() { + time { \ + find cvs-repo/gentoo-x86 -maxdepth 1 -mindepth 1 -printf '%P\n' | \ + xargs -n1 -I{} -- du -cs "cvs-repo/gentoo-x86/{}" | grep -v 'total$' | \ + sort -gr | awk '{print $2;}' | xargs -n1 basename | \ + xargs -n1 -P${proc_count} -I{} ./process_directory.sh "{}" $1 + } } +fast=false +if [ "$1" == --full ]; then + f 300 300> >(time ./create-git.sh --fast) +else + f +fi |