aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrian Harring <ferringb@google.com>2012-10-16 05:34:41 -0700
committerBrian Harring <ferringb@google.com>2012-10-16 13:28:49 -0700
commit9ab662882db48df135679083a2929c53d9cedc05 (patch)
tree2aa0f5d30d7cda540a099d3cde3d54fa133d1efa
parenttweak the progress output a bit; specifically, percentile increments (diff)
downloadgit-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.tar.gz
git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.tar.bz2
git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.zip
add --full mode; with it, the two phases can work together, which is faster.
-rw-r--r--README4
-rwxr-xr-xcreate-git.sh58
-rwxr-xr-xprocess_directory.sh6
-rwxr-xr-xrewrite-commit-dump.py12
-rwxr-xr-xscript.sh18
5 files changed, 75 insertions, 23 deletions
diff --git a/README b/README
index 1fe1420..95bdbc0 100644
--- a/README
+++ b/README
@@ -43,3 +43,7 @@ Now that that is done, we have a recomposed history in refs/heads/master.
From there, we do prun'ing/gc'ing, and force a git repack -Adf.
That repo is ready to go at that point.
+
+
+For a full production run, use ./script.sh --full; it is the fastest form (it basically allows the final linearization,
+deduplication, etc., to run in parallel as work becomes available. Not a huge gain, but it shaves a minute or so off).
diff --git a/create-git.sh b/create-git.sh
index 667fed0..4fa47e5 100755
--- a/create-git.sh
+++ b/create-git.sh
@@ -8,32 +8,58 @@ rm -rf git/* git/.git
set -f
mkdir -p git
cd git
+git_root="$(pwd)"
git init --bare
git config core.logAllRefUpdates false
git config prune.expire now
mkdir -p objects/info
-targets=( $(find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
- xargs -n1 readlink -f | \
- while read l; do
- [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || continue;
- echo "$l/git/objects" >> objects/info/alternates
- echo "$l"
- done
- )
-)
-echo "loading all commits, linearizing, and rewriting history..."
-time (
- "${root}/rewrite-commit-dump.py" "${targets[@]}" | \
- tee ../export-stream-rewritten | \
- git fast-import
-) 2>&1 | tee git-creation.log
+update_alternates() {
+ local alternates="$(readlink -f objects/info)/alternates"
+ cd "${root}"
+ while read l; do
+ l=$(readlink -f "$l")
+ [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || { echo "ignoring nonexistant alternates source $l" >&2; continue; }
+ echo "$l/git/objects" >> "${alternates}"
+ echo "$l"
+ done
+ echo "starting history linearizing/rewriting" >&2
+}
+
+standalone_mode() {
+ echo "loading all commits" >&2
+ find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
+ xargs -n1 readlink -f | update_alternates
+}
+
+if [ "$1" == --fast ]; then
+ command=update_alternates
+else
+ command=standalone_mode
+ echo "loading all commits in parallel to their generation..." >&2
+fi
+
+# Roughly: alternates are updated as we go, and rewrite-commit-dump doesn't
+# actually output anything until it has linearized the history, so we have
+# to delay fast-import's startup until we know we have data (meaning the
+# linearizing has finished, and thus the alternates are all in place).
+# It's a bit tricky, but the gains have been worth it.
+time {
+ ${command} | \
+ "${root}/rewrite-commit-dump.py" | \
+ ( read line; { echo "$line"; cat; } | \
+ tee ../export-stream-rewritten |\
+ git fast-import
+ )
+} 2>&1 > >(tee git-creation.log)
+ret=$?
+[ $ret -eq 0 ] || { echo "none zero exit... the hell? $ret"; exit 1; }
echo "recomposed; repacking and breaking alternate linkage..."
# Localize the content we actually use out of the alternates...
time git repack -Adf --window=100 --depth=100
# Wipe the alternates.
-rm objects/info/alternates
+rm objects/info/alternates || { echo "no alternates means no sources..."; exit 2; }
echo "doing cleanup..."
time git prune
echo "doing basic sanity check"
diff --git a/process_directory.sh b/process_directory.sh
index a7be6ed..4918628 100755
--- a/process_directory.sh
+++ b/process_directory.sh
@@ -27,7 +27,7 @@ f() {
set +x
}
-[ $# -ne 1 ] && { echo "need an argument..."; exit 1; }
+[ $# -lt 1 ] && { echo "need an argument..."; exit 1; }
base="$(pwd)"
root="$(pwd)/cvs-repo"
@@ -42,3 +42,7 @@ mkdir -p "${output}"
echo "processing ${1%,v}" >&2
time f "$1" &> "${output}/"log || { echo "failed $1"; exit 1; }
echo "processed $1" >&2
+
+# Echo the completed pathway if we're in fast mode; this allows
+# create-git.sh to get a head start on this repo once we've finished.
+[ $# -eq 2 ] && echo "$(readlink -f "$final")" >&$2
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 8cf1f4c..11264f4 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -268,13 +268,23 @@ def main(argv):
# Be careful here to just iterate over source; doing so allows this script
# to do basic processing as it goes (specifically while it's being fed from
# the mainline cvs2git parallelized repo creator).
- source = argv if argv else sys.stdin
+ source = argv
+ if not argv:
+ # See the python manpage for details; stdin is buffered if you iterate over it.
+ # We want each line as soon as it is available, thus we use this form.
+ def source():
+ line = sys.stdin.readline()
+ while line:
+ yield line
+ line = sys.stdin.readline()
+ source = source()
for directory in source:
directory = directory.strip()
tmp = os.path.join(directory, 'cvs2svn-tmp')
commits = os.path.join(tmp, 'git-dump.dat')
if not os.path.exists(commits):
sys.stderr.write("skipping %s; no commit data\n" % directory)
+ sys.stderr.flush()
continue
records.extend(manifest_dedup(
deserialize_records(
diff --git a/script.sh b/script.sh
index 7104262..b0c29a6 100755
--- a/script.sh
+++ b/script.sh
@@ -8,9 +8,17 @@ mkdir git -p
./create-mailmap.py userinfo.xml > gentoo_mailmap.py || { echo "failed to create mailmap."; exit 1; }
# Prioritize the larger categories first; they typically will have
# the most revs, thus start them first.
-time { \
- find cvs-repo/gentoo-x86 -maxdepth 1 -mindepth 1 -printf '%P\n' | \
- xargs -n1 -I{} -- du -cs "cvs-repo/gentoo-x86/{}" | grep -v 'total$' | \
- sort -gr | awk '{print $2;}' | xargs -n1 basename | \
- xargs -n1 -P${proc_count} ./process_directory.sh
+f() {
+ time { \
+ find cvs-repo/gentoo-x86 -maxdepth 1 -mindepth 1 -printf '%P\n' | \
+ xargs -n1 -I{} -- du -cs "cvs-repo/gentoo-x86/{}" | grep -v 'total$' | \
+ sort -gr | awk '{print $2;}' | xargs -n1 basename | \
+ xargs -n1 -P${proc_count} -I{} ./process_directory.sh "{}" $1
+ }
}
+fast=false
+if [ "$1" == --full ]; then
+ f 300 300> >(time ./create-git.sh --fast)
+else
+ f
+fi