add --full mode; with it, the two phases can run concurrently, which makes a full run faster.

author: Brian Harring <ferringb@google.com> 2012-10-16 05:34:41 -0700
committer: Brian Harring <ferringb@google.com> 2012-10-16 13:28:49 -0700
commit: 9ab662882db48df135679083a2929c53d9cedc05 (patch)
tree: 2aa0f5d30d7cda540a099d3cde3d54fa133d1efa
parent: tweak the progress output a bit; specifically, percentile increments (diff)
download: git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.tar.gz
git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.tar.bz2
git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.zip
5 files changed, 75 insertions, 23 deletions
diff --git a/README b/README
index 1fe1420..95bdbc0 100644
--- a/README
+++ b/README
@@ -43,3 +43,7 @@ Now that that is done, we have a recomposed history in refs/heads/master.
 From there, we do prun'ing/gc'ing, and force a git repack -Adf.
 
 That repo is ready to go at that point.
+
+
+For a full production run, ./script.sh --full is the fastest form (it allows the final linearization,
+deduplication, etc. to run in parallel as work becomes available; not a huge gain, but it shaves off a minute or so).
diff --git a/create-git.sh b/create-git.sh
index 667fed0..4fa47e5 100755
--- a/create-git.sh
+++ b/create-git.sh
@@ -8,32 +8,58 @@ rm -rf git/* git/.git
 set -f
 mkdir -p git
 cd git
+git_root="$(pwd)"
 git init --bare
 git config core.logAllRefUpdates false
 git config prune.expire now
 mkdir -p objects/info
-targets=( $(find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
-  xargs -n1 readlink -f | \
-    while read l; do
-      [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || continue;
-      echo "$l/git/objects" >> objects/info/alternates
-      echo "$l"
-    done
-  )
-)
 
-echo "loading all commits, linearizing, and rewriting history..."
-time (
-  "${root}/rewrite-commit-dump.py" "${targets[@]}" | \
-    tee ../export-stream-rewritten | \
-    git fast-import
-) 2>&1 | tee git-creation.log
+update_alternates() {
+  local alternates="$(readlink -f objects/info)/alternates"
+  cd "${root}"
+  while read l; do
+    l=$(readlink -f "$l")
+    [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || { echo "ignoring nonexistant alternates source $l" >&2; continue; }
+    echo "$l/git/objects" >> "${alternates}"
+    echo "$l"
+  done
+  echo "starting history linearizing/rewriting" >&2
+}
+
+standalone_mode() {
+  echo "loading all commits" >&2
+  find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
+    xargs -n1 readlink -f | update_alternates
+}
+
+if [ "$1" == --fast ]; then
+  command=update_alternates
+else
+  command=standalone_mode
+  echo "loading all commits in parallel to their generation..." >&2
+fi
+
+# Roughly: alternates are updated as we go, and rewrite-commit-dump doesn't
+# actually output anything until it has linearized the history, so we have
+# to delay fast-import's startup until we know we have data (meaning the
+# linearization has finished, and thus the alternates are all in place).
+# It's a bit tricky, but the gains have been worth it.
+time {
+  ${command} | \
+  "${root}/rewrite-commit-dump.py" | \
+  ( read line; { echo "$line"; cat; } | \
+      tee ../export-stream-rewritten |\
+      git fast-import
+  )
+} 2>&1 > >(tee git-creation.log)
+ret=$?
+[ $ret -eq 0 ] || { echo "none zero exit... the hell? $ret"; exit 1; }
 
 echo "recomposed; repacking and breaking alternate linkage..."
 # Localize the content we actual use out of the alternates...
 time git repack -Adf --window=100 --depth=100
 # Wipe the alternates.
-rm objects/info/alternates
+rm objects/info/alternates || { echo "no alternates means no sources..."; exit 2; }
 echo "doing cleanup..."
 time git prune
 echo "doing basic sanity check"
diff --git a/process_directory.sh b/process_directory.sh
index a7be6ed..4918628 100755
--- a/process_directory.sh
+++ b/process_directory.sh
@@ -27,7 +27,7 @@ f() {
   set +x
 }
 
-[ $# -ne 1 ] && { echo "need an argument..."; exit 1; }
+[ $# -lt 1 ] && { echo "need an argument..."; exit 1; }
 
 base="$(pwd)"
 root="$(pwd)/cvs-repo"
@@ -42,3 +42,7 @@ mkdir -p "${output}"
 echo "processing ${1%,v}" >&2
 time f "$1" &> "${output}/"log || { echo "failed $1"; exit 1; }
 echo "processed  $1" >&2
+
+# Echo the completed path if we're in fast mode; this allows create-git.sh
+# to get a head start on this repo once we've finished.
+[ $# -eq 2 ] && echo "$(readlink -f "$final")" >&$2
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 8cf1f4c..11264f4 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -268,13 +268,23 @@ def main(argv):
   # Be careful here to just iterate over source; doing so allows this script
   # to do basic processing as it goes (specifically while it's being fed from
   # the mainline cvs2git parallelized repo creator).
-  source = argv if argv else sys.stdin
+  source = argv
+  if not argv:
+    # See the python manpage for details: stdin is buffered if you iterate over
+    # it; we want each line as soon as it's available, thus use this form.
+    def source():
+      line = sys.stdin.readline()
+      while line:
+        yield line
+        line = sys.stdin.readline()
+    source = source()
   for directory in source:
     directory = directory.strip()
     tmp = os.path.join(directory, 'cvs2svn-tmp')
     commits = os.path.join(tmp, 'git-dump.dat')
     if not os.path.exists(commits):
       sys.stderr.write("skipping %s; no commit data\n" % directory)
+      sys.stderr.flush()
       continue
     records.extend(manifest_dedup(
       deserialize_records(
diff --git a/script.sh b/script.sh
index 7104262..b0c29a6 100755
--- a/script.sh
+++ b/script.sh
@@ -8,9 +8,17 @@ mkdir git -p
 ./create-mailmap.py userinfo.xml > gentoo_mailmap.py || { echo "failed to create mailmap."; exit 1; }
 # Prioritize the larger categories first; they typically will have
 # the most revs, thus start them first.
-time { \
-  find cvs-repo/gentoo-x86 -maxdepth 1 -mindepth 1 -printf '%P\n' | \
-  xargs -n1 -I{} --  du -cs "cvs-repo/gentoo-x86/{}" | grep -v 'total$' | \
-  sort -gr | awk '{print $2;}' | xargs -n1 basename | \
-  xargs -n1 -P${proc_count} ./process_directory.sh
+f() {
+  time { \
+    find cvs-repo/gentoo-x86 -maxdepth 1 -mindepth 1 -printf '%P\n' | \
+    xargs -n1 -I{} --  du -cs "cvs-repo/gentoo-x86/{}" | grep -v 'total$' | \
+    sort -gr | awk '{print $2;}' | xargs -n1 basename | \
+    xargs -n1 -P${proc_count} -I{} ./process_directory.sh "{}" $1
+  }
 }
+fast=false
+if [ "$1" == --full ]; then
+  f 300 300> >(time ./create-git.sh --fast)
+else
+  f
+fi
author	Brian Harring <ferringb@google.com>	2012-10-16 05:34:41 -0700
committer	Brian Harring <ferringb@google.com>	2012-10-16 13:28:49 -0700
commit	9ab662882db48df135679083a2929c53d9cedc05 (patch)
tree	2aa0f5d30d7cda540a099d3cde3d54fa133d1efa
parent	tweak the progress output a bit; specifically, percentile increments (diff)
download	git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.tar.gz git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.tar.bz2 git-conversion-tools-9ab662882db48df135679083a2929c53d9cedc05.zip