Diffstat (limited to 'cvs2svn_rcsparse/texttools.py')
-rw-r--r--  cvs2svn_rcsparse/texttools.py | 348
1 file changed, 0 insertions(+), 348 deletions(-)
diff --git a/cvs2svn_rcsparse/texttools.py b/cvs2svn_rcsparse/texttools.py
deleted file mode 100644
index 7c713eb..0000000
--- a/cvs2svn_rcsparse/texttools.py
+++ /dev/null
@@ -1,348 +0,0 @@
-# -*-python-*-
-#
-# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved.
-#
-# By using this file, you agree to the terms and conditions set forth in
-# the LICENSE.html file which can be found at the top level of the ViewVC
-# distribution or at http://viewvc.org/license-1.html.
-#
-# For more information, visit http://viewvc.org/
-#
-# -----------------------------------------------------------------------
-
-import string
-
-# note: this will raise an ImportError if it isn't available. the rcsparse
-# package will recognize this and switch over to the default parser.
-from mx import TextTools
-
-import common
-
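-# note: an illustrative sketch (not part of this file) of the guarded
-# import that selects this parser; the exact module names used in the
-# package __init__ are assumptions here:
-#
-#   try:
-#     from texttools import Parser   # fast path, needs mx.TextTools
-#   except ImportError:
-#     from default import Parser     # pure-Python fallback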
-
-# for convenience
-_tt = TextTools
-
-_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
-_idchar_list.remove('$')
-_idchar_list.remove(',')
-#_idchar_list.remove('.') # leave as part of 'num' symbol
-_idchar_list.remove(':')
-_idchar_list.remove(';')
-_idchar_list.remove('@')
-_idchar = string.join(_idchar_list, '')
-_idchar_set = _tt.set(_idchar)
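-# for intuition, an equivalent pure-Python membership test (a sketch only;
-# the parser uses the precomputed mx.TextTools set above, and '.' stays
-# legal so revision numbers parse as single tokens):
-#
-#   def _is_idchar(ch):
-#     return ((33 <= ord(ch) < 127 or ord(ch) >= 160)
-#             and ch not in '$,:;@')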
-
-_onechar_token_set = _tt.set(':;')
-
-_not_at_set = _tt.invset('@')
-
-_T_TOKEN = 30
-_T_STRING_START = 40
-_T_STRING_SPAN = 60
-_T_STRING_END = 70
-
-_E_COMPLETE = 100 # ended on a complete token
-_E_TOKEN = 110 # ended mid-token
-_E_STRING_SPAN = 130 # ended within a string
-_E_STRING_END = 140 # ended with string-end ('@') (could be mid-@@)
-
-_SUCCESS = +100
-
-_EOF = 'EOF'
-_CONTINUE = 'CONTINUE'
-_UNUSED = 'UNUSED'
-
-
-# continuation of a token over a chunk boundary
-_c_token_table = (
-  (_T_TOKEN, _tt.AllInSet, _idchar_set),
-  )
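-# a worked sketch of the continuation table (values inferred from the
-# mx.TextTools tagging semantics, not captured from a run):
-#
-#   success, taglist, idx = _tt.tag('ead ;', _c_token_table)
-#   # success == 1, idx == 3: the leading idchar run 'ead' is tagged as
-#   # _T_TOKEN; a buffer starting with ' ' would fail with idx == 0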
-
-class _mxTokenStream:
-
-  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
-  # grab a good-sized chunk, but not too large to overwhelm memory.
-  # note: we use a multiple of a standard block size
-  CHUNK_SIZE = 192 * 512 # about 100k
-
-# CHUNK_SIZE = 5 # for debugging, make the function grind...
-
-  def __init__(self, file):
-    self.rcsfile = file
-    self.tokens = [ ]
-    self.partial = None
-
-    self.string_end = None
-
-  def _parse_chunk(self, buf, start=0):
-    "Parse one chunk of the RCS file, adding the tokens found to self.tokens."
-
-    buflen = len(buf)
-
-    assert start < buflen
-
-    # construct a tag table which refers to the buffer we need to parse.
-    table = (
-      #1: ignore whitespace. with or without whitespace, move to the next rule.
-      (None, _tt.AllInSet, _tt.whitespace_set, +1),
-
-      #2
-      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),
-
-      #3: accumulate token text and exit, or move to the next rule.
-      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),
-
-      #4
-      (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),
-
-      #5: single character tokens exit immediately, or move to the next rule
-      (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),
-
-      #6
-      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),
-
-      #7: if this isn't an '@' symbol, then we have a syntax error (go to a
-      #    negative index to indicate that condition). otherwise, suck it up
-      #    and move to the next rule.
-      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),
-
-      #8
-      (None, _tt.Is, '@', +4, +1),
-      #9
-      (buf, _tt.Is, '@', +1, -1),
-      #10
-      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
-      #11
-      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),
-
-      #12
-      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),
-
-      #13: suck up everything that isn't an AT. go to next rule to look for EOF
-      (buf, _tt.AllInSet, _not_at_set, 0, +1),
-
-      #14: go back to look for double AT if we aren't at the end of the string
-      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
-      )
-
-    # Fast, texttools may be, but it's somewhat lacking in clarity.
-    # Here's an attempt to document the logic encoded in the table above:
-    #
-    # Flowchart:
-    #                              _____
-    #                             /     \
-    #  1 -> 2 -> 3 -> 5 -> 7 -> 8 -> 9 -> 10 -> 11
-    #  ^         |    |         |  ^             |
-    #  |         4    6         v   \            |
-    #  |         |    |         12   14          |
-    #  |         |    |          \   /           |
-    #  |         |    |           13             |
-    #  \_________/____/                          |
-    #   \________________________________________/
-    #
-    # #1: Skip over any whitespace.
-    # #2: If now EOF, exit with code _E_COMPLETE.
-    # #3: If we have a series of characters in _idchar_set, then:
-    # #4:   Output them as a token, and go back to #1.
-    # #5: If we have a character in _onechar_token_set, then:
-    # #6:   Output it as a token, and go back to #1.
-    # #7: If we do not have an '@', then error.
-    #     If we do, then log a _T_STRING_START and continue.
-    # #8: If we have another '@', continue on to #9. Otherwise:
-    # #12:  If now EOF, exit with code _E_STRING_SPAN.
-    # #13:  Record the slice up to the next '@' (or EOF).
-    # #14:  If now EOF, exit with code _E_STRING_SPAN.
-    #       Otherwise, go back to #8.
-    # #9: If we have another '@', then we've just seen an escaped
-    #     (by doubling) '@' within an @-string. Record a slice including
-    #     just one '@' character, and jump back to #8.
-    #     Otherwise, we've *either* seen the terminating '@' of an @-string,
-    #     *or* we've seen one half of an escaped @@ sequence that just
-    #     happened to be split over a chunk boundary - in either case,
-    #     we continue on to #10.
-    # #10: Log a _T_STRING_END.
-    # #11: If now EOF, exit with _E_STRING_END. Otherwise, go back to #1.
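-    # worked sketch (illustrative, not captured output): for the chunk
-    # 'head 1.1; @line one@@two@ ;' the table produces the tokens 'head',
-    # '1.1', ';', the string 'line one@two' (the doubled '@@' collapses to
-    # a single '@' via rule #9), and ';', then ends with _E_COMPLETE.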
-
-    success, taglist, idx = _tt.tag(buf, table, start)
-
-    if not success:
-      ### need a better way to report this error
-      raise common.RCSIllegalCharacter()
-    assert idx == buflen
-
-    # pop off the last item
-    last_which = taglist.pop()
-
-    i = 0
-    tlen = len(taglist)
-    while i < tlen:
-      if taglist[i] == _T_STRING_START:
-        j = i + 1
-        while j < tlen:
-          if taglist[j] == _T_STRING_END:
-            s = _tt.join(taglist, '', i+1, j)
-            del taglist[i:j]
-            tlen = len(taglist)
-            taglist[i] = s
-            break
-          j = j + 1
-        else:
-          assert last_which == _E_STRING_SPAN
-          s = _tt.join(taglist, '', i+1)
-          del taglist[i:]
-          self.partial = (_T_STRING_SPAN, [ s ])
-          break
-      i = i + 1
-
-    # figure out whether we have a partial last-token
-    if last_which == _E_TOKEN:
-      self.partial = (_T_TOKEN, [ taglist.pop() ])
-    elif last_which == _E_COMPLETE:
-      pass
-    elif last_which == _E_STRING_SPAN:
-      assert self.partial
-    else:
-      assert last_which == _E_STRING_END
-      self.partial = (_T_STRING_END, [ taglist.pop() ])
-
-    taglist.reverse()
-    taglist.extend(self.tokens)
-    self.tokens = taglist
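-    # (the token list is kept reversed so that get() below can pop() the
-    # next token off the end in constant time)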
-
-  def _set_end(self, taglist, text, l, r, subtags):
-    self.string_end = l
-
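-  # a worked sketch of the partial-token path below (illustrative only):
-  # if the previous chunk ended with '...hea', self.partial holds
-  # (_T_TOKEN, ['hea']).  given a new chunk 'd 1.1;', _c_token_table
-  # matches the leading 'd' (idx == 1), so 'head' is emitted and
-  # _parse_more() resumes _parse_chunk() at offset 1.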
-  def _handle_partial(self, buf):
-    which, chunks = self.partial
-    if which == _T_TOKEN:
-      success, taglist, idx = _tt.tag(buf, _c_token_table)
-      if not success:
-        # The start of this buffer was not a token. So the end of the
-        # prior buffer was a complete token.
-        self.tokens.insert(0, string.join(chunks, ''))
-      else:
-        assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
-               and taglist[0][1] == 0 and taglist[0][2] == idx
-        if idx == len(buf):
-          #
-          # The whole buffer was one huge token, so we may have a
-          # partial token again.
-          #
-          # Note: this modifies the list of chunks in self.partial
-          #
-          chunks.append(buf)
-
-          # consumed the whole buffer
-          return len(buf)
-
-        # got the rest of the token.
-        chunks.append(buf[:idx])
-        self.tokens.insert(0, string.join(chunks, ''))
-
-      # no more partial token
-      self.partial = None
-
-      return idx
-
-    if which == _T_STRING_END:
-      if buf[0] != '@':
-        self.tokens.insert(0, string.join(chunks, ''))
-        return 0
-      chunks.append('@')
-      start = 1
-    else:
-      start = 0
-
-    self.string_end = None
-    string_table = (
-      (None, _tt.Is, '@', +3, +1),
-      (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
-      (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),
-
-      (None, _tt.EOF, _tt.Here, +1, _SUCCESS),
-
-      # suck up everything that isn't an AT. move to next rule to look
-      # for EOF
-      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),
-
-      # go back to look for double AT if we aren't at the end of the string
-      (None, _tt.EOF, _tt.Here, -5, _SUCCESS),
-      )
-
-    success, unused, idx = _tt.tag(buf, string_table,
-                                   start, len(buf), chunks)
-
-    # must have matched at least one item
-    assert success
-
-    if self.string_end is None:
-      assert idx == len(buf)
-      self.partial = (_T_STRING_SPAN, chunks)
-    elif self.string_end < len(buf):
-      self.partial = None
-      self.tokens.insert(0, string.join(chunks, ''))
-    else:
-      self.partial = (_T_STRING_END, chunks)
-
-    return idx
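-  # sketch of the _T_STRING_END ambiguity handled above: if one chunk ends
-  # '...@some text@' and the next begins with '@', the pair was an escaped
-  # '@@' and the string continues; if the next chunk begins with anything
-  # else, the string really did end at the chunk boundary.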
-
-  def _parse_more(self):
-    buf = self.rcsfile.read(self.CHUNK_SIZE)
-    if not buf:
-      return _EOF
-
-    if self.partial:
-      idx = self._handle_partial(buf)
-      if idx is None:
-        return _CONTINUE
-      if idx < len(buf):
-        self._parse_chunk(buf, idx)
-    else:
-      self._parse_chunk(buf)
-
-    return _CONTINUE
-
-  def get(self):
-    try:
-      return self.tokens.pop()
-    except IndexError:
-      pass
-
-    while not self.tokens:
-      action = self._parse_more()
-      if action == _EOF:
-        return None
-
-    return self.tokens.pop()
-
-
-#  _get = get
-#  def get(self):
-#    token = self._get()
-#    print 'T:', `token`
-#    return token
-
-  def match(self, match):
-    if self.tokens:
-      token = self.tokens.pop()
-    else:
-      token = self.get()
-
-    if token != match:
-      raise common.RCSExpected(token, match)
-
-  def unget(self, token):
-    self.tokens.append(token)
-
-  def mget(self, count):
-    "Return multiple tokens. 'next' is at the end."
-    while len(self.tokens) < count:
-      action = self._parse_more()
-      if action == _EOF:
-        ### fix this
-        raise RuntimeError, 'EOF hit while expecting tokens'
-    result = self.tokens[-count:]
-    del self.tokens[-count:]
-    return result
-
-
-class Parser(common._Parser):
-  stream_class = _mxTokenStream
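-# a minimal usage sketch (not part of the original file; 'example,v' is a
-# hypothetical RCS filename).  common._Parser drives this same
-# get()/match()/unget()/mget() interface when the Parser class above is
-# used on a real RCS file:
-#
-#   ts = _mxTokenStream(open('example,v', 'rb'))
-#   ts.match('head')      # expect the literal 'head' keyword
-#   head_rev = ts.get()   # e.g. '1.5'
-#   ts.match(';')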