"""Convert the parse tree created by sre_parse.parse back into a string Python's "re" regular expression library uses the 'sre' modules developed by Secret Labs AB. (Tack sa mycket, effbot!) This is an internal module, which means it isn't documented and you shouldn't use it. Despite that, it's a very useful module if you are like me and develop alternate ways to parse using regular expressions. With a bit of work you can make your own special-purpose C code given a regular expression, or make a SAX event generating parser generator like Martel (see http://www.dalkescientific.com/Martel/ ). Less esoterically, you can do like Jeff Petkau did and find matching strings given the pattern; see http://www.uselesspython.com/jpetkau1.py When writing these tools, it's very helpful to know where you are in the tree. sre_dump lets you dump the tree back into a regular expression, as in: >>> import sre_dump, sre_parse >>> tree = sre_parse.parse("AB|CD") >>> tree [('branch', (None, [[('literal', 65), ('literal', 66)], [('literal', 67), ('literal', 68)]]))] >>> sre_dump.dump(tree) 'AB|CD' >>> sre_dump.dump(tree[0][1][1][0]) 'AB' >>> For more in-depth debugging of regular expression generated state diagrams, you will want to know where the given node came from. Viewing just the text of the subpattern doesn't help because it can exist several times in the pattern. The 'dump_and_offsets' function also returns a list of locations for each subexpression. The list is a 3-tuple of (expression, start position, end position). >>> s, offsets = sre_dump.dump_and_offsets(tree) >>> def show_offsets(s, offsets): ... print s ... for expr, i, j, text in offsets: ... print " "*i + "-"*(j-i) + " " *(len(s)-j+1), s[i:j] ... >>> show_offsets(s, offsets) AB|CD - A - B - C - D ----- AB|CD >>> You can get the same effect with the pprint function. >>> print sre_dump.pprint("(AB*|C?D){2,3}") (AB*|C?D){2,3} - A - B -- B* - C -- C? - D ------- AB*|C?D --------- (AB*|C?D) -------------- (AB*|C?D){2,3} >>> It is almost complete. What's missing are a few edge cases, like rare character escapes (especially in character ranges). Please send me any patches along with test cases. This was developed using Python 2.3. sre_parse hasn't changed much between releases so this should be pretty portable. Written in 2003 by Andrew Dalke Released into the public domain. No copyright protections asserted. Enjoy! Changelog: 7 Aug 2003 - Initial release. """ import sre_parse from sre_constants import * import string __all__ = ["dump", "dump_and_offsets", "pprint", "pformat"] _categories = { CATEGORY_DIGIT: "\\d", CATEGORY_NOT_DIGIT: "\\D", CATEGORY_WORD: "\\w", CATEGORY_NOT_WORD: "\\W", CATEGORY_SPACE: "\\s", CATEGORY_NOT_SPACE: "\\S", # are there more? } _at_symbols = { AT_BEGINNING: "^", AT_END: "$", AT_BEGINNING_STRING: r"\A", AT_END_STRING: r"\Z", AT_BOUNDARY: r"\b", AT_NON_BOUNDARY: r"\B", } # [AB\]C] to match any of A, B, ] and C # [AB\-D] matches any one of A, B, -, and D def _escape_in_char(i): c = chr(i) if c in ']-': return '\\' + c return c # [--A] means from "-" to "A" def _escape_in_range(i): c = chr(i) if c in ']': return '\\' + c return c def _dump_in(av): # XXX is this proper escaping? words = [] emit = words.append first_flg = 1 for x in av: if x[0] == NEGATE: if first_flg == 1: emit("^") else: raise AssertionError("negate not first?") elif x[0] == LITERAL: if first_flg and x[1] == ord("^"): emit("\\^") else: emit(_escape_in_char(x[1])) elif x[0] == RANGE: emit(_escape_in_range(x[1][0]) + "-" + _escape_in_range(x[1][1])) elif x[0] == CATEGORY: emit(_categories[x[1]]) else: raise NotImplementedError(x[0]) first_flg = 0 return "[" + "".join(words) + "]" # I could use sre.escape but I don't like that it # escapes "<", ">", and some other characters _escape_table = {"\000": "\\000"} # special-case 0 for i in range(256): _escape_table[i] = "\\" + chr(i) for c in string.ascii_letters + string.digits + "<>: /": _escape_table[ord(c)] = c del c, i def _dump(pattern, groupdict): pos = [0] offsets = [] words = [] def emit(s): words.append(s) pos[0] = pos[0] + len(s) def include(suboffsets): x = pos[0] for e, i, j in suboffsets: offsets.append( (e, i+x, j+x) ) start = pos[0] for term in pattern: op, av = term if op == LITERAL: emit(_escape_table[av]) elif op == NOT_LITERAL: emit("[^%s]" % chr(av)) elif op == IN: emit(_dump_in(av)) elif op in (MAX_REPEAT, MIN_REPEAT): i, j, subtree = av s, suboffsets = _dump(subtree, groupdict) include(suboffsets) emit(s) if j == MAXREPEAT: if i == 0: emit("*") elif i == 1: emit("+") else: emit("{%d,}" % i) else: if i == 0 and j == 1: emit("?") elif i == j: emit("{%d}" % i) elif i == 0: emit("{,%d}" % j) else: emit("{%d,%d}" % (i, j)) if op == MIN_REPEAT: emit("?") elif op == SUBPATTERN: groupnum, subtree = av s, suboffsets = _dump(subtree, groupdict) if groupnum in groupdict: name = groupdict[groupnum] emit("(?P<%s>" % (name,)) #") emacs cruft elif groupnum is None: emit("(?:") #") emacs cruft else: emit("(") include(suboffsets) emit("%s)" % s) elif op == ASSERT: dir, subtree = av dir = {1:"", -1:"<"}[dir] s, suboffsets = _dump(subtree, groupdict) emit("(?%s=" % dir) #") include(suboffsets) emit("%s)" % s) elif op == ASSERT_NOT: dir, subtree = av dir = {1:"", -1:"<"}[dir] s, suboffsets = _dump(subtree, groupdict) emit("(?%s!" % dir) include(suboffsets) emit("%s)" % s) elif op == BRANCH: # av[0] is always None for x in av[1][:-1]: s, suboffsets = _dump(x, groupdict) include(suboffsets) emit(s) emit("|") s, suboffsets = _dump(av[1][-1], groupdict) include(suboffsets) emit(s) elif op == ANY: emit(".") elif op == AT: emit(_at_symbols[av]) elif op == GROUPREF: if av in groupdict: emit("(?P=%s)" % groupdict[av]) else: emit("\\%d" % av) else: raise NotImplementedError(op) # These are added in creation order, inner before outer. offsets.append( (term, start, pos[0]) ) start = pos[0] return "".join(words), offsets class NoGroupNames: def __getitem__(self, i): assert i < 100, "that would be silly" return "group_%d" % i def __contains__(self, i): return 0 def dump_and_offsets(tree): # You really should pass in a SubTree, but I'll let you # get away with a couple common variations if isinstance(tree, list): # used when building a list by hand; ignore group names d = NoGroupNames() elif isinstance(tree, tuple) and len(tree) == 2: # an (op, av) 2-pl element in the tree tree = [tree] d = NoGroupNames() else: if isinstance(tree, str): # pass in a string and I'll convert it for you tree = sre_parse.parse(tree) # need the reverse mapping to get the names d = {} for k, v in tree.pattern.groupdict.items(): d[v] = k return _dump(tree, d) def dump(tree): return dump_and_offsets(tree)[0] def pprint(tree, stream=None): s, offsets = dump_and_offsets(tree) print >>stream, s for expr, i, j in offsets: print >>stream, " "*i + "-"*(j-i) + " " *(len(s)-j+1) + s[i:j] def pformat(tree): import cStringIO as StringIO sio = StringIO.StringIO() pprint(tree, sio) return sio.getvalue() def _do_test(input, expected = None): if expected is None: expected = input tree = sre_parse.parse(input) try: output, offsets = dump_and_offsets(tree) except (NotImplementedError, KeyError), msg: import traceback print " ===== Cannot compute =====" print "Input was", repr(input) print "Exception mesage:", msg traceback.print_exc() return False if output != expected: print " ===== Different =====" print "Input was", repr(input) print "Expected:", repr(expected) print "Output is", repr(output) print tree return False # Check the offsets are generated correctly for expr, i, j in offsets: a = output[i:j] # Use the SubPattern to get the pattern names correct. b = dump(sre_parse.SubPattern(tree.pattern, [expr])) if a != b: print " ===== bad substrings =====" print "Input was", repr(input) print "Excepcted", repr(a) print "Redump is", repr(b) return False return True def test(): # Some of these taken from 'perlre', some from AMK's docs # and many I just made up test_values = r""" A AB ABC [^A] [^ABC] A+ A* A? AA* A{2} A{2,4} A{9,12} A{2,} A{,9} [\d] [^\d] [A-Za-z0-9_]+ . .* AB|CD AB|CD|EF|GH (AB|CD)* (a??) a*? a{3,}? ab{4,7}? (?PABC*) (?Px)|(?Py) [b:]+ (b)|(:+) a|(b) (?:Q)(Q) ^A*$ (?:(?Pa)|(?Pb))(?Pc)? (?=AB)C (?!CD)DC AB(?<=CD) AB(? (?P[a-zA-Z]+)(?P=name) [AB\]C] [--A] [ABC\-D] [\^ABC] """.split() passed = True for input in test_values: passed = passed and _do_test(input) # sre_parse does some optimization so I # have to allow variant forms special_values = r""" \d [\d] \d+ [\d]+ \w [\w] \W [\W] \s [\s] \S [\S] A|B [AB] A|B|C|D [ABCD] (?:a|b|c) (?:[abc]) .*\..*$ .*\..*$ .*\.([^b]..|.[^a].|..[^t])$ .*\.([^b]..|.[^a].|..[^t])$ .*\.([^b].?.?|.[^a]?.?|..?[^t]?) .*\.([^b].?.?|.[^a]?.?|..?[^t]?) .*\.(?!bat$).*$ .*\.(?!bat$).*$ .*\.(?!bat$|exe$) .*\.(?!bat$|exe$) """.split("\n")[1:-1] for x in special_values: if not x.strip(): continue input, expected = x.split() passed = passed and _do_test(input, expected) # issues with spaces others = [ r"^([^ ]*) *([^ ]*)", r"(.)\1", r"/Time: (..):(..):(..)/", r"\Aabc", r"abc\b", r"abc\Bqwe", r"qwerty\Z", (r"\n", "\\\n"), ] for x in others: if isinstance(x, str): passed = passed and _do_test(x) else: input, output = x passed = passed and _do_test(input, output) s = pformat("(AB*|C?D){2,3}((?P\d+)(?:a|b))+") if passed: print "All tests passed." else: print "Test failure." sys.exit(1) if __name__ == "__main__": import sys if len(sys.argv) == 1: test() else: in_s = sys.argv[1] x = sre_parse.parse(in_s) out_s = dump(x) print "Input :", repr(in_s) print "Output:", repr(out_s)