rendered paste bodyimport tokenizeimport pyparsing as pfrom operator import itemgetterdef parse_grammar(text): """Parses the grammar. # Valid constructs: rules: rule+ rule: NAME ':' branch NEWLINE branch: seq ('|' seq)* seq: ( repeated )+ repeated: token ( '*' | '+' )* # repeat zero/one or more times token: ( '[' branch ']' # optional group (linebreaks allowed inside) | '(' branch ')' # group (linebreaks allowed inside) | STRING # literal | NAME # named terminal, or nonterminal ) Comments (#...) and all whitespace except linebreaks are normally ignored. Basicly same rules for whitespace as in Python, except that no blocks is allowed. """ # Modes: START, COLON, BODY = xrange(3) modes = ('START','COLON','BODY') mode = START rules = {} rule_name = None stack = None branch = None seq = None end_after_this = False readline = iter(text.splitlines(1)).next for tok,lex,start,end,line in tokenize.generate_tokens(readline): if tok in (tokenize.COMMENT,tokenize.NL): continue elif tok == tokenize.ENDMARKER: end_after_this = True tok = tokenize.NEWLINE #~ print '%-6s %4d %s (%d)' % (modes[mode], start[0], tokenize.tok_name[tok], tok) if mode is START: if tok == tokenize.NAME: if lex in rules: raise ValueError("Duplicate rules. Line %d, column %d" % start) rule_name = lex mode = COLON elif tok == tokenize.NEWLINE: continue else: raise ValueError("Expecting rule name. Line %d, column %d" % start) continue elif mode is COLON: if (tok,lex) == (tokenize.OP, ':'): mode = BODY stack = [] branch = [] seq = [] else: raise ValueError("Expecting colon. Line %d, column %d" % start) continue elif mode is BODY: if tok == tokenize.NAME: seq.append(Name(lex)) elif tok == tokenize.STRING: seq.append(Literal(decode_string(lex))) elif tok == tokenize.OP: if lex in '([': stack.append((lex,branch,seq)) branch = [] seq = [] elif lex in ')]': if not stack: raise ValueError("Unbalanced groups. Line %d, column %d" % start) if len(seq) == 1: token = seq[0] else: token = Sequence(seq) branch.append(token) if len(branch) != 1: token = Branch(branch) if lex == ']': token = Optional(token) left,branch,seq = stack.pop() if lex != {'(':')','[':']'}[left]: raise ValueError("Unbalanced groups. Line %d, column %d" % start) seq.append(token) elif lex == '|': if len(seq) == 1: token = seq[0] else: token = Sequence(seq) branch.append(token) seq = [] elif lex in '*+': if not seq: raise ValueError("Nothing to repeat. Line %d, column %d" % start) seq[-1] = {'*':ZeroOrMore,'+':OneOrMore}[lex](seq[-1]) else: raise ValueError("Invalid token. Line %d, column %d" % start) elif tok == tokenize.NEWLINE: if stack: raise ValueError("Unbalanced groups. Line %d, column %d" % start) if len(seq) == 1: token = seq[0] else: token = Sequence(seq) branch.append(token) if len(branch) != 1: token = Branch(branch) if lex == ']': token = Optional(token) rules[rule_name] = Rule(rule_name,token) rule_name = stack = branch = seq = None mode = START else: raise ValueError("Invalid token. Line %d, column %d" % start) if end_after_this: break return rulesdef compile_grammar(rules,terminals,nonterminal_action=None,terminal_action=None): """Compile the grammar AST into pyparsing objects. rules: AST dictionary returned by parse_grammar terminals: dictionary of terminal name -> pyparsing object nonterminal_action: Will be called with the name of a rule, and will be excpected to return a callable that will be used as parsing action. terminal_action: Will be called with the value of a literal, and will be excpected to return a callable that will be used as parsing action. """ nonterminals = set(rules) rules = rules.values() if nonterminal_action is None: def nonterminal_action(name): return lambda toks: (name,) + tuple(toks) if terminal_action is None: def terminal_action(lexeme): return lambda toks: (lexeme, toks[0] if toks else '') def decend(node): """Recursivly compile the nodes into corresponding ParsingElements """ if isinstance(node,Name): # Names; both nonterminals and named terminals if node.name in nonterminals: return compiled[node.name] elif node.name in terminals: return terminals[node.name] else: raise KeyError("Unknown terminal: %s" % node.name) elif isinstance(node,Literal): # Embedded strings; both keywords and symbols. if node.value.replace('_','').isalnum(): ret = p.Keyword(node.value) else: ret = p.Literal(node.value) ret.setParseAction(terminal_action(node.value)) return ret elif isinstance(node,Sequence): return p.And([decend(c) for c in node]) elif isinstance(node,Branch): return p.MatchFirst([decend(c) for c in node]) elif isinstance(node,Optional): return p.Optional(decend(node.token)) elif isinstance(node,ZeroOrMore): return p.ZeroOrMore(decend(node.token)) elif isinstance(node,OneOrMore): return p.OneOrMore(decend(node.token)) # Create Forwards for each rule compiled = {} for rule in rules: compiled[rule.name] = p.Forward().setName(rule.name).setParseAction(nonterminal_action(rule.name)) # Compile each rule for rule in rules: compiled[rule.name] << decend(rule.body) return compileddef decode_string(s): """Decodes a literal string into the logical string it represents. """ u_flag = r_flag = None i = 0 if s[i] in 'uU': u_flag = True i += 1 if s[i] == 'rR': r_flag = True i += 1 if s[i:i+3] in ('"""',"'''"): content = s[i+3:-3] else: content = s[i+1:-1] if r_flag: if u_flag: content = content.decode('raw_unicode_escape') elif u_flag: content = content.decode('unicode_escape') else: content = content.decode('string_escape') return content###################################### AST classes:class Node(tuple): _children = (0,0) # slice of content that holds child nodes def __str__(self,indent=0): ret = ['%*s<%s' % (indent,'',self.__class__.__name__)] for name in dir(self): if not hasattr(Node,name): value = getattr(self,name) if not isinstance(value,Node): ret.append(' %s=%r' % (name,value)) found = False lo, hi = self._children if hi is None: hi = len(self) for i in xrange(lo,hi): if not found: ret.append('>') found = True ret.append('\n') ret.append(self[i].__str__(indent+2)) if found: ret.append('\n%*s</%s>' % (indent,'',self.__class__.__name__)) else: ret.append('/>') return ''.join(ret) def __repr__(self): return '%s(%s)' % (self.__class__.__name__,','.join(repr(item) for item in self))class Rule(Node): _children = (1,2) def __new__(cls,name,body): return super(Rule,cls).__new__(cls,(name,body)) name = property(itemgetter(0)) body = property(itemgetter(1))class Name(Node): def __new__(cls,name): return super(Name,cls).__new__(cls,(name,)) name = property(itemgetter(0))class Literal(Node): def __new__(cls,value): return super(Literal,cls).__new__(cls,(value,)) value = property(itemgetter(0))class Sequence(Node): _children = (0,None)class Branch(Node): _children = (0,None)class Optional(Node): _children = (0,1) def __new__(cls,token): return super(Optional,cls).__new__(cls,(token,)) token = property(itemgetter(0))class ZeroOrMore(Node): _children = (0,1) def __new__(cls,token): return super(ZeroOrMore,cls).__new__(cls,(token,)) token = property(itemgetter(0))class OneOrMore(Node): _children = (0,1) def __new__(cls,token): return super(OneOrMore,cls).__new__(cls,(token,)) token = property(itemgetter(0))###################################### Python grammar specifics belowurl = 'http://svn.python.org/view/*checkout*/python/tags/r261/Grammar/Grammar'print "Downloading grammar from %r..." % urlimport urllib2f = urllib2.urlopen(url)data = f.read()f.close()print "Download complete!"printimport token, symboltoken_names = { "~": token.TILDE, '!=': token.NOTEQUAL, '%': token.PERCENT, '%=': token.PERCENTEQUAL, '&': token.AMPER, '&=': token.AMPEREQUAL, '(': token.LPAR, ')': token.RPAR, '*': token.STAR, '**': token.DOUBLESTAR, '**=': token.DOUBLESTAREQUAL, '*=': token.STAREQUAL, '+': token.PLUS, '+=': token.PLUSEQUAL, ',': token.COMMA, '-': token.MINUS, '-=': token.MINEQUAL, '.': token.DOT, '/': token.SLASH, '//': token.DOUBLESLASH, '//=': token.DOUBLESLASHEQUAL, '/=': token.SLASHEQUAL, ':': token.COLON, ';': token.SEMI, '<': token.LESS, '<<': token.LEFTSHIFT, '<<=': token.LEFTSHIFTEQUAL, '<=': token.LESSEQUAL, '<>': token.NOTEQUAL, '=': token.EQUAL, '==': token.EQEQUAL, '>': token.GREATER, '>=': token.GREATEREQUAL, '>>': token.RIGHTSHIFT, '>>=': token.RIGHTSHIFTEQUAL, '@': token.AT, '[': token.LSQB, ']': token.RSQB, '^': token.CIRCUMFLEX, '^=': token.CIRCUMFLEXEQUAL, '`': token.BACKQUOTE, '{': token.LBRACE, '|': token.VBAR, '|=': token.VBAREQUAL, '}': token.RBRACE, 'and': token.NAME, 'as': token.NAME, 'assert': token.NAME, 'break': token.NAME, 'class': token.NAME, 'continue': token.NAME, 'def': token.NAME, 'del': token.NAME, 'elif': token.NAME, 'else': token.NAME, 'except': token.NAME, 'exec': token.NAME, 'finally': token.NAME, 'for': token.NAME, 'from': token.NAME, 'global': token.NAME, 'if': token.NAME, 'import': token.NAME, 'in': token.NAME, 'is': token.NAME, 'lambda': token.NAME, 'not': token.NAME, 'or': token.NAME, 'pass': token.NAME, 'print': token.NAME, 'raise': token.NAME, 'return': token.NAME, 'try': token.NAME, 'while': token.NAME, 'with': token.NAME, 'yield': token.NAME,}def nonterminal_action(name): """Returns the parsing action for the rules below. """ id = getattr(symbol,name) return lambda toks: (id,) + tuple(toks)def terminal_action(lexeme): """Returns the parsing action for nonterminals. """ if lexeme in token_names: id = token_names[lexeme] elif hasattr(token,lexeme): id = getattr(token,lexeme) else: id = getattr(symbol,lexeme) return lambda toks: (id,toks[0] if toks else '')p.ParserElement.setDefaultWhitespaceChars(' \t')string_start = (p.Literal('"""') | "'''" | '"' | "'")string_token = ('\\' + p.CharsNotIn("",exact=1) | p.CharsNotIn('\\',exact=1))string_end = p.matchPreviousExpr(string_start)terminals = { 'NEWLINE': p.Literal('\n').setWhitespaceChars(' \t') .setName('NEWLINE').setParseAction(terminal_action('NEWLINE')), 'ENDMARKER': p.stringEnd.copy().setWhitespaceChars(' \t') .setName('ENDMARKER').setParseAction(terminal_action('ENDMARKER')), 'NAME': (p.Word(p.alphas + "_", p.alphanums + "_", asKeyword=True)) .setName('NAME').setParseAction(terminal_action('NAME')), 'NUMBER': p.Combine( p.Word(p.nums) + p.CaselessLiteral("l") | (p.Word(p.nums) + p.Optional("." + p.Optional(p.Word(p.nums))) | "." + p.Word(p.nums)) + p.Optional(p.CaselessLiteral("e") + p.Optional(p.Literal("+") | "-") + p.Word(p.nums)) + p.Optional(p.CaselessLiteral("j")) ).setName('NUMBER').setParseAction(terminal_action('NUMBER')), 'STRING': p.Combine( p.Optional(p.CaselessLiteral('u')) + p.Optional(p.CaselessLiteral('r')) + string_start + p.ZeroOrMore(~string_end + string_token) + string_end ).setName('STRING').setParseAction(terminal_action('STRING')), # I can't find a good way of parsing indents/dedents. # The Grammar just has the tokens NEWLINE, INDENT and DEDENT scattered accross the rules. # A single NEWLINE would be translated to NEWLINE + PEER (from pyparsing.indentedBlock()), unless followed by INDENT or DEDENT # That NEWLINE and IN/DEDENT could be spit across rule boundaries. (see the 'suite' rule) 'INDENT': (p.LineStart() + p.Optional(p.Word(' '))).setName('INDENT'), 'DEDENT': (p.LineStart() + p.Optional(p.Word(' '))).setName('DEDENT')}# compile the rules, and extract the rule for expressionsrules = parse_grammar(data)compiled = compile_grammar(rules,terminals,nonterminal_action,terminal_action)eval_input = compiled['eval_input']# Test strings = "[x for x in dir(symbol) if not isinstance(gettatr(symbol,x),int)]"print "input: %r" % s# First use internal parserprintimport parsernormal = parser.expr(s).totuple()print "normal = parser.expr(s).totuple():\n", normal# Then use pyparsingprintpyparsing = eval_input.parseString(s,parseAll=True)[0]print "pyparsing = eval_input.parseString(s,parseAll=True)[0]:\n", pyparsing# Compare the resultsprintprint "No implicit newline at the end, so slice is nessecary for comparison"print "normal[:-2] == pyparsing[:-1]:", normal[:-2] == pyparsing[:-1]