MizardX

public python v1 · immutable
#1609860
·published 2009-10-10 13:28 UTC
import tokenizeimport pyparsing as pfrom operator import itemgetterdef parse_grammar(text):    """Parses the grammar.    # Valid constructs:    rules:    rule+    rule:     NAME ':' branch NEWLINE    branch:   seq ('|' seq)*    seq:      ( repeated )+    repeated: token ( '*' | '+' )*     # repeat zero/one or more times    token:    ( '[' branch ']'         # optional group (linebreaks allowed inside)              | '(' branch ')'         # group (linebreaks allowed inside)              | STRING                 # literal              | NAME                   # named terminal, or nonterminal              )    Comments (#...) and all whitespace except linebreaks are normally ignored.    Basicly same rules for whitespace as in Python, except that no blocks is allowed.    """    # Modes:    START, COLON, BODY = xrange(3)    modes = ('START','COLON','BODY')    mode = START    rules = {}    rule_name = None    stack = None    branch = None    seq = None    end_after_this = False    readline = iter(text.splitlines(1)).next    for tok,lex,start,end,line in tokenize.generate_tokens(readline):        if tok in (tokenize.COMMENT,tokenize.NL):            continue        elif tok == tokenize.ENDMARKER:            end_after_this = True            tok = tokenize.NEWLINE        #~ print '%-6s %4d %s (%d)' % (modes[mode], start[0], tokenize.tok_name[tok], tok)        if mode is START:            if tok == tokenize.NAME:                if lex in rules:                    raise ValueError("Duplicate rules. Line %d, column %d" % start)                rule_name = lex                mode = COLON            elif tok == tokenize.NEWLINE:                continue            else:                raise ValueError("Expecting rule name. Line %d, column %d" % start)            continue        elif mode is COLON:            if (tok,lex) == (tokenize.OP, ':'):                mode = BODY                stack = []                branch = []                seq = []            else:                raise ValueError("Expecting colon. Line %d, column %d" % start)            continue        elif mode is BODY:            if tok == tokenize.NAME:                seq.append(Name(lex))            elif tok == tokenize.STRING:                seq.append(Literal(decode_string(lex)))            elif tok == tokenize.OP:                if lex in '([':                    stack.append((lex,branch,seq))                    branch = []                    seq = []                elif lex in ')]':                    if not stack:                        raise ValueError("Unbalanced groups. Line %d, column %d" % start)                    if len(seq) == 1:                        token = seq[0]                    else:                        token = Sequence(seq)                    branch.append(token)                    if len(branch) != 1:                        token = Branch(branch)                    if lex == ']':                        token = Optional(token)                    left,branch,seq = stack.pop()                    if lex != {'(':')','[':']'}[left]:                        raise ValueError("Unbalanced groups. Line %d, column %d" % start)                    seq.append(token)                elif lex == '|':                    if len(seq) == 1:                        token = seq[0]                    else:                        token = Sequence(seq)                    branch.append(token)                    seq = []                elif lex in '*+':                    if not seq:                        raise ValueError("Nothing to repeat. Line %d, column %d" % start)                    seq[-1] = {'*':ZeroOrMore,'+':OneOrMore}[lex](seq[-1])                else:                    raise ValueError("Invalid token. Line %d, column %d" % start)            elif tok == tokenize.NEWLINE:                if stack:                    raise ValueError("Unbalanced groups. Line %d, column %d" % start)                if len(seq) == 1:                    token = seq[0]                else:                    token = Sequence(seq)                branch.append(token)                if len(branch) != 1:                    token = Branch(branch)                if lex == ']':                    token = Optional(token)                rules[rule_name] = Rule(rule_name,token)                rule_name = stack = branch = seq = None                mode = START            else:                raise ValueError("Invalid token. Line %d, column %d" % start)        if end_after_this:            break    return rulesdef compile_grammar(rules,terminals,nonterminal_action=None,terminal_action=None):    """Compile the grammar AST into pyparsing objects.    rules: AST dictionary returned by parse_grammar    terminals: dictionary of terminal name -> pyparsing object    nonterminal_action: Will be called with the name of a rule, and will be excpected        to return a callable that will be used as parsing action.    terminal_action: Will be called with the value of a literal, and will be        excpected to return a callable that will be used as parsing action.    """    nonterminals = set(rules)    rules = rules.values()    if nonterminal_action is None:        def nonterminal_action(name):            return lambda toks: (name,) + tuple(toks)    if terminal_action is None:        def terminal_action(lexeme):            return lambda toks: (lexeme, toks[0] if toks else '')    def decend(node):        """Recursivly compile the nodes into corresponding ParsingElements        """        if isinstance(node,Name):            # Names; both nonterminals and named terminals            if node.name in nonterminals:                return compiled[node.name]            elif node.name in terminals:                return terminals[node.name]            else:                raise KeyError("Unknown terminal: %s" % node.name)        elif isinstance(node,Literal):            # Embedded strings; both keywords and symbols.            if node.value.replace('_','').isalnum():                ret = p.Keyword(node.value)            else:                ret = p.Literal(node.value)            ret.setParseAction(terminal_action(node.value))            return ret        elif isinstance(node,Sequence):            return p.And([decend(c) for c in node])        elif isinstance(node,Branch):            return p.MatchFirst([decend(c) for c in node])        elif isinstance(node,Optional):            return p.Optional(decend(node.token))        elif isinstance(node,ZeroOrMore):            return p.ZeroOrMore(decend(node.token))        elif isinstance(node,OneOrMore):            return p.OneOrMore(decend(node.token))    # Create Forwards for each rule    compiled = {}    for rule in rules:        compiled[rule.name] = p.Forward().setName(rule.name).setParseAction(nonterminal_action(rule.name))    # Compile each rule    for rule in rules:        compiled[rule.name] << decend(rule.body)    return compileddef decode_string(s):    """Decodes a literal string into the logical string it represents.    """    u_flag = r_flag = None    i = 0    if s[i] in 'uU':      u_flag = True      i += 1    if s[i] == 'rR':      r_flag = True      i += 1    if s[i:i+3] in ('"""',"'''"):        content = s[i+3:-3]    else:        content = s[i+1:-1]    if r_flag:        if u_flag:            content = content.decode('raw_unicode_escape')    elif u_flag:        content = content.decode('unicode_escape')    else:        content = content.decode('string_escape')    return content###################################### AST classes:class Node(tuple):    _children = (0,0) # slice of content that holds child nodes    def __str__(self,indent=0):        ret = ['%*s<%s' % (indent,'',self.__class__.__name__)]        for name in dir(self):            if not hasattr(Node,name):                value = getattr(self,name)                if not isinstance(value,Node):                    ret.append(' %s=%r' % (name,value))        found = False        lo, hi = self._children        if hi is None: hi = len(self)        for i in xrange(lo,hi):            if not found:                ret.append('>')                found = True            ret.append('\n')            ret.append(self[i].__str__(indent+2))        if found:            ret.append('\n%*s</%s>' % (indent,'',self.__class__.__name__))        else:            ret.append('/>')        return ''.join(ret)        def __repr__(self):        return '%s(%s)' % (self.__class__.__name__,','.join(repr(item) for item in self))class Rule(Node):    _children = (1,2)        def __new__(cls,name,body):        return super(Rule,cls).__new__(cls,(name,body))        name = property(itemgetter(0))    body = property(itemgetter(1))class Name(Node):        def __new__(cls,name):        return super(Name,cls).__new__(cls,(name,))        name = property(itemgetter(0))class Literal(Node):        def __new__(cls,value):        return super(Literal,cls).__new__(cls,(value,))        value = property(itemgetter(0))class Sequence(Node):    _children = (0,None)class Branch(Node):    _children = (0,None)class Optional(Node):    _children = (0,1)        def __new__(cls,token):        return super(Optional,cls).__new__(cls,(token,))        token = property(itemgetter(0))class ZeroOrMore(Node):    _children = (0,1)        def __new__(cls,token):        return super(ZeroOrMore,cls).__new__(cls,(token,))        token = property(itemgetter(0))class OneOrMore(Node):    _children = (0,1)        def __new__(cls,token):        return super(OneOrMore,cls).__new__(cls,(token,))        token = property(itemgetter(0))###################################### Python grammar specifics belowurl = 'http://svn.python.org/view/*checkout*/python/tags/r261/Grammar/Grammar'print "Downloading grammar from %r..." % urlimport urllib2f = urllib2.urlopen(url)data = f.read()f.close()print "Download complete!"printimport token, symboltoken_names = {    "~": token.TILDE, '!=': token.NOTEQUAL, '%': token.PERCENT, '%=': token.PERCENTEQUAL, '&': token.AMPER,    '&=': token.AMPEREQUAL, '(': token.LPAR, ')': token.RPAR, '*': token.STAR, '**': token.DOUBLESTAR,    '**=': token.DOUBLESTAREQUAL, '*=': token.STAREQUAL, '+': token.PLUS, '+=': token.PLUSEQUAL, ',': token.COMMA,    '-': token.MINUS, '-=': token.MINEQUAL, '.': token.DOT, '/': token.SLASH, '//': token.DOUBLESLASH,    '//=': token.DOUBLESLASHEQUAL, '/=': token.SLASHEQUAL, ':': token.COLON, ';': token.SEMI, '<': token.LESS,    '<<': token.LEFTSHIFT, '<<=': token.LEFTSHIFTEQUAL, '<=': token.LESSEQUAL, '<>': token.NOTEQUAL,    '=': token.EQUAL, '==': token.EQEQUAL, '>': token.GREATER, '>=': token.GREATEREQUAL, '>>': token.RIGHTSHIFT,    '>>=': token.RIGHTSHIFTEQUAL, '@': token.AT, '[': token.LSQB, ']': token.RSQB, '^': token.CIRCUMFLEX,    '^=': token.CIRCUMFLEXEQUAL, '`': token.BACKQUOTE, '{': token.LBRACE, '|': token.VBAR, '|=': token.VBAREQUAL,    '}': token.RBRACE,    'and': token.NAME, 'as': token.NAME, 'assert': token.NAME, 'break': token.NAME, 'class': token.NAME,    'continue': token.NAME, 'def': token.NAME, 'del': token.NAME, 'elif': token.NAME, 'else': token.NAME,    'except': token.NAME, 'exec': token.NAME, 'finally': token.NAME, 'for': token.NAME, 'from': token.NAME,    'global': token.NAME, 'if': token.NAME, 'import': token.NAME, 'in': token.NAME, 'is': token.NAME,    'lambda': token.NAME, 'not': token.NAME, 'or': token.NAME, 'pass': token.NAME, 'print': token.NAME,    'raise': token.NAME, 'return': token.NAME, 'try': token.NAME, 'while': token.NAME, 'with': token.NAME,    'yield': token.NAME,}def nonterminal_action(name):    """Returns the parsing action for the rules below.    """    id = getattr(symbol,name)    return lambda toks: (id,) + tuple(toks)def terminal_action(lexeme):    """Returns the parsing action for nonterminals.    """    if lexeme in token_names:        id = token_names[lexeme]    elif hasattr(token,lexeme):        id = getattr(token,lexeme)    else:        id = getattr(symbol,lexeme)    return lambda toks: (id,toks[0] if toks else '')p.ParserElement.setDefaultWhitespaceChars(' \t')string_start = (p.Literal('"""') | "'''" | '"' | "'")string_token = ('\\' + p.CharsNotIn("",exact=1) | p.CharsNotIn('\\',exact=1))string_end = p.matchPreviousExpr(string_start)terminals = {    'NEWLINE': p.Literal('\n').setWhitespaceChars(' \t')        .setName('NEWLINE').setParseAction(terminal_action('NEWLINE')),    'ENDMARKER': p.stringEnd.copy().setWhitespaceChars(' \t')        .setName('ENDMARKER').setParseAction(terminal_action('ENDMARKER')),    'NAME': (p.Word(p.alphas + "_", p.alphanums + "_", asKeyword=True))        .setName('NAME').setParseAction(terminal_action('NAME')),    'NUMBER': p.Combine(            p.Word(p.nums) + p.CaselessLiteral("l") |            (p.Word(p.nums) + p.Optional("." + p.Optional(p.Word(p.nums))) | "." + p.Word(p.nums)) +                p.Optional(p.CaselessLiteral("e") + p.Optional(p.Literal("+") | "-") + p.Word(p.nums)) +                p.Optional(p.CaselessLiteral("j"))        ).setName('NUMBER').setParseAction(terminal_action('NUMBER')),    'STRING': p.Combine(            p.Optional(p.CaselessLiteral('u')) +            p.Optional(p.CaselessLiteral('r')) +            string_start + p.ZeroOrMore(~string_end + string_token) + string_end        ).setName('STRING').setParseAction(terminal_action('STRING')),    # I can't find a good way of parsing indents/dedents.    # The Grammar just has the tokens NEWLINE, INDENT and DEDENT scattered accross the rules.    # A single NEWLINE would be translated to NEWLINE + PEER (from pyparsing.indentedBlock()), unless followed by INDENT or DEDENT    # That NEWLINE and IN/DEDENT could be spit across rule boundaries. (see the 'suite' rule)    'INDENT': (p.LineStart() + p.Optional(p.Word(' '))).setName('INDENT'),    'DEDENT': (p.LineStart() + p.Optional(p.Word(' '))).setName('DEDENT')}# compile the rules, and extract the rule for expressionsrules = parse_grammar(data)compiled = compile_grammar(rules,terminals,nonterminal_action,terminal_action)eval_input = compiled['eval_input']# Test strings = "[x for x in dir(symbol) if not isinstance(gettatr(symbol,x),int)]"print "input: %r" % s# First use internal parserprintimport parsernormal = parser.expr(s).totuple()print "normal = parser.expr(s).totuple():\n", normal# Then use pyparsingprintpyparsing = eval_input.parseString(s,parseAll=True)[0]print "pyparsing = eval_input.parseString(s,parseAll=True)[0]:\n", pyparsing# Compare the resultsprintprint "No implicit newline at the end, so slice is nessecary for comparison"print "normal[:-2] == pyparsing[:-1]:", normal[:-2] == pyparsing[:-1]