All pastes #852381 Raw Edit

Simple parser

public python v1 · immutable
#852381 ·published 2008-01-12 20:22 UTC
rendered paste body
# code adapted from:
# http://diveintopython.org/html_processing/extracting_data.html

try:
    # Python 2: the parser this example was originally written against.
    from sgmllib import SGMLParser
except ImportError:
    # sgmllib was removed in Python 3.0.  Rebuild the small part of its API
    # that AnchorText relies on (start_<tag>/end_<tag> dispatch) on top of
    # html.parser so the script keeps working there.
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Minimal stand-in for ``sgmllib.SGMLParser``.

        Only the ``start_<tag>(attrs)`` / ``end_<tag>()`` callback dispatch
        is reproduced; tags without a matching method are ignored.
        """

        def handle_starttag(self, tag, attrs):
            """Forward an opening tag to ``start_<tag>(attrs)`` if defined."""
            handler = getattr(self, "start_" + tag, None)
            if handler is not None:
                handler(attrs)

        def handle_endtag(self, tag):
            """Forward a closing tag to ``end_<tag>()`` if defined."""
            handler = getattr(self, "end_" + tag, None)
            if handler is not None:
                handler()


# Sample markup: the anchor of interest sits inside a <div> that is itself
# nested in the "categories" container.
TEXT = """<div id="categories">
  <div id="cat-undo">
    <ul>
      <li>
        <a class="subtle" href="/dir/?link=list&sid=3965">Arts & Humanities</a>
      </li>
    </ul>
  </div>
</div>"""


class AnchorText(SGMLParser):
    """Collect the text of every ``<a>`` inside ``<div id="categories">``.

    Feed markup with ``feed()``; the concatenated anchor text is then
    available as ``self.text``.

    Attributes:
        in_categories: True while the parser is inside the categories div.
        in_correct_a: True while inside an ``<a>`` opened within that div.
        text: All anchor text collected so far, concatenated in order.
    """

    def reset(self):
        """Clear all collected state and reset the underlying parser."""
        self.in_categories = False
        self.in_correct_a = False
        self.text = ""
        # Number of <div> elements currently open, counted from the
        # categories container itself (0 means we are outside of it).
        self._div_depth = 0
        # Explicit base call rather than super(): the Python 2 parser
        # classes are old-style, where super() is not available.
        SGMLParser.reset(self)

    def start_a(self, attrs):
        """Start capturing text if this anchor is inside the container."""
        if self.in_categories:
            self.in_correct_a = True

    def end_a(self):
        """Stop capturing text at the end of an anchor."""
        self.in_correct_a = False

    def start_div(self, attrs):
        """Enter the categories container or track a div nested in it.

        Args:
            attrs: List of ``(name, value)`` attribute pairs of the tag.
        """
        if self.in_categories:
            # A nested div: remember it so that its closing tag is not
            # mistaken for the end of the container.
            self._div_depth += 1
        elif any(name == "id" and value == "categories"
                 for name, value in attrs):
            # Only the id attribute identifies the container; another
            # attribute that happens to hold "categories" does not count.
            self.in_categories = True
            self._div_depth = 1

    def end_div(self):
        """Leave the container once its own closing tag is reached."""
        if not self.in_categories:
            return
        self._div_depth -= 1
        if self._div_depth == 0:
            self.in_categories = False

    def handle_data(self, dat):
        """Append *dat* to ``self.text`` while inside a relevant anchor."""
        if self.in_correct_a:
            self.text += dat


if __name__ == "__main__":
    parser = AnchorText()
    parser.feed(TEXT)
    parser.close()  # flush any text the parser is still buffering
    print(parser.text)  # prints "Arts & Humanities"