# Code adapted from:
# http://diveintopython.org/html_processing/extracting_data.html
"""Extract the text of anchors located inside the "categories" <div>.

Runs on Python 2 (using ``sgmllib``) and on Python 3, where ``sgmllib`` no
longer exists and a small stand-in built on ``html.parser`` is used instead.
"""

try:
    from sgmllib import SGMLParser
except ImportError:
    # sgmllib was removed in Python 3; fall back to html.parser.
    from html.parser import HTMLParser

    class SGMLParser(HTMLParser):
        """Stand-in covering only the sgmllib behaviour this module needs.

        Start and end tags are forwarded to ``start_<tag>(attrs)`` and
        ``end_<tag>()`` methods when the subclass defines them; tags without
        a matching handler are ignored.
        """

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, "start_" + tag, None)
            if handler is not None:
                handler(attrs)

        def handle_endtag(self, tag):
            handler = getattr(self, "end_" + tag, None)
            if handler is not None:
                handler()


# Sample markup; the value is kept identical to the original literal.
TEXT = (
    '<div id="categories"> <div id="cat-undo"> <ul> <li> '
    '<a class="subtle" href="/dir/?link=list&sid=3965">Arts & Humanities</a>'
    ' </li> </ul> </div></div>'
)


class AnchorText(SGMLParser):
    """Collect the text of every <a> found inside the categories <div>.

    After ``feed()``, the concatenated anchor text is available as ``text``.

    Attributes:
        in_categories: True while the parser is inside the categories div.
        in_correct_a: True while inside an <a> that is within that div.
        text: Anchor text accumulated so far.
    """

    def reset(self):
        """Clear all parsing state, then reset the underlying parser."""
        self.in_categories = False
        self.in_correct_a = False
        self.text = ""
        # Number of currently open <div>s counted from the categories div
        # (inclusive); lets end_div tell a nested </div> from the closing one.
        self._div_depth = 0
        SGMLParser.reset(self)

    def start_a(self, attrs):
        """Mark the anchor as relevant when it opens inside the section."""
        if self.in_categories:
            self.in_correct_a = True

    def end_a(self):
        """Stop collecting text once the anchor closes."""
        self.in_correct_a = False

    def start_div(self, attrs):
        """Enter the categories section, or track a div nested inside it."""
        if self.in_categories:
            self._div_depth += 1
        # NOTE: as in the original, any attribute whose value is
        # 'categories' qualifies, not only ``id``.
        elif any(value == 'categories' for _name, value in attrs):
            self.in_categories = True
            self._div_depth = 1

    def end_div(self):
        """Leave the section only when the categories div itself closes.

        Previously every </div> ended the section, so anchors that followed
        a nested div were dropped.
        """
        if not self.in_categories:
            return
        self._div_depth -= 1
        if self._div_depth == 0:
            self.in_categories = False

    def handle_data(self, dat):
        """Append character data that belongs to a relevant anchor."""
        if self.in_correct_a:
            self.text += dat


def main():
    """Parse the sample markup and print the extracted anchor text."""
    parser = AnchorText()
    parser.feed(TEXT)
    print(parser.text)  # prints "Arts & Humanities"


if __name__ == "__main__":
    main()