"""A full text indexer for elixir objects"""from __future__ import with_statementfrom datetime import datetimefrom elixir import Entity, Field, OneToMany, ManyToOnefrom elixir import Unicode, Integer, DateTimefrom elixir import using_options, using_table_optionsfrom elixir import Statement, metadataimport typesfrom cleveland.html2text import html2plainTextfrom elixir.entity import EntityMetafrom sqlalchemy.schema import UniqueConstraintfrom sqlalchemy.orm import MapperExtensionfrom sqlalchemy.orm.attributes import InstrumentedAttributefrom sqlalchemy.orm.properties import ColumnPropertyimport retagStripper = re.compile('<[^>]*>')wordSplitter = re.compile('[^a-z,A-Z,0-9]')# Global variablesindexable = {}# Classes to store the index data..class Word(Entity): """ An indexible word """ using_options(tablename='indexer_word') word = Field(Unicode(), required=True, unique=True, index=True ) matches = OneToMany('Match')class Match(Entity): """ Hooks together words to urls """ using_options(tablename='indexer_match') using_table_options(UniqueConstraint('word_id', 'url_id')) word = ManyToOne('Word', ondelete='cascade', required=True) url = ManyToOne('URL', ondelete='cascade', required=True) count = Field(Integer(), required=True)class URL(Entity): """ Holds urls """ using_options(tablename='indexer_url') url = Field(Unicode(), required=True, unique=True, index=True) lastIndexed = Field(DateTime(), required=True, default=datetime.now) extraData = Field(Unicode()) # Extra data to store against url. Eg. "tablename=person recordId=4" matches = OneToMany('Match')# Helper functionsdef indexFields(cls): """ Returns a list of field names for indexing. Doesn't take into account the __index__ attribute. """ result = [] for name, attr in cls.__dict__.items(): # Yes we want __class__ is, not isinstance if attr.__class__ is InstrumentedAttribute: if isinstance(attr.property, ColumnProperty): col = getattr(cls, name).property.columns[0] typeName = str(col.type) if typeName.startswith('Unicode') or typeName.startswith('Text'): # Index all type names result.append(name) return resultdef prepModule(module): """ Searches for indexable objects in a module """ for className, cls in module.__dict__.items(): if isinstance(cls, EntityMeta) and cls is not Entity: cls.mapper.compile() prepClass(cls)def prepClass(cls): """ Preps a class if necessary """ fieldsToIndex = indexFields(cls) if hasattr(cls, '__indexFields__'): cls.__indexFields__(fieldsToIndex) if fieldsToIndex: cls.__fieldsToIndex__ = fieldsToIndexdef indexAllClasses(): """ Indexes all the classes that we've found through prepModule """ for cls, attrNames in indexable.items(): print 'indexing', cls items = cls.query.all() for item in items: indexInstance(item, attrNames)def indexInstance(obj, attrNames=None, checkIfModified=False): """ [Re]index an elixir instance """ if not hasattr(obj, '__fieldsToIndex__'): prepClass(obj.__class__) url = obj.url.encode('utf=8') if checkIfModified and not obj._sa_instance_state.modified: return False if hasattr(obj, '__getIndexPhrases__'): phrases = obj.__getIndexPhrases__() else: if not attrNames: attrNames = obj.__fieldsToIndex__ htmlAttrNames = obj.__htmlFieldsToIndex__ changedAttrNames = set(attrNames+htmlAttrNames) if checkIfModified: changedAttrNames -= obj._sa_instance_state.unmodified if not changedAttrNames: return False phrases = [] for attrName in attrNames: phrases.append(getattr(obj, attrName)) for attrName in htmlAttrNames: text = html2plainText(getattr(obj, attrName) or '') phrases.append(text) if hasattr(obj, '__getExtraData__'): extraData = obj.__getExtraData__() else: extraData = None ourWords = {} ourMatches = {} for phrase in phrases: if not phrase: continue if isinstance(phrase, str): phrase = phrase.decode('utf-8') phrase = tagStripper.sub('', phrase.lower()) words = wordSplitter.split(phrase) for word in words: word = wordSplitter.sub('', word) if not word: continue # Add to our list ourMatches[word] = ourMatches.get(word, 0)+1 # Now update the database # (Re)create the urlId urlIds = metadata.bind.text('SELECT id from indexer_url where url=:url').execute(url=url).fetchall() urlId = urlIds and urlIds[0][0] or None if urlId: # TODO: Maybe just load all the previous matches and words into sets in memory and compare.. ? # De-index object deIndex(obj.url) metadata.bind.text('insert into indexer_url (url, "extraData", "lastIndexed") values (:url, :extraData, now())').execute( url=url, extraData=extraData) urlIds = metadata.bind.text('select id from indexer_url where url=:url').execute( url=url).fetchall() urlId = urlIds and urlIds[0][0] or None # Create all the words and matches for word, count in ourMatches.items(): # Ensure word exists existingWords = metadata.bind.text( 'select id from indexer_word where word=:word limit 1').execute( word=word).fetchall() if existingWords: wordId = existingWords[0][0] else: # Insert the new word record if it doesn't exist metadata.bind.text('insert into indexer_word (word) values (:word)').execute(word=word) wordId = metadata.bind.text( 'select id from indexer_word where word=:word limit 1').execute( word=word).fetchone()[0] # Insert the new match record metadata.bind.text( 'insert into indexer_match (word_id, url_id, count) ' 'values (:wordId, :urlId, :count)').execute( wordId=wordId, urlId=urlId, count=count) matchInfo = metadata.bind.text( 'select id, count from indexer_match where ' 'word_id=:wordId and url_id=:urlId limit 1').execute( wordId=wordId, urlId=urlId).fetchone()def cleanUpOrphanWords(self): """ Kills all orphan words """ [w.delete() for w in Word.query.filter_by(matches=[])]def search(phrase): """ Search for a url with pages """ if isinstance(phrase, str): phrase = phrase.decode('utf-8') words = map(unicode.lower, wordSplitter.split(phrase)) urls = {} for word in words: wordObj = Word.query.filter_by(word=word).first() if wordObj: for match in wordObj.matches: urls.setdefault(match.url.url, [0, match.url.extraData]) urls[match.url.url][0] += match.count return sorted(urls.items(), reverse=True, key=lambda item: item[1][0])def deIndex(url): """ Removes a url from the DB """ # Get the url id url = url.encode('utf-8') urls = metadata.bind.text('select id from indexer_url where url=:url').execute(url=url).fetchall() if not urls: return urlId = urls[0][0] # Get all the matches matches = metadata.bind.text('select id, word_id from indexer_match where url_id=:urlId').execute(urlId=urlId).fetchall() for matchId, wordId in matches: # See if we need to delete the word words = metadata.bind.text('select distinct url_id from indexer_match where word_id=:wordId limit 2').execute(wordId=wordId).fetchall() # If this is the only url with these words in it, delete the word records if len(words) == 1: metadata.bind.text('delete from indexer_word where id=:wordId').execute(wordId=wordId) # Now delete all the match objects in one sweep metadata.bind.text('delete from indexer_match where url_id=:urlId').execute(urlId=urlId) # Finally delete the url metadata.bind.text('delete from indexer_url where id=:urlId').execute(urlId=urlId)def urlChanged(oldUrl, newUrl): """ To be called by objects when their url changes """ metadata.bind.text('UPDATE indexer_url SET url=:newUrl WHERE url=:oldUrl').execute(newUrl=newUrl, oldUrl=oldUrl) # metadata.bind.text('UPDATE indexer_url u SET url=:newUrl || substring(url from :start) WHERE url like(:oldUrl || "%")').execute(newUrl=newUrl, oldUrl=oldUrl, start=start)# Mapper extension to keep up indexingclass IndexerExtension(MapperExtension): """ Keeps urls on objects up to date """ def after_insert(self, mapper, connection, instance): indexInstance(instance) def after_update(self, mapper, connection, instance): indexInstance(instance) def before_delete(self, mapper, connection, instance): instance.refresh() deIndex(instance.url)indexerExtension = IndexerExtension()def makeEntityIndexable(entity, fields=None, htmlFields=None): extensions = entity._descriptor.mapper_options.get('extension', []) if type(extensions) is not types.ListType: extensions = [extensions] if indexerExtension not in extensions: extensions.append(indexerExtension) entity._descriptor.mapper_options['extension'] = extensions # Get all the text based fields so we can index them if fields is None: fields = indexFields(entity) if hasattr(entity, '__indexFields__'): entity.__indexFields__(fields) if fields: entity.__fieldsToIndex__ = fields if htmlFields: entity.__htmlFieldsToIndex__ = htmlFieldsclass Indexed(object): def __init__(self, entity, fields=None, htmlFields=None): makeEntityIndexable(entity, fields, htmlFields)# An elixir statement to make an object indexedindexed = Statement(Indexed)---------------------------------- 8< ----------------------------------And the html2text bit (this could definitely do with some work):---------------------------------- 8< ----------------------------------#!/usr/bin/env python"""html2text: Turn HTML into equivalent Markdown-structured text."""__version__ = "2.3"__author__ = "Aaron Swartz (me@aaronsw.com)"__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]# TODO:# Support decoded entities with unifiable.# Relative URL resolutionif not hasattr(__builtins__, 'True'): True, False = 1, 0import re, sys, urllib, htmlentitydefs, codecsimport sgmllibsgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')try: from textwrap import wrapexcept: pass# Use Unicode characters instead of their ascii psuedo-replacementsUNICODE_SNOB = 1# Put the links after each paragraph instead of at the end.LINKS_EACH_PARAGRAPH = 1# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)BODY_WIDTH = 78# Don't show internal links (href="#local-anchor") -- corresponding link targets# won't be visible in the plain text file anyway.SKIP_INTERNAL_LINKS = True### Entity Nonsense ###def name2cp(k): if k == 'apos': return ord("'") if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 return htmlentitydefs.name2codepoint[k] else: k = htmlentitydefs.entitydefs[k] if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 return ord(codecs.latin_1_decode(k)[0])unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*','ndash':'-', 'oelig':'oe', 'aelig':'ae','agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i','ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}unifiable_n = {}for k in unifiable.keys(): unifiable_n[name2cp(k)] = unifiable[k]def charref(name): if name[0] in ['x','X']: c = int(name[1:], 16) else: c = int(name) if not UNICODE_SNOB and c in unifiable_n.keys(): return unifiable_n[c] else: return unichr(c)def entityref(c): if not UNICODE_SNOB and c in unifiable.keys(): return unifiable[c] else: try: name2cp(c) except KeyError: return "&" + c else: return unichr(name2cp(c))def replaceEntities(s): s = s.group(1) if s[0] == "#": return charref(s[1:]) else: return entityref(s)r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")def unescape(s): return r_unescape.sub(replaceEntities, s)def fixattrs(attrs): # Fix bug in sgmllib.py if not attrs: return attrs newattrs = [] for attr in attrs: newattrs.append((attr[0], unescape(attr[1]))) return newattrs### End Entity Nonsense ###def onlywhite(line): """Return true if the line does only consist of whitespace characters.""" for c in line: if c is not ' ' and c is not ' ': return c is ' ' return linedef optwrap(text): """Wrap all paragraphs in the provided text.""" if not BODY_WIDTH: return text assert wrap # Requires Python 2.3. result = '' newlines = 0 for para in text.split("\n"): if len(para) > 0: if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': for line in wrap(para, BODY_WIDTH): result += line + "\n" result += "\n" newlines = 2 else: if not onlywhite(para): result += para + "\n" newlines = 1 else: if newlines < 2: result += "\n" newlines += 1 return resultdef hn(tag): if tag[0] == 'h' and len(tag) == 2: try: n = int(tag[1]) if n in range(1, 10): return n except ValueError: return 0class _html2text(sgmllib.SGMLParser): def __init__(self, out=sys.stdout.write): sgmllib.SGMLParser.__init__(self) if out is None: self.out = self.outtextf else: self.out = out self.outtext = u'' self.quiet = 0 self.p_p = 0 self.outcount = 0 self.start = 1 self.space = 0 self.a = [] self.astack = [] self.acount = 0 self.list = [] self.blockquote = 0 self.pre = 0 self.startpre = 0 self.lastWasNL = 0 def outtextf(self, s): if type(s) is type(''): s = codecs.utf_8_decode(s)[0] self.outtext += s def close(self): sgmllib.SGMLParser.close(self) self.pbr() self.o('', 0, 'end') return self.outtext def handle_charref(self, c): self.o(charref(c)) def handle_entityref(self, c): self.o(entityref(c)) def unknown_starttag(self, tag, attrs): self.handle_tag(tag, attrs, 1) def unknown_endtag(self, tag): self.handle_tag(tag, None, 0) def previousIndex(self, attrs): """ returns the index of certain set of attributes (of a link) in the self.a list If the set of attributes is not found, returns None """ if not attrs.has_key('href'): return None i = -1 for a in self.a: i += 1 match = 0 if a.has_key('href') and a['href'] == attrs['href']: if a.has_key('title') or attrs.has_key('title'): if (a.has_key('title') and attrs.has_key('title') and a['title'] == attrs['title']): match = True else: match = True if match: return i def handle_tag(self, tag, attrs, start): attrs = fixattrs(attrs) if hn(tag): self.p() if start: self.o(hn(tag)*" " + ' ') if tag in ['p', 'div']: self.p() if tag == "br" and start: self.o(" \n") if tag == "hr" and start: self.p() self.o("* * *") self.p() if tag in ["head", "style", 'script']: if start: self.quiet += 1 else: self.quiet -= 1 if tag in ["body"]: self.quiet = 0 # sites like 9rules.com never close <head> if tag == "blockquote": if start: self.p(); self.o('> ', 0, 1); self.start = 1 self.blockquote += 1 else: self.blockquote -= 1 self.p() #if tag in ['em', 'i', 'u']: self.o("_") #if tag in ['strong', 'b']: self.o("**") if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` if tag == "a": if start: attrsD = {} for (x, y) in attrs: attrsD[x] = y attrs = attrsD self.astack.append(None) else: if self.astack: a = self.astack.pop() if a: i = self.previousIndex(a) if i is not None: a = self.a[i] else: self.acount += 1 a['count'] = self.acount a['outcount'] = self.outcount self.a.append(a) self.o("][" + `a['count']` + "]") if tag == "img" and start: attrsD = {} for (x, y) in attrs: attrsD[x] = y attrs = attrsD #alt = attrs.get('alt', '') #self.o(alt) if tag == 'dl' and start: self.p() if tag == 'dt' and not start: self.pbr() if tag == 'dd' and start: self.o(' ') if tag == 'dd' and not start: self.pbr() if tag in ["ol", "ul"]: if start: self.list.append({'name':tag, 'num':0}) else: if self.list: self.list.pop() self.p() if tag == 'li': if start: self.pbr() if self.list: li = self.list[-1] else: li = {'name':'ul', 'num':0} self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly. if li['name'] == "ul": self.o("* ") elif li['name'] == "ol": li['num'] += 1 self.o(`li['num']`+". ") self.start = 1 else: self.pbr() if tag in ["table", "tr"] and start: self.p() if tag == 'td': self.pbr() if tag == "pre": if start: self.startpre = 1 self.pre = 1 else: self.pre = 0 self.p() def pbr(self): if self.p_p == 0: self.p_p = 1 def p(self): self.p_p = 2 def o(self, data, puredata=0, force=0): if not self.quiet: if puredata and not self.pre: data = re.sub('\s+', ' ', data) if data and data[0] == ' ': self.space = 1 data = data[1:] if not data and not force: return if self.startpre: #self.out(" :") #TODO: not output when already one there self.startpre = 0 bq = (">" * self.blockquote) if not (force and data and data[0] == ">") and self.blockquote: bq += " " if self.pre: bq += " " data = data.replace("\n", "\n"+bq) if self.start: self.space = 0 self.p_p = 0 self.start = 0 if force == 'end': # It's the end. self.p_p = 0 self.out("\n") self.space = 0 if self.p_p: self.out(('\n'+bq)*self.p_p) self.space = 0 if self.space: if not self.lastWasNL: self.out(' ') self.space = 0 if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): if force == "end": self.out("\n") newa = [] for link in self.a: if self.outcount > link['outcount']: self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href if link.has_key('title'): self.out(" ("+link['title']+")") self.out("\n") else: newa.append(link) if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. self.a = newa self.p_p = 0 self.out(data) self.lastWasNL = data and data[-1] == '\n' self.outcount += 1 def handle_data(self, data): self.o(data, 1) def unknown_decl(self, data): passdef wrapwrite(text): sys.stdout.write(text.encode('utf8'))def html2text_file(html, out=wrapwrite): h = _html2text(out) h.feed(html) h.feed("") return h.close()def html2text(html): return optwrap(html2text_file(html, None))newLineRe = re.compile('((\r\r)|(\n\r\n\r)|(\r\n\r\n)|(\n\n))+', re.UNICODE)titleRe = re.compile(r'^\#\#\s*', re.UNICODE|re.MULTILINE)def xml2text(html): """ Turns xml into text """ from xml.dom import minidom # fix up entitydefs xml = [] lastEnd = 0 for m in re.finditer('&[^&;]+;', html): xml.append(html[lastEnd:m.start()]) defName = html[m.start()+1:m.end()-1] newName = htmlentitydefs.name2codepoint.get(defName, None) if newName is not None: xml.append('&#%s;' % newName) lastEnd = m.end() xml = u'<?xml version="1.0" encoding="utf-8"?><root>' + ''.join(xml) + u'</root>' dom = minidom.parseString(xml.encode('utf-8')) raise Exeception('ouch')def html2plainText(html): """ Returns plain text with no [] """ #text = xml2text(html) text = html2text(html) text = titleRe.sub('', text) text = newLineRe.sub('\r\n\r\n', text) return textif __name__ == "__main__": if sys.argv[1:]: arg = sys.argv[1] if arg.startswith('http://'): j = urllib.urlopen(arg) try: from feedparser import _getCharacterEncoding as enc except ImportError: enc = lambda x, y: ('utf-8', 1) text = j.read() encoding = enc(j.headers, text)[0] if encoding == 'us-ascii': encoding = 'utf-8' data = text.decode(encoding) else: data = open(arg, 'r').read() else: data = sys.stdin.read() wrapwrite(html2text(data))---------------------------------- 8< ----------------------------------Example usage: ---------------------------------- 8< ----------------------------------from cleveland.indexer import indexed, urlChangedclass Page(Entity): using_options(tablename='page') indexed(fields=['name', 'title', 'url'], htmlFields=['content', 'extra']) name = Field(Unicode(), required=True, unique=True) title = Field(Unicode(), required=True, default='') content = Field(Unicode()) extra = Field(Unicode()) url = Field(Unicode(), required=True, unique=True) visible = Field(Boolean(), required=True, default=False, index=True) lastModified = Field(DateTime(), required=True, default=datetime.now) templateName = Field(Unicode()) def __getExtraData__(self): return 'page %s' % self.id @before_update def urlUpdate(self): """ Removes the index entrys for the old urls """ if self._oldUrl is not None: urlChanged(self._oldUrl, self.url) for article in self.articles: urlChanged(self._oldUrl + '/' + article.name, article.url) self._oldUrl = None