elixir full text search engine p

public python v1 · immutable
#1462937
·published 2009-06-17 03:31 UTC
"""A full text indexer for elixir objects"""from __future__ import with_statementfrom datetime import datetimefrom elixir import Entity, Field, OneToMany, ManyToOnefrom elixir import Unicode, Integer, DateTimefrom elixir import using_options, using_table_optionsfrom elixir import Statement, metadataimport typesfrom cleveland.html2text import html2plainTextfrom elixir.entity import EntityMetafrom sqlalchemy.schema import UniqueConstraintfrom sqlalchemy.orm import MapperExtensionfrom sqlalchemy.orm.attributes import InstrumentedAttributefrom sqlalchemy.orm.properties import ColumnPropertyimport retagStripper = re.compile('<[^>]*>')wordSplitter = re.compile('[^a-z,A-Z,0-9]')# Global variablesindexable = {}# Classes to store the index data..class Word(Entity):    """    An indexible word    """    using_options(tablename='indexer_word')    word = Field(Unicode(), required=True, unique=True, index=True )    matches = OneToMany('Match')class Match(Entity):    """    Hooks together words to urls    """    using_options(tablename='indexer_match')    using_table_options(UniqueConstraint('word_id', 'url_id'))    word = ManyToOne('Word', ondelete='cascade', required=True)    url = ManyToOne('URL', ondelete='cascade', required=True)    count = Field(Integer(), required=True)class URL(Entity):    """    Holds urls    """    using_options(tablename='indexer_url')    url = Field(Unicode(), required=True, unique=True, index=True)    lastIndexed = Field(DateTime(), required=True, default=datetime.now)    extraData = Field(Unicode()) # Extra data to store against url. Eg. "tablename=person recordId=4"    matches = OneToMany('Match')# Helper functionsdef indexFields(cls):    """    Returns a list of field names for indexing.    Doesn't take into account the __index__ attribute.    """    result = []    for name, attr in cls.__dict__.items():        # Yes we want __class__ is, not isinstance        if attr.__class__ is InstrumentedAttribute:            if isinstance(attr.property, ColumnProperty):                col = getattr(cls, name).property.columns[0]                typeName = str(col.type)                if typeName.startswith('Unicode') or typeName.startswith('Text'):                    # Index all type names                    result.append(name)    return resultdef prepModule(module):    """    Searches for indexable objects in a module    """    for className, cls in module.__dict__.items():        if isinstance(cls, EntityMeta) and cls is not Entity:            cls.mapper.compile()            prepClass(cls)def prepClass(cls):    """    Preps a class if necessary    """    fieldsToIndex = indexFields(cls)    if hasattr(cls, '__indexFields__'):        cls.__indexFields__(fieldsToIndex)    if fieldsToIndex:        cls.__fieldsToIndex__ = fieldsToIndexdef indexAllClasses():    """    Indexes all the classes that we've found through prepModule    """    for cls, attrNames in indexable.items():        print 'indexing', cls         items = cls.query.all()        for item in items:            indexInstance(item, attrNames)def indexInstance(obj, attrNames=None, checkIfModified=False):    """    [Re]index an elixir instance    """    if not hasattr(obj, '__fieldsToIndex__'):        prepClass(obj.__class__)    url = obj.url.encode('utf=8')    if checkIfModified and not obj._sa_instance_state.modified:        return False    if hasattr(obj, '__getIndexPhrases__'):        phrases = obj.__getIndexPhrases__()    else:        if not attrNames:            attrNames = obj.__fieldsToIndex__        htmlAttrNames = obj.__htmlFieldsToIndex__        changedAttrNames = set(attrNames+htmlAttrNames)        if checkIfModified:            changedAttrNames -= obj._sa_instance_state.unmodified            if not changedAttrNames:                return False        phrases = []        for attrName in attrNames:            phrases.append(getattr(obj, attrName))        for attrName in htmlAttrNames:            text = html2plainText(getattr(obj, attrName) or '')            phrases.append(text)    if hasattr(obj, '__getExtraData__'):        extraData = obj.__getExtraData__()    else:        extraData = None    ourWords = {}    ourMatches = {}    for phrase in phrases:        if not phrase:            continue        if isinstance(phrase, str):            phrase = phrase.decode('utf-8')        phrase = tagStripper.sub('', phrase.lower())        words = wordSplitter.split(phrase)        for word in words:            word = wordSplitter.sub('', word)            if not word:                continue            # Add to our list            ourMatches[word] = ourMatches.get(word, 0)+1        # Now update the database        # (Re)create the urlId        urlIds = metadata.bind.text('SELECT id from indexer_url where url=:url').execute(url=url).fetchall()        urlId = urlIds and urlIds[0][0] or None        if urlId:            # TODO: Maybe just load all the previous matches and words into sets in memory and compare.. ?            # De-index object            deIndex(obj.url)        metadata.bind.text('insert into indexer_url (url, "extraData", "lastIndexed") values (:url, :extraData, now())').execute(            url=url, extraData=extraData)        urlIds = metadata.bind.text('select id from indexer_url where url=:url').execute(            url=url).fetchall()        urlId = urlIds and urlIds[0][0] or None        # Create all the words and matches        for word, count in ourMatches.items():            # Ensure word exists            existingWords = metadata.bind.text(                'select id from indexer_word where word=:word limit 1').execute(                word=word).fetchall()            if existingWords:                wordId = existingWords[0][0]            else:                # Insert the new word record if it doesn't exist                metadata.bind.text('insert into indexer_word (word) values (:word)').execute(word=word)                wordId = metadata.bind.text(                    'select id from indexer_word where word=:word limit 1').execute(                    word=word).fetchone()[0]            # Insert the new match record            metadata.bind.text(                'insert into indexer_match (word_id, url_id, count) '                'values (:wordId, :urlId, :count)').execute(                wordId=wordId, urlId=urlId, count=count)            matchInfo = metadata.bind.text(                'select id, count from indexer_match where '                'word_id=:wordId and url_id=:urlId limit 1').execute(                wordId=wordId, urlId=urlId).fetchone()def cleanUpOrphanWords(self):    """    Kills all orphan words    """    [w.delete() for w in Word.query.filter_by(matches=[])]def search(phrase):    """    Search for a url with pages    """    if isinstance(phrase, str):        phrase = phrase.decode('utf-8')    words = map(unicode.lower, wordSplitter.split(phrase))    urls = {}    for word in words:        wordObj = Word.query.filter_by(word=word).first()        if wordObj:            for match in wordObj.matches:                urls.setdefault(match.url.url, [0, match.url.extraData])                urls[match.url.url][0] += match.count    return sorted(urls.items(), reverse=True, key=lambda item: item[1][0])def deIndex(url):    """    Removes a url from the DB    """    # Get the url id    url = url.encode('utf-8')    urls = metadata.bind.text('select id from indexer_url where url=:url').execute(url=url).fetchall()    if not urls:        return    urlId = urls[0][0]    # Get all the matches    matches = metadata.bind.text('select id, word_id from indexer_match where url_id=:urlId').execute(urlId=urlId).fetchall()    for matchId, wordId in matches:        # See if we need to delete the word        words = metadata.bind.text('select distinct url_id from indexer_match where word_id=:wordId limit 2').execute(wordId=wordId).fetchall()        # If this is the only url with these words in it, delete the word records        if len(words) == 1:            metadata.bind.text('delete from indexer_word where id=:wordId').execute(wordId=wordId)    # Now delete all the match objects in one sweep    metadata.bind.text('delete from indexer_match where url_id=:urlId').execute(urlId=urlId)    # Finally delete the url    metadata.bind.text('delete from indexer_url where id=:urlId').execute(urlId=urlId)def urlChanged(oldUrl, newUrl):    """    To be called by objects when their url changes    """    metadata.bind.text('UPDATE indexer_url SET url=:newUrl WHERE url=:oldUrl').execute(newUrl=newUrl, oldUrl=oldUrl)    # metadata.bind.text('UPDATE indexer_url u SET url=:newUrl || substring(url from :start) WHERE url like(:oldUrl || "%")').execute(newUrl=newUrl, oldUrl=oldUrl, start=start)# Mapper extension to keep up indexingclass IndexerExtension(MapperExtension):    """    Keeps urls on objects up to date    """    def after_insert(self, mapper, connection, instance):        indexInstance(instance)    def after_update(self, mapper, connection, instance):        indexInstance(instance)        def before_delete(self, mapper, connection, instance):        instance.refresh()        deIndex(instance.url)indexerExtension = IndexerExtension()def makeEntityIndexable(entity, fields=None, htmlFields=None):    extensions = entity._descriptor.mapper_options.get('extension', [])    if type(extensions) is not types.ListType:         extensions = [extensions]    if indexerExtension not in extensions:        extensions.append(indexerExtension)        entity._descriptor.mapper_options['extension'] = extensions        # Get all the text based fields so we can index them        if fields is None:            fields = indexFields(entity)        if hasattr(entity, '__indexFields__'):            entity.__indexFields__(fields)        if fields:            entity.__fieldsToIndex__ = fields        if htmlFields:            entity.__htmlFieldsToIndex__ = htmlFieldsclass Indexed(object):    def __init__(self, entity, fields=None, htmlFields=None):        makeEntityIndexable(entity, fields, htmlFields)# An elixir statement to make an object indexedindexed = Statement(Indexed)---------------------------------- 8< ----------------------------------And the html2text bit (this could definitely do with some work):---------------------------------- 8< ----------------------------------#!/usr/bin/env python"""html2text: Turn HTML into equivalent Markdown-structured text."""__version__ = "2.3"__author__ = "Aaron Swartz (me@aaronsw.com)"__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]# TODO:#   Support decoded entities with unifiable.#   Relative URL resolutionif not hasattr(__builtins__, 'True'): True, False = 1, 0import re, sys, urllib, htmlentitydefs, codecsimport sgmllibsgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')try: from textwrap import wrapexcept: pass# Use Unicode characters instead of their ascii psuedo-replacementsUNICODE_SNOB = 1# Put the links after each paragraph instead of at the end.LINKS_EACH_PARAGRAPH = 1# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)BODY_WIDTH = 78# Don't show internal links (href="#local-anchor") -- corresponding link targets# won't be visible in the plain text file anyway.SKIP_INTERNAL_LINKS = True### Entity Nonsense ###def name2cp(k):    if k == 'apos': return ord("'")    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3        return htmlentitydefs.name2codepoint[k]    else:        k = htmlentitydefs.entitydefs[k]        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1        return ord(codecs.latin_1_decode(k)[0])unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*','ndash':'-', 'oelig':'oe', 'aelig':'ae','agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i','ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}unifiable_n = {}for k in unifiable.keys():    unifiable_n[name2cp(k)] = unifiable[k]def charref(name):    if name[0] in ['x','X']:        c = int(name[1:], 16)    else:        c = int(name)        if not UNICODE_SNOB and c in unifiable_n.keys():        return unifiable_n[c]    else:        return unichr(c)def entityref(c):    if not UNICODE_SNOB and c in unifiable.keys():        return unifiable[c]    else:        try: name2cp(c)        except KeyError: return "&" + c        else: return unichr(name2cp(c))def replaceEntities(s):    s = s.group(1)    if s[0] == "#":         return charref(s[1:])    else: return entityref(s)r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")def unescape(s):    return r_unescape.sub(replaceEntities, s)    def fixattrs(attrs):    # Fix bug in sgmllib.py    if not attrs: return attrs    newattrs = []    for attr in attrs:        newattrs.append((attr[0], unescape(attr[1])))    return newattrs### End Entity Nonsense ###def onlywhite(line):    """Return true if the line does only consist of whitespace characters."""    for c in line:        if c is not ' ' and c is not '  ':            return c is ' '    return linedef optwrap(text):    """Wrap all paragraphs in the provided text."""    if not BODY_WIDTH:        return text        assert wrap # Requires Python 2.3.    result = ''    newlines = 0    for para in text.split("\n"):        if len(para) > 0:            if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':                for line in wrap(para, BODY_WIDTH):                    result += line + "\n"                result += "\n"                newlines = 2            else:                if not onlywhite(para):                    result += para + "\n"                    newlines = 1        else:            if newlines < 2:                result += "\n"                newlines += 1    return resultdef hn(tag):    if tag[0] == 'h' and len(tag) == 2:        try:            n = int(tag[1])            if n in range(1, 10): return n        except ValueError: return 0class _html2text(sgmllib.SGMLParser):    def __init__(self, out=sys.stdout.write):        sgmllib.SGMLParser.__init__(self)                if out is None: self.out = self.outtextf        else: self.out = out        self.outtext = u''        self.quiet = 0        self.p_p = 0        self.outcount = 0        self.start = 1        self.space = 0        self.a = []        self.astack = []        self.acount = 0        self.list = []        self.blockquote = 0        self.pre = 0        self.startpre = 0        self.lastWasNL = 0        def outtextf(self, s):         if type(s) is type(''): s = codecs.utf_8_decode(s)[0]        self.outtext += s        def close(self):        sgmllib.SGMLParser.close(self)                self.pbr()        self.o('', 0, 'end')                return self.outtext            def handle_charref(self, c):        self.o(charref(c))    def handle_entityref(self, c):        self.o(entityref(c))                def unknown_starttag(self, tag, attrs):        self.handle_tag(tag, attrs, 1)        def unknown_endtag(self, tag):        self.handle_tag(tag, None, 0)            def previousIndex(self, attrs):        """ returns the index of certain set of attributes (of a link) in the            self.a list             If the set of attributes is not found, returns None        """        if not attrs.has_key('href'): return None                i = -1        for a in self.a:            i += 1            match = 0                        if a.has_key('href') and a['href'] == attrs['href']:                if a.has_key('title') or attrs.has_key('title'):                        if (a.has_key('title') and attrs.has_key('title') and                            a['title'] == attrs['title']):                            match = True                else:                    match = True            if match: return i    def handle_tag(self, tag, attrs, start):        attrs = fixattrs(attrs)            if hn(tag):            self.p()            if start: self.o(hn(tag)*" " + ' ')        if tag in ['p', 'div']: self.p()                if tag == "br" and start: self.o("  \n")        if tag == "hr" and start:            self.p()            self.o("* * *")            self.p()        if tag in ["head", "style", 'script']:             if start: self.quiet += 1            else: self.quiet -= 1        if tag in ["body"]:            self.quiet = 0 # sites like 9rules.com never close <head>                if tag == "blockquote":            if start:                 self.p(); self.o('> ', 0, 1); self.start = 1                self.blockquote += 1            else:                self.blockquote -= 1                self.p()                #if tag in ['em', 'i', 'u']: self.o("_")        #if tag in ['strong', 'b']: self.o("**")        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``                if tag == "a":            if start:                attrsD = {}                for (x, y) in attrs: attrsD[x] = y                attrs = attrsD                self.astack.append(None)            else:                if self.astack:                    a = self.astack.pop()                    if a:                        i = self.previousIndex(a)                        if i is not None:                            a = self.a[i]                        else:                            self.acount += 1                            a['count'] = self.acount                            a['outcount'] = self.outcount                            self.a.append(a)                        self.o("][" + `a['count']` + "]")                if tag == "img" and start:            attrsD = {}            for (x, y) in attrs: attrsD[x] = y            attrs = attrsD            #alt = attrs.get('alt', '')            #self.o(alt)                if tag == 'dl' and start: self.p()        if tag == 'dt' and not start: self.pbr()        if tag == 'dd' and start: self.o('    ')        if tag == 'dd' and not start: self.pbr()                if tag in ["ol", "ul"]:            if start:                self.list.append({'name':tag, 'num':0})            else:                if self.list: self.list.pop()                        self.p()                if tag == 'li':            if start:                self.pbr()                if self.list: li = self.list[-1]                else: li = {'name':'ul', 'num':0}                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.                if li['name'] == "ul": self.o("* ")                elif li['name'] == "ol":                    li['num'] += 1                    self.o(`li['num']`+". ")                self.start = 1            else:                self.pbr()                if tag in ["table", "tr"] and start: self.p()        if tag == 'td': self.pbr()                if tag == "pre":            if start:                self.startpre = 1                self.pre = 1            else:                self.pre = 0            self.p()                def pbr(self):        if self.p_p == 0: self.p_p = 1    def p(self): self.p_p = 2        def o(self, data, puredata=0, force=0):        if not self.quiet:             if puredata and not self.pre:                data = re.sub('\s+', ' ', data)                if data and data[0] == ' ':                    self.space = 1                    data = data[1:]            if not data and not force: return                        if self.startpre:                #self.out(" :") #TODO: not output when already one there                self.startpre = 0                        bq = (">" * self.blockquote)            if not (force and data and data[0] == ">") and self.blockquote: bq += " "                        if self.pre:                bq += "    "                data = data.replace("\n", "\n"+bq)                        if self.start:                self.space = 0                self.p_p = 0                self.start = 0            if force == 'end':                # It's the end.                self.p_p = 0                self.out("\n")                self.space = 0            if self.p_p:                self.out(('\n'+bq)*self.p_p)                self.space = 0                            if self.space:                if not self.lastWasNL: self.out(' ')                self.space = 0            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):                if force == "end": self.out("\n")                newa = []                for link in self.a:                    if self.outcount > link['outcount']:                        self.out("   ["+`link['count']`+"]: " + link['href']) #TODO: base href                        if link.has_key('title'): self.out(" ("+link['title']+")")                        self.out("\n")                    else:                        newa.append(link)                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.                self.a = newa            self.p_p = 0            self.out(data)            self.lastWasNL = data and data[-1] == '\n'            self.outcount += 1    def handle_data(self, data):        self.o(data, 1)        def unknown_decl(self, data): passdef wrapwrite(text): sys.stdout.write(text.encode('utf8'))def html2text_file(html, out=wrapwrite):    h = _html2text(out)    h.feed(html)    h.feed("")    return h.close()def html2text(html):    return optwrap(html2text_file(html, None))newLineRe = re.compile('((\r\r)|(\n\r\n\r)|(\r\n\r\n)|(\n\n))+', re.UNICODE)titleRe = re.compile(r'^\#\#\s*', re.UNICODE|re.MULTILINE)def xml2text(html):    """    Turns xml into text    """    from xml.dom import minidom    # fix up entitydefs    xml = []    lastEnd = 0    for m in re.finditer('&[^&;]+;', html):        xml.append(html[lastEnd:m.start()])        defName = html[m.start()+1:m.end()-1]        newName = htmlentitydefs.name2codepoint.get(defName, None)        if newName is not None:            xml.append('&#%s;' % newName)        lastEnd = m.end()    xml = u'<?xml version="1.0" encoding="utf-8"?><root>' + ''.join(xml) + u'</root>'    dom = minidom.parseString(xml.encode('utf-8'))    raise Exeception('ouch')    def html2plainText(html):    """    Returns plain text with no []    """    #text = xml2text(html)    text = html2text(html)    text = titleRe.sub('', text)    text = newLineRe.sub('\r\n\r\n', text)    return textif __name__ == "__main__":    if sys.argv[1:]:        arg = sys.argv[1]        if arg.startswith('http://'):            j = urllib.urlopen(arg)            try:                from feedparser import _getCharacterEncoding as enc            except ImportError:                enc = lambda x, y: ('utf-8', 1)            text = j.read()            encoding = enc(j.headers, text)[0]            if encoding == 'us-ascii': encoding = 'utf-8'            data = text.decode(encoding)        else:            data = open(arg, 'r').read()    else:        data = sys.stdin.read()    wrapwrite(html2text(data))---------------------------------- 8< ----------------------------------Example usage: ---------------------------------- 8< ----------------------------------from cleveland.indexer import indexed, urlChangedclass Page(Entity):    using_options(tablename='page')    indexed(fields=['name', 'title', 'url'], htmlFields=['content', 'extra'])    name = Field(Unicode(), required=True, unique=True)    title = Field(Unicode(), required=True, default='')    content = Field(Unicode())    extra = Field(Unicode())    url = Field(Unicode(), required=True, unique=True)    visible = Field(Boolean(), required=True, default=False, index=True)    lastModified = Field(DateTime(), required=True, default=datetime.now)    templateName = Field(Unicode())    def __getExtraData__(self): return 'page %s' % self.id    @before_update    def urlUpdate(self):        """        Removes the index entrys for the old urls        """        if self._oldUrl is not None:            urlChanged(self._oldUrl, self.url)            for article in self.articles:                urlChanged(self._oldUrl + '/' + article.name, article.url)        self._oldUrl = None