All pastes #2131834 Raw Edit

Something

public python v1 · immutable
#2131834 ·published 2012-03-24 18:46 UTC
rendered paste body
#!/usr/bin/pythonfrom itertools import combinationsfrom transaction_generator import *from blist import sortedsetimport pprintset = sortedsetNUM_ITEMS = 10ITEMSET = range(1,NUM_ITEMS)def cover(X, D):    S = set([])    for tid in D:        if X <= set(D[tid]):            S.add(tid)    return Sdef support(X, D):    return len(cover(X, D))def frequency(X, D):    return support(X, D) / len(D)def is_frequent(X, D, sigma_rel):    return frequency(X, D) >= sigma_relclass ItemSet():    def __init__(self, item_set, support=0):        self.item_set = item_set        self.support = support    def __repr__(self):        return "%s" %(self.item_set,)        #return "%s --> %s" %(self.item_set, self.support)def frequent_itemsets(D, sigma_rel):    """    Apriori algorithm by agrawal et al for frequent itemsets mining    Input: Data Dict and the minimum threshold sigma_rel (0 <= sigma_rel <=1)    Output: Set of frequent itemsets    """    C = [] # candidate itemset - list of "Itemset"s    C.append([])    C.append([])    #[C.append(ItemSet(set(j), 0)) for j in I]    C[1] = [ItemSet(set([j]), 0) for j in ITEMSET]    k = 1    Freq = []    G = {}    while C[k]:        print k        C.append([])        for tid in D:            for X in C[k]:                if X.item_set <= set(D[tid]):                    X.support += 1        F=[] # Frequent itemsets, the return value        for X in C[k]:            if float(X.support) / len(D) >= sigma_rel:                F.append(X)                G[X.__repr__()] = X                Freq.append(X)        # optimisation to determine the candidate set for the next iteration        if k == 1:            for a,b in combinations(F,2):                W = ItemSet((a.item_set | b.item_set),0)                C[2].append(W)        if k > 1:            for x,y in combinations(F, 2):                m = x.item_set                n = y.item_set                M = list(m)                N = list(n)                if (M[0:k-1] == N[0:k-1]) and (M[k-1] < N[k-1]):                    W = ItemSet((m | set([list(n)[k-1]])), 0)                    br = False                    for J in combinations(W.item_set, len(m)):                        F1 = [i.item_set for i in F]                        if set(J) not in F1:                            br = True                            break;                    if not br:                        C[k+1].append(W)        k+=1    return Gdef apriori(D, sigma_rel, gamma):    F = frequent_itemsets(generate(1000, NUM_ITEMS), 0.2)    R = []    for I in F.values():        #print "I:",I        C=[]        C.append([])        C.append([])        C[1] = [ItemSet(set([j]), 0) for j in I.item_set]        k = 1        while C[k]:            H = []            C.append([])            for X in C[k]:                #print "X:",X                try:                    #if (I.support / F[ItemSet(I.item_set-X.item_set).__repr__()].support) >= gamma:                    #print "In F:::::::", F[X.__repr__()], type(F[X.__repr__()]), F[X.__repr__()].support                    #print "I support:",I.support                    """                    Here, we are assuming: (X ==> I\X), so we do: supp(X + I\X) / supp(X) :==: supp(I)/supp(X)                    The reason to do this was X starts from C[1] and increases with k and will never be null and as we                    dont have a null sortedset in the frequent_itemsets we cannot determine the value of a supoort                    of a null set. It does'nt make sense. So we have to make sure that in A ==> B, A != empty set                    """                    if (float(I.support) / F[X.__repr__()].support) >= gamma:                        H.append(X)                        #H.append([X,I])                        if len(I.item_set - X.item_set) != 0:                            R.append([X, " ==>", I.item_set-X.item_set])                except KeyError:                    print "Keyerror "                    continue            #print "H: ",H            if k == 1:                for a,b in combinations(H,2):                    W = ItemSet((a.item_set | b.item_set),0)                    C[2].append(W)            if k > 1:                for x,y in combinations(H, 2):                    m = x.item_set                    n = y.item_set                    M = list(m)                    N = list(n)                    if (M[0:k-1] == N[0:k-1]) and (M[k-1] < N[k-1]):                        W = ItemSet((m | set([list(n)[k-1]])), 0)                        br = False                        for J in combinations(W.item_set, len(m)):                            F1 = [i.item_set for i in H]                            if set(J) not in F1:                                br = True                                break;                        if not br:                            C[k+1].append(W)            k += 1    return RR = apriori(generate(1000, NUM_ITEMS), 0.9, 0.9)print "R:"pprint.pprint(R)