#!/usr/bin/pythonfrom itertools import combinationsfrom transaction_generator import *from blist import sortedsetimport pprintset = sortedsetNUM_ITEMS = 10ITEMSET = range(1,NUM_ITEMS)def cover(X, D): S = set([]) for tid in D: if X <= set(D[tid]): S.add(tid) return Sdef support(X, D): return len(cover(X, D))def frequency(X, D): return support(X, D) / len(D)def is_frequent(X, D, sigma_rel): return frequency(X, D) >= sigma_relclass ItemSet(): def __init__(self, item_set, support=0): self.item_set = item_set self.support = support def __repr__(self): return "%s" %(self.item_set,) #return "%s --> %s" %(self.item_set, self.support)def frequent_itemsets(D, sigma_rel): """ Apriori algorithm by agrawal et al for frequent itemsets mining Input: Data Dict and the minimum threshold sigma_rel (0 <= sigma_rel <=1) Output: Set of frequent itemsets """ C = [] # candidate itemset - list of "Itemset"s C.append([]) C.append([]) #[C.append(ItemSet(set(j), 0)) for j in I] C[1] = [ItemSet(set([j]), 0) for j in ITEMSET] k = 1 Freq = [] G = {} while C[k]: print k C.append([]) for tid in D: for X in C[k]: if X.item_set <= set(D[tid]): X.support += 1 F=[] # Frequent itemsets, the return value for X in C[k]: if float(X.support) / len(D) >= sigma_rel: F.append(X) G[X.__repr__()] = X Freq.append(X) # optimisation to determine the candidate set for the next iteration if k == 1: for a,b in combinations(F,2): W = ItemSet((a.item_set | b.item_set),0) C[2].append(W) if k > 1: for x,y in combinations(F, 2): m = x.item_set n = y.item_set M = list(m) N = list(n) if (M[0:k-1] == N[0:k-1]) and (M[k-1] < N[k-1]): W = ItemSet((m | set([list(n)[k-1]])), 0) br = False for J in combinations(W.item_set, len(m)): F1 = [i.item_set for i in F] if set(J) not in F1: br = True break; if not br: C[k+1].append(W) k+=1 return Gdef apriori(D, sigma_rel, gamma): F = frequent_itemsets(generate(1000, NUM_ITEMS), 0.2) R = [] for I in F.values(): #print "I:",I C=[] C.append([]) C.append([]) C[1] = [ItemSet(set([j]), 0) for j in I.item_set] k = 1 while C[k]: H = [] C.append([]) for X in C[k]: #print "X:",X try: #if (I.support / F[ItemSet(I.item_set-X.item_set).__repr__()].support) >= gamma: #print "In F:::::::", F[X.__repr__()], type(F[X.__repr__()]), F[X.__repr__()].support #print "I support:",I.support """ Here, we are assuming: (X ==> I\X), so we do: supp(X + I\X) / supp(X) :==: supp(I)/supp(X) The reason to do this was X starts from C[1] and increases with k and will never be null and as we dont have a null sortedset in the frequent_itemsets we cannot determine the value of a supoort of a null set. It does'nt make sense. So we have to make sure that in A ==> B, A != empty set """ if (float(I.support) / F[X.__repr__()].support) >= gamma: H.append(X) #H.append([X,I]) if len(I.item_set - X.item_set) != 0: R.append([X, " ==>", I.item_set-X.item_set]) except KeyError: print "Keyerror " continue #print "H: ",H if k == 1: for a,b in combinations(H,2): W = ItemSet((a.item_set | b.item_set),0) C[2].append(W) if k > 1: for x,y in combinations(H, 2): m = x.item_set n = y.item_set M = list(m) N = list(n) if (M[0:k-1] == N[0:k-1]) and (M[k-1] < N[k-1]): W = ItemSet((m | set([list(n)[k-1]])), 0) br = False for J in combinations(W.item_set, len(m)): F1 = [i.item_set for i in H] if set(J) not in F1: br = True break; if not br: C[k+1].append(W) k += 1 return RR = apriori(generate(1000, NUM_ITEMS), 0.9, 0.9)print "R:"pprint.pprint(R)