Association analysis with the Apriori algorithm
Apriori 는 아이템간의 연관규칙을 찾기 위한 알고리즘이다. 흔히 알려진 예로, 맥주와 기저귀의 관계가 있다. 나이브하게 계산하면 경우의 수가 지수적으로 증가하는데 Apriori 는 계산할 필요가 없는 아이템을 빠르게 제거할 수 있는 길을 제시한다.
Apriori principle
If a set S is frequent, all subsets of S are frequent. Contraposition: If a set S is infrequent, its supersets are infrequent.
… 천잰데?
- Support of an itemset: the fraction of transactions that contain the itemset
- Confidence of $A \rightarrow B$: $\frac{Support(A \land B)}{Support(A)}$
Algorithm
위키피디아에서는 이렇게 설명하고 있는데…
Finding frequent itemsets with the Apriori algorithm
from collections import defaultdict
def loadDataSet():
    """Return the toy transaction database used throughout this notebook."""
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
def createC1(dataSet):
    """Build C1: the sorted list of candidate 1-itemsets as frozensets.

    frozenset (not set) because the candidates are later used as dict keys
    in scanD's support table.
    """
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    # Return a concrete list rather than map(frozenset, C1): under Python 3
    # map() is a one-shot iterator, and scanD iterates the candidates once
    # per transaction, so an iterator would be exhausted after the first
    # transaction and every later count would silently be zero.
    return [frozenset(candidate) for candidate in C1]
def scanD(D, Ck, minSupport):
    """Count candidate itemsets Ck over transactions D and filter by support.

    D          -- list of transactions, each a set of items
    Ck         -- list of candidate k-itemsets (frozensets)
    minSupport -- minimum support threshold (fraction of transactions)

    Returns (retList, supportData): the candidates meeting minSupport, and a
    dict mapping EVERY candidate (kept or not) to its support fraction.
    """
    ssCnt = defaultdict(int)
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] += 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    # .items() instead of the Python-2-only .iteritems(); works on both 2 and 3.
    for key, cnt in ssCnt.items():
        support = cnt / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData
dataSet = loadDataSet()
print(dataSet)
# Materialize both as lists: under Python 3, map() (and a generator-based
# createC1) returns a one-shot iterator, which would break len(D) inside
# scanD and the repeated iteration over the candidates.
C1 = list(createC1(dataSet))
print(C1)
D = list(map(set, dataSet))
print(D)
L1, supportData0 = scanD(D, C1, 0.5)
print(L1)
[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
[frozenset([1]), frozenset([2]), frozenset([3]), frozenset([4]), frozenset([5])]
[set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])]
[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]
def aprioriGen(Lk, k_plus_1):  # create C_{k+1} from the frequent k-itemsets Lk
    """Join step: merge pairs of k-itemsets that share their first k-1 elements.

    Two k-itemsets produce a (k+1)-itemset candidate iff their sorted first
    k-1 elements agree — this generates each candidate exactly once.

    BUG FIX vs. the book: the original sliced list(frozenset)[:k-1] BEFORE
    sorting. Set iteration order is arbitrary, so the prefix comparison could
    merge the wrong pairs or miss valid candidates. Sort first, then slice.
    """
    retList = []
    for i, Lk_i in enumerate(Lk):
        for Lk_j in Lk[i + 1:]:
            prefix_i = sorted(Lk_i)[:k_plus_1 - 2]
            prefix_j = sorted(Lk_j)[:k_plus_1 - 2]
            if prefix_i == prefix_j:
                retList.append(Lk_i | Lk_j)
    return retList
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of every size in dataSet.

    Returns (L, supportData):
      L           -- list of levels; L[0] holds frequent 1-itemsets, L[1]
                     frequent 2-itemsets, ... (last level is empty)
      supportData -- dict mapping each scanned candidate to its support
    """
    # list() both: under Python 3 map()/iterators are one-shot, but D is
    # re-scanned once per level and scanD needs len(D).
    C1 = list(createC1(dataSet))
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    # k is the size of the itemsets generated next; L[k-2] holds the
    # frequent (k-1)-itemsets (L[0] is L1).
    k = 2
    while len(L[k - 2]) > 0:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
# Run the full Apriori pass and show the frequent itemsets level by level.
L, supportData = apriori(dataSet)
print(L)
for idx, level in enumerate(L, start=1):
    print("L%d = %s" % (idx, level))
aprioriGen(L[0], 2)
[[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])], []]
L1 = [frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]
L2 = [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])]
L3 = [frozenset([2, 3, 5])]
L4 = []
[frozenset({1, 3}),
frozenset({1, 2}),
frozenset({1, 5}),
frozenset({2, 3}),
frozenset({3, 5}),
frozenset({2, 5})]
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets in L.

    L           -- levels of frequent itemsets as returned by apriori()
    supportData -- support fractions as returned by apriori()
    minConf     -- minimum confidence for a rule to be kept

    Returns a list of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList = []
    # L[0] holds 1-itemsets, which cannot form a rule, so start from L[1].
    for i, itemsets in enumerate(L[1:]):
        for freqSet in itemsets:
            H1 = [frozenset([item]) for item in freqSet]
            if i == 0:
                # 2-itemsets: only single-item consequents are possible.
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate candidate consequents H for rules (freqSet - h) --> h.

    Appends rules meeting minConf to brl (mutated in place) and returns the
    list of consequents that survived the confidence filter.
    """
    prunedH = []
    for conseq in H:
        # confidence(A --> B) = support(A u B) / support(A)
        conf = supportData[freqSet] / supportData[freqSet - conseq]
        if conf >= minConf:
            # print() call instead of the Python-2-only print statement,
            # consistent with the print() calls elsewhere in this notebook.
            print(freqSet - conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively build rules from freqSet with consequents of growing size.

    BUG FIX (the question raised in the notes below): the book's version never
    passes H itself to calcConf — it jumps straight to consequents of size
    m+1 — so rules with a single-item consequent are never emitted for
    itemsets of size >= 3. Evaluate the current consequents first, then merge
    only the survivors (Apriori pruning applies to confidence as well).
    """
    m = len(H[0])
    if len(freqSet) > m:
        # Score the size-m consequents; keep only those meeting minConf.
        Hmp1 = calcConf(freqSet, H, supportData, brl, minConf)
        if len(Hmp1) > 1:
            # Merge surviving consequents into size m+1 and recurse.
            Hmp1 = aprioriGen(Hmp1, m + 1)
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
rules = generateRules(L, supportData, minConf=0.5)
frozenset([3]) --> frozenset([1]) conf: 0.666666666667
frozenset([1]) --> frozenset([3]) conf: 1.0
frozenset([5]) --> frozenset([2]) conf: 1.0
frozenset([2]) --> frozenset([5]) conf: 1.0
frozenset([3]) --> frozenset([2]) conf: 0.666666666667
frozenset([2]) --> frozenset([3]) conf: 0.666666666667
frozenset([5]) --> frozenset([3]) conf: 0.666666666667
frozenset([3]) --> frozenset([5]) conf: 0.666666666667
frozenset([5]) --> frozenset([2, 3]) conf: 0.666666666667
frozenset([3]) --> frozenset([2, 5]) conf: 0.666666666667
frozenset([2]) --> frozenset([3, 5]) conf: 0.666666666667
- 이 코드 좀 이상한것 같은데…? 착각인가?
L1 = list(Lk_i)[:k_plus_1-2]
L2 = list(Lk_j)[:k_plus_1-2]
L1.sort()
L2.sort()
if L1==L2:
retList.append(Lk_i | Lk_j)
- 책의 rulesFromConseq 에서 m=1 인 케이스가 처리 되나?
Another implementation
책의 구현은 너무 길고 복잡하고 읽기 어렵다. 더 짧고 간결한 구현이나 라이브러리를 한번 알아봐야겠다.
- http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
- http://codereview.stackexchange.com/questions/38101/optimizing-apriori-algorithm-python-pandas (wook)
- https://pypi.python.org/pypi/apriori
Interpretation about: $(A \rightarrow B \land C) \rightarrow (A \land B \rightarrow C) $
xenosoz 군의 진리표를 이용한 해석
from itertools import product
from ipy_table import *
# Truth table for (A -> B AND C) -> (A AND B -> C), using p -> q == (not p) or q.
table = [('A', 'B', 'C', 'A→B∧C', 'A∧B→C', '(A→B∧C)→(A∧B→C)')]
for A, B, C in product([True, False], repeat=3):
    left = (not A) or (B and C)
    right = (not (A and B)) or C
    l_r = (not left) or right
    table.append([A, B, C, left, right, l_r])
make_table(table)
A | B | C | A→B∧C | A∧B→C | (A→B∧C)→(A∧B→C) |
True | True | True | True | True | True |
True | True | False | False | False | True |
True | False | True | False | True | True |
True | False | False | False | True | True |
False | True | True | True | True | True |
False | True | False | True | True | True |
False | False | True | True | True | True |
False | False | False | True | True | True |
A, B, C 값에 상관없이 $(A \rightarrow B \land C) \rightarrow (A \land B \rightarrow C) $ 이건 항상 맞는 말임.
N 이 많아지면?
답이 없음. 그래서 이 다음 장에 나오는 FP Growth 라는걸 쓴다고 하는데 … (CH12에 계속)
Comments