1. 程式人生 > >R_Studio(關聯)對Groceries資料集進行關聯分析

R_Studio(關聯)對Groceries資料集進行關聯分析

     

 

  RGui的arules程式包裡含有Groceries資料集,該資料集是某個雜貨店一個月真實的交易記錄,共有9835條消費記錄,169個商品

 

 

#install.packages("arules")
library(arules)
setwd('D:\\data') 

#讀入資料
#Groceries資料集
Groceries
groceries<-read.transactions("groceries.txt",format="basket",sep=",")

#檢視groceries中的資料
summary(groceries) class(groceries) groceries dim(groceries) colnames(groceries)[1:5] #rownames(groceries)[1:5] basketSize<-size(groceries) summary(basketSize) sum(basketSize) #size函式和itemFrequency函式都是arules包中的函式,前者是為了計算購物籃裡商品數量,後者是為了計算每種商品的支援度 itemFreq<-itemFrequency(groceries) itemFreq[1:5] sum(itemFreq) itemCount
<-(itemFreq/sum(itemFreq))*sum(basketSize) summary(itemCount) #按支援度itemFrequency排序,檢視支援度的最大值 orderedItem<-sort(itemCount,decreasing=T) orderedItem[1:10] orderedItemFreq<-sort(itemFrequency(groceries),decreasing=T) orderedItemFreq[1:10] #切除第100行到800行,計算第1列到第3列的支援度 itemFrequency(groceries[100:800,1:3])
#itemFrequencyPlot 畫頻繁項的圖 #按最小支援度檢視 itemFrequencyPlot(groceries,support=0.1) #按照排序檢視 itemFrequencyPlot(groceries,topN=10,horiz=T) #只關心購買兩件商品以上的交易 groceries_use<-groceries[basketSize>1] dim(groceries_use) inspect(groceries[1:5]) #一個點代表在某個transaction上購買了item。 image(groceries[1:10]) #當資料集很大的時候,這張稀疏矩陣圖是很難展現的,一般可以用sample函式進行取樣顯示 image(sample(groceries,100)) groceryrules<-apriori(groceries,parameter=list(support=0.03,confidence=0.25,minlen=2)) summary(groceryrules) #inspect檢視具體的規則 inspect(groceryrules[1:5]) inspect(groceryrules) #按照某種度量,對規則進行排序。 ordered_groceryrules<-sort(groceryrules,by="lift") inspect(ordered_groceryrules[1:5]) yogurtrules<-subset(groceryrules,items%in%c("yogurt")) inspect(yogurtrules) fruitrules<-subset(groceryrules,items%pin%c("fruit")) inspect(fruitrules) byrules<-subset(groceryrules,items%ain%c("berries","yogurt")) inspect(byrules) fruitrules<-subset(groceryrules,items%pin%c("fruit")&lift>2) inspect(fruitrules) berriesInLHS<-apriori(groceries,parameter=list(support=0.001,confidence=0.1),appearance=list(lhs=c("berries"),default="rhs")) summary(berriesInLHS) inspect(berriesInLHS) inspect(head(rhs(berriesInLHS),n=5)) berrySub<-subset(berriesInLHS,subset=!(rhs%in%c("root vegetables","whole milk"))) inspect(head(rhs(sort(berrySub,by="confidence")),n=5)) write(groceryrules,file="groceryrules.csv",sep=",",quote=TRUE,row.names=FALSE) groceryrules_df<-as(groceryrules,"data.frame") str(groceryrules_df) data(Groceries) summary(Groceries) print(levels(itemInfo(Groceries)[["level1"]])) print(levels(itemInfo(Groceries)[["level2"]])) inspect(Groceries[1:3]) groceries=aggregate(Groceries,itemInfo(Groceries)[["level2"]]) inspect(groceries[1:3]) itemFrequencyPlot(Groceries,support=0.025,cex.names=0.8,xlim=c(0,0.3), type="relative",horiz=TRUE,col="darkred",las=1, xlab=paste("ProportionofMarketBasketsContainingItem", "\n(ItemRelativeFrequencyorSupport)")) second.rules<-apriori(groceries,parameter=list(support=0.025,confidence=0.05)) print(summary(second.rules)) install.packages("RColorBrewer") install.packages("arulesViz") #library(RColorBrewer) #library(arulesViz) inspect(second.rules) plot(second.rules,control=list(jitter=2,col=rev(brewer.pal(9,"Greens")[4:9])),shading="lift") plot(second.rules,measure="confidence",method="graph",control=list(type="items"),shading="lift") plot(second.rules,method="grouped",control=list(col=rev(brewer.pal(9,"Greens")[4:9]))) groceryrules.eclat<-eclat(groceries,parameter=list(support=0.05,minlen=2)) summary(groceryrules.eclat) inspect(groceryrules.eclat)
Gary.R

 

 

一. 載入資料集

  檢視groceries中的資料

> summary(groceries)
transactions as itemMatrix in sparse format with
 9835 rows (elements/itemsets/transactions) and
 169 columns (items) and a density of 0.02609146 

most frequent items:
      whole milk other vegetables       rolls/buns             soda           yogurt          (Other) 
            2513             1903             1809             1715             1372            34055 

element (itemset/transaction) length distribution:
sizes
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22   23 
2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46   29   14   14    9   11    4    6 
  24   26   27   28   29   32 
   1    1    1    1    3    1 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   3.000   4.409   6.000  32.000 

includes extended item information - examples:
            labels
1 abrasive cleaner
2 artif. sweetener
3   baby cosmetics
> class(groceries)
[1] "transactions"
attr(,"package")
[1] "arules"
> groceries
transactions in sparse format with
 9835 transactions (rows) and
 169 items (columns)
> dim(groceries)
[1] 9835  169

 

二. 對資料集進行處理分析

  對groceries中的資料進行統計

> colnames(groceries)[1:5]
[1] "abrasive cleaner" "artif. sweetener" "baby cosmetics"   "baby food"        "bags"            
> #rownames(groceries)[1:5]
> basketSize<-size(groceries)
> summary(basketSize)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   3.000   4.409   6.000  32.000 
> sum(basketSize)
[1] 43367

 

  統計groceries資料中的支援度

> itemFreq<-itemFrequency(groceries)
> itemFreq[1:5]
abrasive cleaner artif. sweetener   baby cosmetics        baby food             bags 
    0.0035587189     0.0032536858     0.0006100661     0.0001016777     0.0004067107 
> sum(itemFreq)
[1] 4.409456        #代表"平均一個transaction購買的item個數"
#檢視basketSize的分佈:密度曲線(TO ADD HERE)   > itemCount<-(itemFreq/sum(itemFreq))*sum(basketSize) > summary(itemCount) Min. 1st Qu. Median Mean 3rd Qu. Max. 1.0 38.0 103.0 256.6 305.0 2513.0

 

  按支援度itemFrequency排序,檢視支援度的最大值

> orderedItem<-sort(itemCount,decreasing=T)
> orderedItem[1:10]
      whole milk other vegetables       rolls/buns             soda           yogurt    bottled water 
            2513             1903             1809             1715             1372             1087 
 root vegetables   tropical fruit    shopping bags          sausage 
            1072             1032              969              924 
> orderedItemFreq<-sort(itemFrequency(groceries),decreasing=T)
> orderedItemFreq[1:10]
      whole milk other vegetables       rolls/buns             soda           yogurt    bottled water 
      0.25551601       0.19349263       0.18393493       0.17437722       0.13950178       0.11052364 
 root vegetables   tropical fruit    shopping bags          sausage 
      0.10899847       0.10493137       0.09852567       0.09395018 
#切除第100行到800行,計算第1列到第3列的支援度
> itemFrequency(groceries[100:800,1:3])
abrasive cleaner artif. sweetener   baby cosmetics 
     0.005706134      0.001426534      0.001426534 

 

  使用itemFrequencyPlot 畫頻繁項的圖

#按最小支援度檢視
itemFrequencyPlot(groceries,support=0.1)

  

 

#按照排序檢視
itemFrequencyPlot(groceries,topN=10,horiz=T)

  

 

 

 

  根據業務對資料集進行過濾,獲得進一步規則挖掘的資料集

 

> #只關心購買兩件商品以上的交易
> groceries_use<-groceries[basketSize>1]
> dim(groceries_use)
[1] 7676  169

 

  通過圖形更直觀觀測資料的稀疏情況

> inspect(groceries[1:5])
    items                                                                
[1] {citrus fruit,margarine,ready soups,semi-finished bread}             
[2] {coffee,tropical fruit,yogurt}                                       
[3] {whole milk}                                                         
[4] {cream cheese,meat spreads,pip fruit,yogurt}                         
[5] {condensed milk,long life bakery product,other vegetables,whole milk}
#一個點代表在某個transaction上購買了item。
> image(groceries[1:10])

 

 

  

 

 

#當資料集很大的時候,這張稀疏矩陣圖是很難展現的,一般可以用sample函式進行取樣顯示
image(sample(groceries,100))

 

  

 

三、對資料集進行規則挖掘

 

apriori函式

  

 

 

> summary(groceryrules)
set of 15 rules

rule length distribution (lhs + rhs):sizes
 2 
15 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      2       2       2       2       2       2 

summary of quality measures:
    support          confidence          lift           count      
 Min.   :0.03010   Min.   :0.2929   Min.   :1.205   Min.   :296.0  
 1st Qu.:0.03274   1st Qu.:0.3185   1st Qu.:1.488   1st Qu.:322.0  
 Median :0.04230   Median :0.3737   Median :1.572   Median :416.0    中位數:0.04230中位數:0.3737中位數:1.572中位數:416
 Mean   :0.04475   Mean   :0.3704   Mean   :1.598   Mean   :440.1  
 3rd Qu.:0.05247   3rd Qu.:0.4024   3rd Qu.:1.758   3rd Qu.:516.0  
 Max.   :0.07483   Max.   :0.4496   Max.   :2.247   Max.   :736.0    Max.:0.07483個最大值:0.4496個最大值:2.247個最大值:736

mining info:
      data ntransactions support confidence
 groceries          9835    0.03       0.25

 

> #inspect檢視具體的規則
> inspect(groceryrules[1:5])
    lhs                     rhs          support    confidence lift     count
[1] {whipped/sour cream} => {whole milk} 0.03223183 0.4496454  1.759754 317  
[2] {pip fruit}          => {whole milk} 0.03009659 0.3978495  1.557043 296  
[3] {pastry}             => {whole milk} 0.03324860 0.3737143  1.462587 327  
[4] {citrus fruit}       => {whole milk} 0.03050330 0.3685504  1.442377 300  
[5] {sausage}            => {rolls/buns} 0.03060498 0.3257576  1.771048 301  
> inspect(groceryrules)
     lhs                     rhs                support    confidence lift     count
[1]  {whipped/sour cream} => {whole milk}       0.03223183 0.4496454  1.759754 317  
[2]  {pip fruit}          => {whole milk}       0.03009659 0.3978495  1.557043 296  
[3]  {pastry}             => {whole milk}       0.03324860 0.3737143  1.462587 327  
[4]  {citrus fruit}       => {whole milk}       0.03050330 0.3685504  1.442377 300  
[5]  {sausage}            => {rolls/buns}       0.03060498 0.3257576  1.771048 301  
[6]  {bottled water}      => {whole milk}       0.03436706 0.3109476  1.216940 338  
[7]  {tropical fruit}     => {other vegetables} 0.03589222 0.3420543  1.767790 353  
[8]  {tropical fruit}     => {whole milk}       0.04229792 0.4031008  1.577595 416  
[9]  {root vegetables}    => {other vegetables} 0.04738180 0.4347015  2.246605 466  
[10] {root vegetables}    => {whole milk}       0.04890696 0.4486940  1.756031 481  
[11] {yogurt}             => {other vegetables} 0.04341637 0.3112245  1.608457 427  
[12] {yogurt}             => {whole milk}       0.05602440 0.4016035  1.571735 551  
[13] {rolls/buns}         => {whole milk}       0.05663447 0.3079049  1.205032 557  
[14] {other vegetables}   => {whole milk}       0.07483477 0.3867578  1.513634 736  
[15] {whole milk}         => {other vegetables} 0.07483477 0.2928770  1.513634 736  

 

 

四. 對資料集進行評估規則

 

規則可以劃分為3大類:

  • Actionable
    • 這些rule提供了非常清晰、有用的洞察,可以直接應用在業務上。
  • Trivial
    • 這些rule顯而易見,很清晰但是沒啥用。屬於common sense,如 {尿布} => {嬰兒食品}。
  • Inexplicable
    • 這些rule是不清晰的,難以解釋,需要額外的研究來判定是否是有用的rule。

 

> #按照某種度量,對規則進行排序。
> ordered_groceryrules<-sort(groceryrules,by="lift")
> inspect(ordered_groceryrules[1:5])
    lhs                     rhs                support    confidence lift     count
[1] {root vegetables}    => {other vegetables} 0.04738180 0.4347015  2.246605 466  
[2] {sausage}            => {rolls/buns}       0.03060498 0.3257576  1.771048 301  
[3] {tropical fruit}     => {other vegetables} 0.03589222 0.3420543  1.767790 353  
[4] {whipped/sour cream} => {whole milk}       0.03223183 0.4496454  1.759754 317  
[5] {root vegetables}    => {whole milk}       0.04890696 0.4486940  1.756031 481 

 

  

搜尋規則

 

> yogurtrules<-subset(groceryrules,items%in%c("yogurt"))
> inspect(yogurtrules)
    lhs         rhs                support    confidence lift     count
[1] {yogurt} => {other vegetables} 0.04341637 0.3112245  1.608457 427  
[2] {yogurt} => {whole milk}       0.05602440 0.4016035  1.571735 551  
> fruitrules<-subset(groceryrules,items%pin%c("fruit"))
> inspect(fruitrules)
    lhs                 rhs                support    confidence lift     count
[1] {pip fruit}      => {whole milk}       0.03009659 0.3978495  1.557043 296  
[2] {citrus fruit}   => {whole milk}       0.03050330 0.3685504  1.442377 300  
[3] {tropical fruit} => {other vegetables} 0.03589222 0.3420543  1.767790 353  
[4] {tropical fruit} => {whole milk}       0.04229792 0.4031008  1.577595 416  
> byrules<-subset(groceryrules,items%ain%c("berries","yogurt"))
> inspect(byrules)

 

 

  items %in% c("A", "B")表示 lhs+rhs的項集並集中,至少有一個item是在c("A", "B") item = Aor item = B

  如果僅僅想搜尋lhs或者rhs,那麼用lhs或rhs替換items即可。如:lhs %in% c("yogurt")

    %in%是精確匹配

    %pin%是部分匹配,也就是說只要item like '%A%' or item like '%B%'

    %ain%是完全匹配,也就是說itemset has ’A' and itemset has ‘B'

  同時可以通過 條件運算子(&, |, !) 新增 support, confidence, lift的過濾條件。

 

> yogurtrules<-subset(groceryrules,items%in%c("yogurt"))
> inspect(yogurtrules)
    lhs         rhs                support    confidence lift     count
[1] {yogurt} => {other vegetables} 0.04341637 0.3112245  1.608457 427  
[2] {yogurt} => {whole milk}       0.05602440 0.4016035  1.571735 551  
> fruitrules<-subset(groceryrules,items%pin%c("fruit"))
> inspect(fruitrules)
    lhs                 rhs                support    confidence lift     count
[1] {pip fruit}      => {whole milk}       0.03009659 0.3978495  1.557043 296  
[2] {citrus fruit}   => {whole milk}       0.03050330 0.3685504  1.442377 300  
[3] {tropical fruit} => {other vegetables} 0.03589222 0.3420543  1.767790 353  
[4] {tropical fruit} => {whole milk}       0.04229792 0.4031008  1.577595 416  
> byrules<-subset(groceryrules,items%ain%c("berries","yogurt"))
> inspect(byrules)
> 
> fruitrules<-subset(groceryrules,items%pin%c("fruit")&lift>2)
> inspect(fruitrules)
> berriesInLHS<-apriori(groceries,parameter=list(support=0.001,confidence=0.1),appearance=list(lhs=c("berries"),default="rhs"))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target   ext
        0.1    0.1    1 none FALSE            TRUE       5   0.001      1     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 9 

set item appearances ...[1 item(s)] done [0.00s].
set transactions ...[169 item(s), 9835 transaction(s)] done [0.01s].
sorting and recoding items ... [157 item(s)] done [0.00s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 done [0.00s].
writing ... [26 rule(s)] done [0.00s].
creating S4 object  ... done [0.01s].
> summary(berriesInLHS)
set of 26 rules

rule length distribution (lhs + rhs):sizes
 1  2 
 8 18 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   2.000   1.692   2.000   2.000 

summary of quality measures:
    support           confidence          lift           count        
 Min.   :0.003660   Min.   :0.1049   Min.   :1.000   Min.   :  36.00  
 1st Qu.:0.004601   1st Qu.:0.1177   1st Qu.:1.000   1st Qu.:  45.25  
 Median :0.007016   Median :0.1560   Median :1.470   Median :  69.00  
 Mean   :0.053209   Mean   :0.1786   Mean   :1.547   Mean   : 523.31  
 3rd Qu.:0.107982   3rd Qu.:0.2011   3rd Qu.:1.830   3rd Qu.:1062.00  
 Max.   :0.255516   Max.   :0.3547   Max.   :3.797   Max.   :2513.00  

mining info:
      data ntransactions support confidence
 groceries          9835   0.001        0.1
> inspect(berriesInLHS)
     lhs          rhs                     support     confidence lift     count
[1]  {}        => {bottled water}         0.110523640 0.1105236  1.000000 1087 
[2]  {}        => {tropical fruit}        0.104931368 0.1049314  1.000000 1032 
[3]  {}        => {root vegetables}       0.108998475 0.1089985  1.000000 1072 
[4]  {}        => {soda}                  0.174377224 0.1743772  1.000000 1715 
[5]  {}        => {yogurt}                0.139501779 0.1395018  1.000000 1372 
[6]  {}        => {rolls/buns}            0.183934926 0.1839349  1.000000 1809 
[7]  {}        => {other vegetables}      0.193492628 0.1934926  1.000000 1903 
[8]  {}        => {whole milk}            0.255516014 0.2555160  1.000000 2513 
[9]  {berries} => {beef}                  0.004473818 0.1345566  2.564659   44 
[10] {berries} => {butter}                0.003762074 0.1131498  2.041888   37 
[11] {berries} => {domestic eggs}         0.003863752 0.1162080  1.831579   38 
[12] {berries} => {fruit/vegetable juice} 0.003660397 0.1100917  1.522858   36 
[13] {berries} => {whipped/sour cream}    0.009049314 0.2721713  3.796886   89 
[14] {berries} => {pip fruit}             0.003762074 0.1131498  1.495738   37 
[15] {berries} => {pastry}                0.004270463 0.1284404  1.443670   42 
[16] {berries} => {citrus fruit}          0.005388917 0.1620795  1.958295   53 
[17] {berries} => {shopping bags}         0.004982206 0.1498471  1.520894   49 
[18] {berries} => {sausage}               0.004982206 0.1498471  1.594963   49 
[19] {berries} => {bottled water}         0.004067107 0.1223242  1.106769   40 
[20] {berries} => {tropical fruit}        0.006710727 0.2018349  1.923494   66 
[21] {berries} => {root vegetables}       0.006609049 0.1987768  1.823666   65 
[22] {berries} => {soda}                  0.007320793 0.2201835  1.262685   72 
[23] {berries} => {yogurt}                0.010574479 0.3180428  2.279848  104 
[24] {berries} => {rolls/buns}            0.006609049 0.1987768  1.080691   65 
[25] {berries} => {other vegetables}      0.010269446 0.3088685  1.596280  101 
[26] {berries} => {whole milk}            0.011794611 0.3547401  1.388328  116 
> inspect(head(rhs(berriesInLHS),n=5))
    items            
[1] {bottled water}  
[2] {tropical fruit} 
[3] {root vegetables}
[4] {soda}           
[5] {yogurt}         

 

 

限制挖掘的item

 

  可以控制規則的左手邊或者右手邊出現的item,即appearance。但儘量要放低支援度和置信度。

 

berrySub<-subset(berriesInLHS,subset=!(rhs%in%c("root vegetables","whole milk")))
inspect(head(rhs(sort(berrySub,by="confidence")),n=5))
write(groceryrules,file="groceryrules.csv",sep=",",quote=TRUE,row.names=FALSE)
groceryrules_df<-as(groceryrules,"data.frame")
str(groceryrules_df)
data(Groceries)
summary(Groceries)
print(levels(itemInfo(Groceries)[["level1"]]))
print(levels(itemInfo(Groceries)[["level2"]]))
inspect(Groceries[1:3])
groceries=aggregate(Groceries,itemInfo(Groceries)[["level2"]])
inspect(groceries[1:3])
itemFrequencyPlot(Groceries,support=0.025,cex.names=0.8,xlim=c(0,0.3),
                  type="relative",horiz=TRUE,col="darkred",las=1,
                  xlab=paste("ProportionofMarketBasketsContainingItem",
                             "\n(ItemRelativeFrequencyorSupport)"))
second.rules<-apriori(groceries,parameter=list(support=0.025,confidence=0.05))
print(summary(second.rules))

 

 

itemFrequency

               itemFrequencyPlot(Groceries, support = 0.025, cex.names=0.8, xlim = c(0,0.3),  type = "relative", horiz = TRUE, col = "dark red", las = 1,  xlab = paste("Proportionof Market Baskets Containing Item",  "\n(Item Relative Frequency or Support)"))  

  horiz=TRUE: 讓柱狀圖水平顯示  

  cex.names=0.8:item的label(這個例子即縱軸)的大小乘以的係數。  

  s=1: 表示刻度的方向,1表示總是水平方向。  

  pe="relative": 即support的值(百分比)。如果type=absolute表示顯示該item的count,而非support。預設就是relative。  

 

  

 

 

擴充套件:

#install.packages("RColorBrewer")
#install.packages("arulesViz")
library(RColorBrewer)
library(arulesViz)
inspect(second.rules)
plot(second.rules,control=list(jitter=2,col=rev(brewer.pal(9,"Greens")[4:9])),shading="lift")

plot(second.rules,measure="confidence",method="graph",control=list(type="items"),shading="lift")
plot(second.rules,method="grouped",control=list(col=rev(brewer.pal(9,"Greens")[4:9])))
groceryrules.eclat<-eclat(groceries,parameter=list(support=0.05,minlen=2))
summary(groceryrules.eclat)
inspect(groceryrules.eclat)

 

2.1 Scatter Plot

library(RColorBrewer)

library(arulesViz)

> plot(second.rules, control=list(jitter=2, col = rev(brewer.pal(9, "Greens")[4:9])), shading = "lift")     


shading = "lift": 表示在散點圖上顏色深淺的度量是lift。當然也可以設定為support 或者Confidence

jitter=2:增加抖動值

col: 調色盤,預設是100個顏色的灰色調色盤。

brewer.pal(n, name): 建立調色盤:n表示該調色盤內總共有多少種顏色;name表示調色盤的名字(參考help)。

這裡使用Green這塊調色盤,引入9中顏色。

這幅散點圖表示了規則的分佈圖:大部分規則的support0.1以內,Confidence0-0.8內。每個點的顏色深淺代表了lift的值。

   

 

 

2.2 Grouped Matrix

> plot(second.rules, method="grouped", control=list(col = rev(brewer.pal(9, "Greens")[4:9])))  


Grouped matrix-based visualization. 

Antecedents (columns) in the matrix are grouped using clustering. Groups are represented as balloons in the matrix.

   

 

 

2.3 Graph

Represents the rules (or itemsets) as a graph

 plot(top.vegie.rules, measure="confidence", method="graph",control=list(type="items"), shading = "lift") 

type=items表示每個圓點的入度的item的集合就是LHSitemset

measure定義了圈圈大小,預設是support

顏色深淺有shading控制