
Chapter 9: Data Grouping and Aggregation (Part 1)

import numpy as np
import pandas as pd
from pandas import DataFrame
df=DataFrame({'key1':['a','a','b','b','a'],
              'key2':['one','two','one','two','one'],
              'data1':np.random.randn(5),'data2':np.random.randn(5)})
df
      data1     data2 key1 key2
0  0.867596  0.739047    a  one
1 -0.271038 -0.621842    a  two
2  1.809116 -0.479957    b  one
3 -1.204141 -0.176988    b  two
4  0.008490  2.401128    a  one
# To group data1 by key1 and compute its mean, select the data1 column and call groupby with the key1 column
grouped=df['data1'].groupby(df['key1'])
grouped.mean()
key1
a    0.201683
b    0.302487
Name: data1, dtype: float64
means=df['data1'].groupby([df['key1'],df['key2']]).mean()

means
Out[8]: 
key1  key2
a     one     0.438043
      two    -0.271038
b     one     1.809116
      two    -1.204141
Name: data1, dtype: float64
# Grouping by two keys yields a Series with a hierarchical (MultiIndex) index
means.unstack()
Out[9]: 
key2       one       two
key1                    
a     0.438043 -0.271038
b     1.809116 -1.204141
states=np.array(['Ohio','California','California','Ohio','Ohio'])

years=np.array([2005,2005,2006,2005,2006])

df['data1'].groupby([states,years]).mean()
Out[12]: 
California  2005   -0.271038
            2006    1.809116
Ohio        2005   -0.168272
            2006    0.008490
Name: data1, dtype: float64
df.groupby('key1').mean()
Out[13]: 
         data1     data2
key1                    
a     0.201683  0.839444
b     0.302487 -0.328473

df.groupby(['key1','key2']).mean()
Out[14]: 
              data1     data2
key1 key2                    
a    one   0.438043  1.570087
     two  -0.271038 -0.621842
b    one   1.809116 -0.479957
     two  -1.204141 -0.176988
df.groupby(['key1','key2']).size()  # size() returns a Series of group sizes
Out[15]: 
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64
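As a side note (a small contrast not shown in the original): count() tallies only non-NA values per column, whereas size() counts every row in each group.
df.groupby(['key1','key2'])['data1'].count()  # same numbers as size() here because df has no missing values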
Iterating over groups: groupby yields a sequence of 2-tuples containing the group name and the chunk of data.
for name,group in df.groupby('key1'):
    print(name)
    print(group)


a
      data1     data2 key1 key2
0  0.867596  0.739047    a  one
1 -0.271038 -0.621842    a  two
4  0.008490  2.401128    a  one
b
      data1     data2 key1 key2
2  1.809116 -0.479957    b  one
3 -1.204141 -0.176988    b  two
With multiple keys, the first element of each tuple is itself a tuple of key values:
for (k1,k2),group in df.groupby(['key1','key2']):
    print(k1,k2)
    print(group)


a one
      data1     data2 key1 key2
0  0.867596  0.739047    a  one
4  0.008490  2.401128    a  one
a two
      data1     data2 key1 key2
1 -0.271038 -0.621842    a  two
b one
      data1     data2 key1 key2
2  1.809116 -0.479957    b  one
b two
      data1     data2 key1 key2
3 -1.204141 -0.176988    b  two

pieces=dict(list(df.groupby('key1')))

pieces['b']
Out[19]: 
      data1     data2 key1 key2
2  1.809116 -0.479957    b  one
3 -1.204141 -0.176988    b  two
groupby groups along axis=0 by default, but you can group along any axis. For example, group the columns by dtype:
df.dtypes
Out[20]: 
data1    float64
data2    float64
key1      object
key2      object
dtype: object

grouped=df.groupby(df.dtypes,axis=1)

dict(list(grouped))
Out[22]: 
{dtype('float64'):       data1     data2
 0  0.867596  0.739047
 1 -0.271038 -0.621842
 2  1.809116 -0.479957
 3 -1.204141 -0.176988
 4  0.008490  2.401128, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}
df.groupby(['key1','key2'])[['data2']].mean()
Out[26]: 
              data2
key1 key2          
a    one   1.570087
     two  -0.621842
b    one  -0.479957
     two  -0.176988
This indexing operation returns a grouped DataFrame or a grouped Series:
s_grouped=df.groupby(['key1','key2'])['data2']

s_grouped
Out[28]: <pandas.core.groupby.SeriesGroupBy object at 0x095467F0>

s_grouped.mean()
Out[29]: 
key1  key2
a     one     1.570087
      two    -0.621842
b     one    -0.479957
      two    -0.176988
Name: data2, dtype: float64
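Indexing a GroupBy object with a column name is syntactic sugar for grouping just that column; a quick sketch of the equivalence (not in the original):
df.groupby(['key1','key2'])['data2']
df['data2'].groupby([df['key1'],df['key2']])  # produces the same grouped Series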
Grouping with dicts and Series:
people=DataFrame(np.random.randn(5,5),
                 columns=['a','b','c','d','e'],
index=['Joe','Steve','Wes','Jim','Travis'])

people.loc['Wes',['b','c']]=np.nan  # add a few NA values (.ix is deprecated; this sets Wes's b and c to NaN, as in the output below)

people
Out[32]: 
               a         b         c         d         e
Joe    -1.349415 -0.034864 -0.041473  0.316972 -1.077930
Steve  -0.928486  1.348135  0.648762 -1.265573 -1.798529
Wes    -0.221656       NaN       NaN  0.844571  0.249980
Jim    -0.048006 -0.207574 -0.465525 -0.888653  1.646979
Travis -1.190065  0.113572  0.680029 -1.015694  1.728276

mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}  # the original omits this dict; reconstructed from the map_series output below

by_column=people.groupby(mapping,axis=1)  # axis=1 groups the columns; the default axis=0 groups the rows

by_column.sum()
Out[35]: 
            blue       red
Joe     0.275499 -2.462209
Steve  -0.616811 -1.378880
Wes     0.844571  0.028324
Jim    -1.354177  1.391399
Travis -0.335665  0.651783
from pandas import Series

map_series=Series(mapping)

map_series
Out[39]: 
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
people.groupby(map_series,axis=1).count()
Out[40]: 
        blue  red
Joe        2    3
Steve      2    3
Wes        1    2
Jim        2    3
Travis     2    3
Grouping with functions:

The index values are people's names, so grouping by name length is as simple as passing the len function.

people.groupby(len).sum()
Out[41]: 
          a         b         c         d         e
3 -1.619077 -0.242438 -0.506997  0.272891  0.819028
5 -0.928486  1.348135  0.648762 -1.265573 -1.798529
6 -1.190065  0.113572  0.680029 -1.015694  1.728276
Functions can be mixed with arrays, lists, dicts, or Series:
key_list=['one','one','one','two','two']  # the original omits this list; reconstructed to match the output below

people.groupby([len,key_list]).min()
Out[43]: 
              a         b         c         d         e
3 one -1.349415 -0.034864 -0.041473  0.316972 -1.077930
  two -0.048006 -0.207574 -0.465525 -0.888653  1.646979
5 one -0.928486  1.348135  0.648762 -1.265573 -1.798529
6 two -1.190065  0.113572  0.680029 -1.015694  1.728276

columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],
                                  names=['city','tenor'])


hier_df=DataFrame(np.random.randn(4,5),columns=columns)

hier_df
Out[48]: 
city         US                            JP          
tenor         1         3         5         1         3
0     -0.905468  1.703159  0.239218  1.090464 -0.677526
1     -1.152784 -0.610877  1.476141  0.270902  0.118338
2      0.535665 -0.551337 -1.060250  0.890812  0.923537
3     -0.204244 -0.249316  0.886220 -1.238991 -0.793501
Grouping by index level: one of the most convenient features of hierarchically indexed data is the ability to aggregate by a level of the index. Pass the level number or name via the level keyword:
hier_df.groupby(level='city',axis=1).count()
Out[49]: 
city  JP  US
0      2   3
1      2   3
2      2   3
3      2   3
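The level can also be given by number rather than name; a small sketch (not shown in the original):
hier_df.groupby(level=1,axis=1).count()  # level 1 is the 'tenor' level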
Data aggregation:
df
Out[50]: 
      data1     data2 key1 key2
0  0.867596  0.739047    a  one
1 -0.271038 -0.621842    a  two
2  1.809116 -0.479957    b  one
3 -1.204141 -0.176988    b  two
4  0.008490  2.401128    a  one

grouped=df.groupby('key1')

grouped['data1'].quantile(0.9)  # quantile computes sample quantiles of a Series or DataFrame
Out[52]: 
key1
a    0.695775
b    1.507790
Name: data1, dtype: float64
def peak_to_peak(arr):
    return arr.max()-arr.min()



grouped.agg(peak_to_peak)
Out[54]: 
         data1     data2
key1                    
a     1.138635  3.022970
b     3.013257  0.302969
tips=pd.read_csv('e:/tips.csv')

tips['tip_pct']=tips['tip']/tips['total_bill']

tips[:6]
Out[57]: 
   total_bill   tip     sex smoker  day    time  size   tip_pct
0       16.99  1.01  Female     No  Sun  Dinner     2  0.059447
1       10.34  1.66    Male     No  Sun  Dinner     3  0.160542
2       21.01  3.50    Male     No  Sun  Dinner     3  0.166587
3       23.68  3.31    Male     No  Sun  Dinner     2  0.139780
4       24.59  3.61  Female     No  Sun  Dinner     4  0.146808
5       25.29  4.71    Male     No  Sun  Dinner     4  0.186240
Column-wise multiple function application: group tips by sex and smoker
grouped=tips.groupby(['sex','smoker'])

grouped_pct=grouped['tip_pct']

grouped_pct.agg('mean')
Out[60]: 
sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64
If you pass a list of functions or function names, the columns of the resulting DataFrame are named after those functions:
grouped_pct.agg(['mean','std',peak_to_peak])
Out[61]: 
                   mean       std  peak_to_peak
sex    smoker                                  
Female No      0.156921  0.036421      0.195876
       Yes     0.182150  0.071595      0.360233
Male   No      0.160669  0.041849      0.220186
       Yes     0.152771  0.090588      0.674707
If you pass a list of (name, function) tuples, the first element of each tuple is used as the DataFrame column name:
grouped_pct.agg([('foo','mean'),('bar','std')])
Out[62]: 
                    foo       bar
sex    smoker                    
Female No      0.156921  0.036421
       Yes     0.182150  0.071595
Male   No      0.160669  0.041849
       Yes     0.152771  0.090588

functions=['count','mean','max']

result=grouped[['tip_pct','total_bill']].agg(functions)

result
Out[65]: 
              tip_pct                     total_bill                  
                count      mean       max      count       mean    max
sex    smoker                                                         
Female No          54  0.156921  0.252672         54  18.105185  35.83
       Yes         33  0.182150  0.416667         33  17.977879  44.30
Male   No          97  0.160669  0.291990         97  19.791237  48.33
       Yes         60  0.152771  0.710345         60  22.284500  50.81

result['tip_pct']
Out[66]: 
               count      mean       max
sex    smoker                           
Female No         54  0.156921  0.252672
       Yes        33  0.182150  0.416667
Male   No         97  0.160669  0.291990
       Yes        60  0.152771  0.710345
Passing a list of tuples with custom names:
ftuples=[('Durchschnitt','mean'),('Abweichung','var')]

grouped[['tip_pct','total_bill']].agg(ftuples)
Out[68]: 
                   tip_pct              total_bill           
              Durchschnitt Abweichung Durchschnitt Abweichung
sex    smoker                                                
Female No         0.156921   0.001327    18.105185  53.092422
       Yes        0.182150   0.005126    17.977879  84.451517
Male   No         0.160669   0.001751    19.791237  76.152961
       Yes        0.152771   0.008206    22.284500  98.244673
To apply different functions to different columns, pass agg a dict mapping column names to functions:
grouped.agg({'tip':np.max,'size':'sum'})
Out[69]: 
                tip  size
sex    smoker            
Female No       5.2   140
       Yes      6.5    74
Male   No       9.0   263
       Yes     10.0   150

grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
Out[70]: 
                tip_pct                               size
                    min       max      mean       std  sum
sex    smoker                                             
Female No      0.056797  0.252672  0.156921  0.036421  140
       Yes     0.056433  0.416667  0.182150  0.071595   74
Male   No      0.071804  0.291990  0.160669  0.041849  263
       Yes     0.035638  0.710345  0.152771  0.090588  150
Passing as_index=False to groupby disables using the group keys as the result's index; they come back as ordinary columns instead.
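A minimal sketch of this option (not shown in the original):
tips.groupby(['sex','smoker'],as_index=False)['tip_pct'].mean()  # sex and smoker appear as regular columns rather than as the index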

Group-wise operations and transformations

To add a column to a DataFrame holding the mean of each group, one approach is to aggregate first and then merge:

k1_means=df.groupby('key1').mean().add_prefix('mean_')

k1_means
Out[73]: 
      mean_data1  mean_data2
key1                        
a       0.201683    0.839444
b       0.302487   -0.328473

pd.merge(df,k1_means,left_on='key1',right_index=True)
Out[74]: 
      data1     data2 key1 key2  mean_data1  mean_data2
0  0.867596  0.739047    a  one    0.201683    0.839444
1 -0.271038 -0.621842    a  two    0.201683    0.839444
4  0.008490  2.401128    a  one    0.201683    0.839444
2  1.809116 -0.479957    b  one    0.302487   -0.328473
3 -1.204141 -0.176988    b  two    0.302487   -0.328473
Using the transform method on a groupby object: transform applies a function to each group and places the results in the appropriate positions of the original index.
key=['one','two','one','two','one']  # the original omits this list; reconstructed to match the group means below

people.groupby(key).mean()
Out[76]: 
            a         b         c         d         e
one -0.920379  0.039354  0.319278  0.048616  0.300109
two -0.488246  0.570281  0.091618 -1.077113 -0.075775

people.groupby(key).transform(np.mean)
Out[77]: 
               a         b         c         d         e
Joe    -0.920379  0.039354  0.319278  0.048616  0.300109
Steve  -0.488246  0.570281  0.091618 -1.077113 -0.075775
Wes    -0.920379  0.039354  0.319278  0.048616  0.300109
Jim    -0.488246  0.570281  0.091618 -1.077113 -0.075775
Travis -0.920379  0.039354  0.319278  0.048616  0.300109
Create a demeaning function and pass it to transform:
def demean(arr):
    return arr-arr.mean()
demeaned=people.groupby(key).transform(demean)

demeaned
Out[80]: 
               a         b         c         d         e
Joe    -0.429036 -0.074218 -0.360751  0.268356 -1.378039
Steve  -0.440240  0.777855  0.557143 -0.188460 -1.722754
Wes     0.698722       NaN       NaN  0.795955 -0.050129
Jim     0.440240 -0.777855 -0.557143  0.188460  1.722754
Travis -0.269686  0.074218  0.360751 -1.064310  1.428167
demeaned.groupby(key).mean()
Out[81]: 
                a             b             c             d             e
one  7.401487e-17  0.000000e+00  2.775558e-17 -7.401487e-17  7.401487e-17
two  2.775558e-17 -5.551115e-17  0.000000e+00  5.551115e-17  0.000000e+00
The apply method: split-apply-combine. To select the rows with the five largest tip_pct values in each group, first write a function that picks the rows with the largest values in a given column:
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]

top(tips,n=6)
Out[84]: 
     total_bill   tip     sex smoker  day    time  size   tip_pct
109       14.31  4.00  Female    Yes  Sat  Dinner     2  0.279525
183       23.17  6.50    Male    Yes  Sun  Dinner     4  0.280535
232       11.61  3.39    Male     No  Sat  Dinner     2  0.291990
67         3.07  1.00  Female    Yes  Sat  Dinner     1  0.325733
178        9.60  4.00  Female    Yes  Sun  Dinner     2  0.416667
172        7.25  5.15    Male    Yes  Sun  Dinner     2  0.710345
Group by smoker and call apply with this function:
tips.groupby('smoker').apply(top)
Out[85]: 
            total_bill   tip     sex smoker   day    time  size   tip_pct
smoker                                                                   
No     88        24.71  5.85    Male     No  Thur   Lunch     2  0.236746
       185       20.69  5.00    Male     No   Sun  Dinner     5  0.241663
       51        10.29  2.60  Female     No   Sun  Dinner     2  0.252672
       149        7.51  2.00    Male     No  Thur   Lunch     2  0.266312
       232       11.61  3.39    Male     No   Sat  Dinner     2  0.291990
Yes    109       14.31  4.00  Female    Yes   Sat  Dinner     2  0.279525
       183       23.17  6.50    Male    Yes   Sun  Dinner     4  0.280535
       67         3.07  1.00  Female    Yes   Sat  Dinner     1  0.325733
       178        9.60  4.00  Female    Yes   Sun  Dinner     2  0.416667
       172        7.25  5.15    Male    Yes   Sun  Dinner     2  0.710345
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
Out[86]: 
                 total_bill    tip     sex smoker   day    time  size  \
smoker day                                                              
No     Fri  94        22.75   3.25  Female     No   Fri  Dinner     2   
       Sat  212       48.33   9.00    Male     No   Sat  Dinner     4   
       Sun  156       48.17   5.00    Male     No   Sun  Dinner     6   
       Thur 142       41.19   5.00    Male     No  Thur   Lunch     5   
Yes    Fri  95        40.17   4.73    Male    Yes   Fri  Dinner     4   
       Sat  170       50.81  10.00    Male    Yes   Sat  Dinner     3   
       Sun  182       45.35   3.50    Male    Yes   Sun  Dinner     3   
       Thur 197       43.11   5.00  Female    Yes  Thur   Lunch     4   

                  tip_pct  
smoker day                 
No     Fri  94   0.142857  
       Sat  212  0.186220  
       Sun  156  0.103799  
       Thur 142  0.121389  
Yes    Fri  95   0.117750  
       Sat  170  0.196812  
       Sun  182  0.077178  
       Thur 197  0.115982  

result=tips.groupby('smoker')['tip_pct'].describe()

result
Out[88]: 
smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64

result.unstack('smoker')
Out[89]: 
smoker          No        Yes
count   151.000000  93.000000
mean      0.159328   0.163196
std       0.039910   0.085119
min       0.056797   0.035638
25%       0.136906   0.106771
50%       0.155625   0.153846
75%       0.185014   0.195059
max       0.291990   0.710345
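Inside the groupby, describe is simply called on each piece; a rough equivalent using apply (a sketch, not from the original):
f=lambda g:g.describe()
tips.groupby('smoker')['tip_pct'].apply(f)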
Quantile and bucket analysis:
frame=DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})

factor=pd.cut(frame.data1,4)

factor[:10]
Out[92]: 
0    (-1.307, 0.37]
1    (0.37, 2.0461]
2    (-1.307, 0.37]
3    (0.37, 2.0461]
4    (-1.307, 0.37]
5    (-1.307, 0.37]
6    (-1.307, 0.37]
7    (0.37, 2.0461]
8    (-1.307, 0.37]
9    (-1.307, 0.37]
Name: data1, dtype: category
Categories (4, object): [(-2.99, -1.307] < (-1.307, 0.37] < (0.37, 2.0461] < (2.0461, 3.723]]
The Categorical (factor) object returned by cut can be passed directly to groupby:
def get_stats(group):
    return {'min':group.min(),'max':group.max(),'count':group.count(),'mean':group.mean()}
grouped=frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
Out[95]: 
                 count       max      mean       min
data1                                               
(-2.99, -1.307]  101.0  2.730091 -0.206504 -2.922759
(-1.307, 0.37]   531.0  2.773289 -0.034422 -2.892153
(0.37, 2.0461]   344.0  3.078922  0.046008 -3.127290
(2.0461, 3.723]   24.0  1.671815 -0.020857 -3.019921
grouping=pd.qcut(frame.data1,10,labels=False)  # labels=False returns integer quantile bucket numbers
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
Out[98]: 
       count       max      mean       min
data1                                     
0      100.0  2.730091 -0.212662 -2.922759
1      100.0  2.230289 -0.076260 -2.521450
2      100.0  2.404481  0.051184 -2.369298
3      100.0  2.773289  0.016575 -2.284056
4      100.0  2.328424 -0.043627 -2.892153
5      100.0  1.996065 -0.104378 -2.032999
6      100.0  2.166334  0.015241 -2.291139
7      100.0  2.687426 -0.057435 -2.409512
8      100.0  2.883604  0.267017 -3.127290
9      100.0  3.078922 -0.093742 -3.019921
s=Series(np.random.randn(6))  # elements 0, 2, 4 will be set to NaN
s[::2]=np.nan
s
a=Series(np.random.randn(9))  # elements 0, 3, 6 will be set to NaN
a[::3]=np.nan
a
a.dropna()
s.fillna(s.mean())  # fill NaNs with the overall mean
Filling in missing values with group-specific values:
states=['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']

group_key=['East']*4+['West']*4

data=Series(np.random.randn(8),index=states)

data[['Vermont','Nevada','Idaho']]=np.nan

data
Out[117]: 
Ohio          0.326110
New York      0.136178
Vermont            NaN
Florida       0.392480
Oregon       -0.177571
Nevada             NaN
California    0.061360
Idaho              NaN
dtype: float64

data.groupby(group_key).mean()
Out[118]: 
East    0.284923
West   -0.058105
dtype: float64
Fill the NA values with the group means:
fill_mean=lambda g:g.fillna(g.mean())

data.groupby(group_key).apply(fill_mean)
Out[120]: 
Ohio          0.326110
New York      0.136178
Vermont       0.284923
Florida       0.392480
Oregon       -0.177571
Nevada       -0.058105
California    0.061360
Idaho        -0.058105
dtype: float64
fill_values={'East':0.5,'West':-1}

fill_func=lambda g:g.fillna(fill_values[g.name])  # each group carries a .name attribute ('East' or 'West')

data.groupby(group_key).apply(fill_func)

Out[123]: 
Ohio          0.326110
New York      0.136178
Vermont       0.500000
Florida       0.392480
Oregon       -0.177571
Nevada       -1.000000
California    0.061360
Idaho        -1.000000
dtype: float64
Random sampling: drawing cards from a deck
# the original omits the deck setup; one plausible reconstruction:
base_names=['A']+list(range(2,11))+['J','K','Q']
card_val=(list(range(1,11))+[10]*3)*4
cards=[]
for suit in ['H','S','C','D']:
    cards.extend(str(num)+suit for num in base_names)
    # extend() takes an iterable and appends its elements to the end of the list
deck=Series(card_val,index=cards)  # a Series of length 52 indexed by card name
deck[:20]
# draw 5 cards from the whole deck
def draw(deck,n=5):
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)
# to draw two cards from each suit: the suit is the last character, so group on it and use apply
get_suit=lambda card:card[-1]  # keep only the last letter
deck.groupby(get_suit).apply(draw,n=2)
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)  # group_keys=False omits the suit level from the result index
Group-wise weighted averages and correlations
df=DataFrame({'category':['a','a','a','a','b','b','b','b'],
              'data':np.random.randn(8),'weights':np.random.rand(8)})  # the original omits this DataFrame; reconstructed from the column names used below
grouped=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
Out[146]: 
category
a    0.858361
b   -0.098089
dtype: float64
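The correlation part of this topic is not demonstrated above; a minimal sketch, using the same reconstructed DataFrame:
get_corr=lambda g:g['data'].corr(g['weights'])  # correlation between the two numeric columns within each group
grouped.apply(get_corr)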