第九章 資料分組和聚合(上)
阿新 • • 發佈:2019-01-25
import numpy as np
from pandas import DataFrame
df=DataFrame({'key1':['a','a','b','b','a'],
              'key2':['one','two','one','two','one'],
              'data1':np.random.randn(5),'data2':np.random.randn(5)})
df
      data1     data2 key1 key2
0  0.867596  0.739047    a  one
1 -0.271038 -0.621842    a  two
2  1.809116 -0.479957    b  one
3 -1.204141 -0.176988    b  two
4  0.008490  2.401128    a  one
#想按key1分組,計算data1列的平均值,訪問data1,根據key1呼叫groupby
grouped=df['data1'].groupby(df['key1'])
grouped.mean()
a 0.201683
b 0.302487
Name: data1, dtype: float64
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means
Out[8]:
key1  key2
a     one     0.438043
      two    -0.271038
b     one     1.809116
      two    -1.204141
Name: data1, dtype: float64
#通過兩個鍵對資料進行分組,得到的Series具有一個層次化索引
means.unstack()
Out[9]:
key2       one       two
key1
a     0.438043 -0.271038
b     1.809116 -1.204141
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
Out[12]:
California 2005 -0.271038
2006 1.809116
Ohio 2005 -0.168272
2006 0.008490
Name: data1, dtype: float64
df.groupby('key1').mean()
Out[13]:
         data1     data2
key1
a     0.201683  0.839444
b     0.302487 -0.328473
df.groupby(['key1','key2']).mean()
Out[14]:
              data1     data2
key1 key2
a    one   0.438043  1.570087
     two  -0.271038 -0.621842
b    one   1.809116 -0.479957
     two  -1.204141 -0.176988
df.groupby(['key1','key2']).size()#size()返回一個有分組大小的列
Out[15]:
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
對分組進行迭代:產生一組二元元組,由分組名和資料塊組成
# Iterating over a GroupBy yields (group name, sub-DataFrame) 2-tuples.
# (Python 3 print() replaces the Python 2 print statements of the original.)
for name, group in df.groupby('key1'):
    print(name)   # the group key ('a' or 'b')
    print(group)  # the rows belonging to that key
a
data1 data2 key1 key2
0 0.867596 0.739047 a one
1 -0.271038 -0.621842 a two
4 0.008490 2.401128 a one
b
data1 data2 key1 key2
2 1.809116 -0.479957 b one
3 -1.204141 -0.176988 b two
對於多重鍵的情況,元組的第一個元素將會是由鍵值組成的元組:
# With multiple grouping keys, the group "name" is a tuple of key values.
# (Python 3 print() replaces the Python 2 print statements of the original.)
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(k1, k2)  # the pair of key values
    print(group)   # the corresponding rows
a one
data1 data2 key1 key2
0 0.867596 0.739047 a one
4 0.008490 2.401128 a one
a two
data1 data2 key1 key2
1 -0.271038 -0.621842 a two
b one
data1 data2 key1 key2
2 1.809116 -0.479957 b one
b two
data1 data2 key1 key2
3 -1.204141 -0.176988 b two
pieces=dict(list(df.groupby('key1')))
pieces['b']
Out[19]:
data1 data2 key1 key2
2 1.809116 -0.479957 b one
3 -1.204141 -0.176988 b two
groupby預設是在axis=0上進行分組的,通過設定也可在任何軸上進行分組,根據dtype對列進行分組:
df.dtypes
Out[20]:
data1 float64
data2 float64
key1 object
key2 object
dtype: object
grouped=df.groupby(df.dtypes,axis=1)
dict(list(grouped))
Out[22]:
{dtype('float64'): data1 data2
0 0.867596 0.739047
1 -0.271038 -0.621842
2 1.809116 -0.479957
3 -1.204141 -0.176988
4 0.008490 2.401128, dtype('O'): key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one}
df.groupby(['key1','key2'])[['data2']].mean()
Out[26]:
data2
key1 key2
a one 1.570087
two -0.621842
b one -0.479957
two -0.176988
這種索引操作所返回的物件是一個已分組的DataFrame或已分組的Series
s_grouped=df.groupby(['key1','key2'])['data2']
s_grouped
Out[28]: <pandas.core.groupby.SeriesGroupBy object at 0x095467F0>
s_grouped.mean()
Out[29]:
key1 key2
a one 1.570087
two -0.621842
b one -0.479957
two -0.176988
Name: data2, dtype: float64
通過字典或Series進行分組:
# 5x5 frame of random normals indexed by name.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Add a few NA values. The removed .ix indexer sliced positionally here
# (rows 2:3 -> only 'Wes', per the displayed output); .iloc with column
# positions 1 and 2 ('b' and 'c') reproduces that behavior exactly.
people.iloc[2:3, [1, 2]] = np.nan
people
Out[32]:
a b c d e
Joe -1.349415 -0.034864 -0.041473 0.316972 -1.077930
Steve -0.928486 1.348135 0.648762 -1.265573 -1.798529
Wes -0.221656 NaN NaN 0.844571 0.249980
Jim -0.048006 -0.207574 -0.465525 -0.888653 1.646979
Travis -1.190065 0.113572 0.680029 -1.015694 1.728276
mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column=people.groupby(mapping,axis=1)#axis=1表示按列分組(跨列聚合),預設axis=0按行分組
by_column.sum()
Out[35]:
blue red
Joe 0.275499 -2.462209
Steve -0.616811 -1.378880
Wes 0.844571 0.028324
Jim -1.354177 1.391399
Travis -0.335665 0.651783
from pandas import Series
map_series=Series(mapping)
map_series
Out[39]:
a red
b red
c blue
d blue
e red
f orange
dtype: object
people.groupby(map_series,axis=1).count()
Out[40]:
blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3
通過函式進行分組:
索引值為人名,根據人名的長度進行分組,用len函式即可。
people.groupby(len).sum()
Out[41]:
a b c d e
3 -1.619077 -0.242438 -0.506997 0.272891 0.819028
5 -0.928486 1.348135 0.648762 -1.265573 -1.798529
6 -1.190065 0.113572 0.680029 -1.015694 1.728276
將函式與陣列、列表、字典、Series混用(此處 key_list=['one','one','one','two','two']):
people.groupby([len,key_list]).min()
Out[43]:
a b c d e
3 one -1.349415 -0.034864 -0.041473 0.316972 -1.077930
two -0.048006 -0.207574 -0.465525 -0.888653 1.646979
5 one -0.928486 1.348135 0.648762 -1.265573 -1.798529
6 two -1.190065 0.113572 0.680029 -1.015694 1.728276
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],
names=['city','tenor'])
hier_df=DataFrame(np.random.randn(4,5),columns=columns)
hier_df
Out[48]:
city US JP
tenor 1 3 5 1 3
0 -0.905468 1.703159 0.239218 1.090464 -0.677526
1 -1.152784 -0.610877 1.476141 0.270902 0.118338
2 0.535665 -0.551337 -1.060250 0.890812 0.923537
3 -0.204244 -0.249316 0.886220 -1.238991 -0.793501
根據索引級別分組:層次化索引資料集最方便的地方在於它能夠根據索引級別進行聚合。通過level關鍵字傳入級別編號或名稱即可:
hier_df.groupby(level='city',axis=1).count()
Out[49]:
city JP US
0 2 3
1 2 3
2 2 3
3 2 3
資料聚合:
df
Out[50]:
data1 data2 key1 key2
0 0.867596 0.739047 a one
1 -0.271038 -0.621842 a two
2 1.809116 -0.479957 b one
3 -1.204141 -0.176988 b two
4 0.008490 2.401128 a one
grouped=df.groupby('key1')
grouped['data1'].quantile(0.9)#quantile計算Series或者dataframe樣本分位數
Out[52]:
key1
a 0.695775
b 1.507790
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Return the spread of *arr*: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak)
Out[54]:
data1 data2
key1
a 1.138635 3.022970
b 3.013257 0.302969
tips=pd.read_csv('e:/tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']
tips[:6]
Out[57]:
total_bill tip sex smoker day time size tip_pct
0 16.99 1.01 Female No Sun Dinner 2 0.059447
1 10.34 1.66 Male No Sun Dinner 3 0.160542
2 21.01 3.50 Male No Sun Dinner 3 0.166587
3 23.68 3.31 Male No Sun Dinner 2 0.139780
4 24.59 3.61 Female No Sun Dinner 4 0.146808
5 25.29 4.71 Male No Sun Dinner 4 0.186240
面向列的多函式應用:根據sex和smoker對tips進行分組
grouped=tips.groupby(['sex','smoker'])
grouped_pct=grouped['tip_pct']
grouped_pct.agg('mean')
Out[60]:
sex smoker
Female No 0.156921
Yes 0.182150
Male No 0.160669
Yes 0.152771
Name: tip_pct, dtype: float64
如果傳入一組函式或函式名,得到的DataFrame的列就會以相應的函式命名:
grouped_pct.agg(['mean','std',peak_to_peak])
Out[61]:
mean std peak_to_peak
sex smoker
Female No 0.156921 0.036421 0.195876
Yes 0.182150 0.071595 0.360233
Male No 0.160669 0.041849 0.220186
Yes 0.152771 0.090588 0.674707
如果傳入的是一個由(name,function)元組組成的列表,各元組的第一個元素就會被用作DataFrame的列名
grouped_pct.agg([('foo','mean'),('bar','std')])
Out[62]:
foo bar
sex smoker
Female No 0.156921 0.036421
Yes 0.182150 0.071595
Male No 0.160669 0.041849
Yes 0.152771 0.090588
functions=['count','mean','max']
result=grouped['tip_pct','total_bill'].agg(functions)
result
Out[65]:
tip_pct total_bill
count mean max count mean max
sex smoker
Female No 54 0.156921 0.252672 54 18.105185 35.83
Yes 33 0.182150 0.416667 33 17.977879 44.30
Male No 97 0.160669 0.291990 97 19.791237 48.33
Yes 60 0.152771 0.710345 60 22.284500 50.81
result['tip_pct']
Out[66]:
count mean max
sex smoker
Female No 54 0.156921 0.252672
Yes 33 0.182150 0.416667
Male No 97 0.160669 0.291990
Yes 60 0.152771 0.710345
傳入帶有自定義名稱的元組列表:
ftuples=[('Durchschnitt','mean'),('Abweichung','var')]
grouped['tip_pct','total_bill'].agg(ftuples)
Out[68]:
tip_pct total_bill
Durchschnitt Abweichung Durchschnitt Abweichung
sex smoker
Female No 0.156921 0.001327 18.105185 53.092422
Yes 0.182150 0.005126 17.977879 84.451517
Male No 0.160669 0.001751 19.791237 76.152961
Yes 0.152771 0.008206 22.284500 98.244673
想對不同的列應用不同的函式,向agg傳入一個從列名對映到函式的字典:
grouped.agg({'tip':np.max,'size':'sum'})
Out[69]:
tip size
sex smoker
Female No 5.2 140
Yes 6.5 74
Male No 9.0 263
Yes 10.0 150
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
Out[70]:
tip_pct size
min max mean std sum
sex smoker
Female No 0.056797 0.252672 0.156921 0.036421 140
Yes 0.056433 0.416667 0.182150 0.071595 74
Male No 0.071804 0.291990 0.160669 0.041849 263
Yes 0.035638 0.710345 0.152771 0.090588 150
向groupby傳入as_index=False可以禁用「以分組鍵作為結果索引」的功能
分組級運算及轉換
想為一個DataFrame新增一個用於存放各索引分組平均值的列,先聚合再合併:
k1_means=df.groupby('key1').mean().add_prefix('mean_')
k1_means
Out[73]:
mean_data1 mean_data2
key1
a 0.201683 0.839444
b 0.302487 -0.328473
pd.merge(df,k1_means,left_on='key1',right_index=True)
Out[74]:
data1 data2 key1 key2 mean_data1 mean_data2
0 0.867596 0.739047 a one 0.201683 0.839444
1 -0.271038 -0.621842 a two 0.201683 0.839444
4 0.008490 2.401128 a one 0.201683 0.839444
2 1.809116 -0.479957 b one 0.302487 -0.328473
3 -1.204141 -0.176988 b two 0.302487 -0.328473
在groupby上使用transform方法:transform會將一個函式應用於各個分組,並將結果返回到適當位置上(此處 key=['one','two','one','two','one'])
people.groupby(key).mean()
Out[76]:
a b c d e
one -0.920379 0.039354 0.319278 0.048616 0.300109
two -0.488246 0.570281 0.091618 -1.077113 -0.075775
people.groupby(key).transform(np.mean)
Out[77]:
a b c d e
Joe -0.920379 0.039354 0.319278 0.048616 0.300109
Steve -0.488246 0.570281 0.091618 -1.077113 -0.075775
Wes -0.920379 0.039354 0.319278 0.048616 0.300109
Jim -0.488246 0.570281 0.091618 -1.077113 -0.075775
Travis -0.920379 0.039354 0.319278 0.048616 0.300109
建立一個距平化函式,傳遞給transform:
def demean(arr):
    """Center *arr* by subtracting its mean from every element."""
    center = arr.mean()
    return arr - center
demeaned=people.groupby(key).transform(demean)
demeaned
Out[80]:
a b c d e
Joe -0.429036 -0.074218 -0.360751 0.268356 -1.378039
Steve -0.440240 0.777855 0.557143 -0.188460 -1.722754
Wes 0.698722 NaN NaN 0.795955 -0.050129
Jim 0.440240 -0.777855 -0.557143 0.188460 1.722754
Travis -0.269686 0.074218 0.360751 -1.064310 1.428167
demeaned.groupby(key).mean()
Out[81]:
a b c d e
one 7.401487e-17 0.000000e+00 2.775558e-17 -7.401487e-17 7.401487e-17
two 2.775558e-17 -5.551115e-17 0.000000e+00 5.551115e-17 0.000000e+00
apply函式:拆分-應用-合併。根絕分組選出最高的5個tip_pct值,編寫一個選取指定列具有最大值的行函式:
def top(df, n=5, column='tip_pct'):
    """Return the n rows of *df* with the largest *column* values,
    ordered ascending by that column (largest value last)."""
    ordered = df.sort_values(by=column)
    # Negative-index slice (not tail) preserves the original's semantics.
    return ordered[-n:]
top(tips,n=6)
Out[84]:
total_bill tip sex smoker day time size tip_pct
109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
232 11.61 3.39 Male No Sat Dinner 2 0.291990
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345
對smoker分組並用該函式呼叫apply:
tips.groupby('smoker').apply(top)
Out[85]:
total_bill tip sex smoker day time size tip_pct
smoker
No 88 24.71 5.85 Male No Thur Lunch 2 0.236746
185 20.69 5.00 Male No Sun Dinner 5 0.241663
51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
Yes 109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
Out[86]:
total_bill tip sex smoker day time size \
smoker day
No Fri 94 22.75 3.25 Female No Fri Dinner 2
Sat 212 48.33 9.00 Male No Sat Dinner 4
Sun 156 48.17 5.00 Male No Sun Dinner 6
Thur 142 41.19 5.00 Male No Thur Lunch 5
Yes Fri 95 40.17 4.73 Male Yes Fri Dinner 4
Sat 170 50.81 10.00 Male Yes Sat Dinner 3
Sun 182 45.35 3.50 Male Yes Sun Dinner 3
Thur 197 43.11 5.00 Female Yes Thur Lunch 4
tip_pct
smoker day
No Fri 94 0.142857
Sat 212 0.186220
Sun 156 0.103799
Thur 142 0.121389
Yes Fri 95 0.117750
Sat 170 0.196812
Sun 182 0.077178
Thur 197 0.115982
result=tips.groupby('smoker')['tip_pct'].describe()
result
Out[88]:
smoker
No count 151.000000
mean 0.159328
std 0.039910
min 0.056797
25% 0.136906
50% 0.155625
75% 0.185014
max 0.291990
Yes count 93.000000
mean 0.163196
std 0.085119
min 0.035638
25% 0.106771
50% 0.153846
75% 0.195059
max 0.710345
Name: tip_pct, dtype: float64
result.unstack('smoker')
Out[89]:
smoker No Yes
count 151.000000 93.000000
mean 0.159328 0.163196
std 0.039910 0.085119
min 0.056797 0.035638
25% 0.136906 0.106771
50% 0.155625 0.153846
75% 0.185014 0.195059
max 0.291990 0.710345
分位數和桶分析:
frame=DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})
factor=pd.cut(frame.data1,4)
factor[:10]
Out[92]:
0 (-1.307, 0.37]
1 (0.37, 2.0461]
2 (-1.307, 0.37]
3 (0.37, 2.0461]
4 (-1.307, 0.37]
5 (-1.307, 0.37]
6 (-1.307, 0.37]
7 (0.37, 2.0461]
8 (-1.307, 0.37]
9 (-1.307, 0.37]
Name: data1, dtype: category
Categories (4, object): [(-2.99, -1.307] < (-1.307, 0.37] < (0.37, 2.0461] < (2.0461, 3.723]]
# The factor object returned by pd.cut can be passed straight to groupby.
def get_stats(group):
    """Summarize *group* as a dict with its min, max, count and mean."""
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
grouped=frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
Out[95]:
count max mean min
data1
(-2.99, -1.307] 101.0 2.730091 -0.206504 -2.922759
(-1.307, 0.37] 531.0 2.773289 -0.034422 -2.892153
(0.37, 2.0461] 344.0 3.078922 0.046008 -3.127290
(2.0461, 3.723] 24.0 1.671815 -0.020857 -3.019921
grouping=pd.qcut(frame.data1,10,labels=False)
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
Out[98]:
count max mean min
data1
0 100.0 2.730091 -0.212662 -2.922759
1 100.0 2.230289 -0.076260 -2.521450
2 100.0 2.404481 0.051184 -2.369298
3 100.0 2.773289 0.016575 -2.284056
4 100.0 2.328424 -0.043627 -2.892153
5 100.0 1.996065 -0.104378 -2.032999
6 100.0 2.166334 0.015241 -2.291139
7 100.0 2.687426 -0.057435 -2.409512
8 100.0 2.883604 0.267017 -3.127290
9 100.0 3.078922 -0.093742 -3.019921
#返回分位數編號
grouping=pd.qcut(frame.data1,10,labels=False)
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
s=Series(np.random.randn(6))#第0、2、4為nan
s[::2]=np.nan
s
a=Series(np.random.randn(9))#第0、3、6為nan
a[::3]=np.nan
a
a.dropna()
s.fillna(s.mean())
缺失值填充:
states=['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']
group_key=['East']*4+['West']*4
data=Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']]=np.nan
data
Out[117]:
Ohio 0.326110
New York 0.136178
Vermont NaN
Florida 0.392480
Oregon -0.177571
Nevada NaN
California 0.061360
Idaho NaN
dtype: float64
data.groupby(group_key).mean()
Out[118]:
East 0.284923
West -0.058105
dtype: float64
用分組平均值去填充NA
fill_mean=lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Out[120]:
Ohio 0.326110
New York 0.136178
Vermont 0.284923
Florida 0.392480
Oregon -0.177571
Nevada -0.058105
California 0.061360
Idaho -0.058105
dtype: float64
fill_values={'East':0.5,'West':-1}
fill_func=lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
Out[123]:
Ohio 0.326110
New York 0.136178
Vermont 0.500000
Florida 0.392480
Oregon -0.177571
Nevada -1.000000
California 0.061360
Idaho -1.000000
dtype: float64
隨機抽牌。先構造一副撲克牌所需的牌面名稱與點數(原文缺失的準備程式碼):
base_names=['A']+list(range(2,11))+['J','Q','K']
card_val=(list(range(1,11))+[10]*3)*4
cards=[]
for suit in ['H','S','C','D']:
cards.extend(str(num)+suit for num in base_names)
#extend()接受一個列表引數,把引數列表的元素新增到列表的尾部
deck=Series(card_val,index=cards)#長度為52的Series,索引為排名
deck[:20]
#從整副牌中抽取5張
def draw(deck, n=5):
    """Draw *n* random entries from *deck* (a Series), without replacement."""
    shuffled_positions = np.random.permutation(len(deck))
    return deck.take(shuffled_positions[:n])
draw(deck)
#想要從每種花色中抽取兩張,花色是最後一個字元,用apply進行分組
get_suit=lambda card:card[-1]#只要最後一個字母
deck.groupby(get_suit).apply(draw,n=2)
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)
分組加權平均數和相關係數。先構造帶權重的示例資料(原文缺失的準備程式碼):
df=DataFrame({'category':['a','a','a','a','b','b','b','b'],
              'data':np.random.randn(8),'weights':np.random.rand(8)})
grouped=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
Out[146]:
category
a 0.858361
b -0.098089
dtype: float64