資料聚合與分組運算
阿新 • • 發佈:2018-11-07
#資料聚合與分組
import pandas as pd
import numpy as np
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.random.randn(5),
'data2' : np.random.randn(5)})
df
key1 | key2 | data1 | data2 | |
---|---|---|---|---|
0 | a | one | 1.364596 | 0.352792 |
1 | a | two | 1.685626 | 0.236429 |
2 | b | one | -0.537077 | -0.018004 |
3 | b | two | 1.389866 | 0.826195 |
4 | a | one | 0.849733 | 1.619383 |
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000261B3E2CEB8>
grouped.mean()
key1
a 1.299985
b 0.426394
Name: data1, dtype: float64
means = df['data1'].groupby([df[ 'key1'],df['key2']]).mean()#兩個引數分組
means
key1 key2
a one 1.107165
two 1.685626
b one -0.537077
two 1.389866
Name: data1, dtype: float64
means.unstack()
key2 | one | two |
---|---|---|
key1 | ||
a | 1.107165 | 1.685626 |
b | -0.537077 | 1.389866 |
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
#分組鍵可以是任何長度適當的陣列
df['data1'].groupby([states, years]).mean()
California 2005 1.685626
2006 -0.537077
Ohio 2005 1.377231
2006 0.849733
Name: data1, dtype: float64
df.groupby('key1').mean()
data1 | data2 | |
---|---|---|
key1 | ||
a | 1.299985 | 0.736201 |
b | 0.426394 | 0.404096 |
df.groupby(['key1','key2']).mean()
data1 | data2 | ||
---|---|---|---|
key1 | key2 | ||
a | one | 1.107165 | 0.986087 |
two | 1.685626 | 0.236429 | |
b | one | -0.537077 | -0.018004 |
two | 1.389866 | 0.826195 |
df.groupby(['key1','key2']).size()#返回分組大小
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
# Iterating over groups: a GroupBy object supports iteration and yields
# (group name, sub-DataFrame) pairs.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
a
key1 key2 data1 data2
0 a one 1.364596 0.352792
1 a two 1.685626 0.236429
4 a one 0.849733 1.619383
b
key1 key2 data1 data2
2 b one -0.537077 -0.018004
3 b two 1.389866 0.826195
# With several grouping keys, the group name is a tuple of the key values,
# so it can be unpacked directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
('a', 'one')
key1 key2 data1 data2
0 a one 1.364596 0.352792
4 a one 0.849733 1.619383
('a', 'two')
key1 key2 data1 data2
1 a two 1.685626 0.236429
('b', 'one')
key1 key2 data1 data2
2 b one -0.537077 -0.018004
('b', 'two')
key1 key2 data1 data2
3 b two 1.389866 0.826195
#將這些資料片段做成一個字典:
pieces = dict(list(df.groupby('key1')))
pieces['b']
key1 | key2 | data1 | data2 | |
---|---|---|---|---|
2 | b | one | -0.537077 | -0.018004 |
3 | b | two | 1.389866 | 0.826195 |
#對列進行分組
df.dtypes
key1 object
key2 object
data1 float64
data2 float64
dtype: object
grouped = df.groupby(df.dtypes,axis=1)
# `grouped` was built by grouping the columns on their dtype, so each
# iteration yields a dtype and the sub-frame of columns sharing it.
for dtype, group in grouped:
    print(dtype)
    print(group)
float64
data1 data2
0 1.364596 0.352792
1 1.685626 0.236429
2 -0.537077 -0.018004
3 1.389866 0.826195
4 0.849733 1.619383
object
key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one
#選取一列或列的子集
df['data1'].groupby(df['key1'])
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000261B3E74FD0>
df.groupby(['key1', 'key2'])[['data2']].mean()
data2 | ||
---|---|---|
key1 | key2 | |
a | one | 0.986087 |
two | 0.236429 | |
b | one | -0.018004 |
two | 0.826195 |
#通過字典或Series進行分組
people = pd.DataFrame(np.random.randn(5,5),
columns=['a','b','c','d','e'],
index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.iloc[2:3,[1,2]] = np.nan
people
a | b | c | d | e | |
---|---|---|---|---|---|
Joe | 0.037796 | 1.631083 | 0.455609 | -1.405327 | 0.495940 |
Steve | 0.304393 | 0.326005 | 0.146350 | 0.075903 | -0.263559 |
Wes | -0.055827 | NaN | NaN | 1.268622 | -0.541199 |
Jim | 2.034394 | 0.818811 | 0.333991 | 0.158734 | 1.187207 |
Travis | 2.719235 | -0.459516 | -0.292250 | 0.158169 | 1.102169 |
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',#分組字典
'd': 'blue', 'e': 'red', 'f' : 'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
blue | red | |
---|---|---|
Joe | -0.949718 | 2.164819 |
Steve | 0.222253 | 0.366838 |
Wes | 1.268622 | -0.597026 |
Jim | 0.492726 | 4.040412 |
Travis | -0.134082 | 3.361889 |
map_series = pd.Series(mapping)
map_series
a red
b red
c blue
d blue
e red
f orange
dtype: object
people.groupby(map_series,axis=1).count()
blue | red | |
---|---|---|
Joe | 2 | 3 |
Steve | 2 | 3 |
Wes | 1 | 2 |
Jim | 2 | 3 |
Travis | 2 | 3 |
#通過函式進行分組
people.groupby(len).sum()
a | b | c | d | e | |
---|---|---|---|---|---|
3 | 2.016363 | 2.449894 | 0.789601 | 0.022029 | 1.141948 |
5 | 0.304393 | 0.326005 | 0.146350 | 0.075903 | -0.263559 |
6 | 2.719235 | -0.459516 | -0.292250 | 0.158169 | 1.102169 |
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()
a | b | c | d | e | ||
---|---|---|---|---|---|---|
3 | one | -0.055827 | 1.631083 | 0.455609 | -1.405327 | -0.541199 |
two | 2.034394 | 0.818811 | 0.333991 | 0.158734 | 1.187207 | |
5 | one | 0.304393 | 0.326005 | 0.146350 | 0.075903 | -0.263559 |
6 | two | 2.719235 | -0.459516 | -0.292250 | 0.158169 | 1.102169 |
#根據索引級別分組
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
[1, 3, 5, 1, 3]],
names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
cty | US | JP | |||
---|---|---|---|---|---|
tenor | 1 | 3 | 5 | 1 | 3 |
0 | -1.865861 | 0.145454 | 1.476689 | 0.798411 | 0.546048 |
1 | 1.887149 | 0.218613 | 0.951165 | 2.790860 | -2.419909 |
2 | -0.681806 | -0.490238 | -2.215909 | 0.375548 | 0.145144 |
3 | 0.501322 | 1.358101 | -1.069453 | 1.916614 | -0.018305 |
#根據級別分組,使用level關鍵字傳遞級別序號或名字:
hier_df.groupby(level='cty',axis=1).count()
cty | JP | US |
---|---|---|
0 | 2 | 3 |
1 | 2 | 3 |
2 | 2 | 3 |
3 | 2 | 3 |
#資料聚合
#聚合指的是任何能夠從陣列產生標量值的資料轉換過程
df
key1 | key2 | data1 | data2 | |
---|---|---|---|---|
0 | a | one | 1.364596 | 0.352792 |
1 | a | two | 1.685626 | 0.236429 |
2 | b | one | -0.537077 | -0.018004 |
3 | b | two | 1.389866 | 0.826195 |
4 | a | one | 0.849733 | 1.619383 |
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
key1
a 1.621420
b 1.197171
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods; below it is
    passed to ``agg`` as a custom aggregation function.
    """
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)#使用你自己的聚合函式,只需將其傳入aggregate或agg方法
data1 | data2 | |
---|---|---|
key1 | ||
a | 0.835893 | 1.382954 |
b | 1.926943 | 0.844199 |
grouped.describe()
data1 | data2 | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
key1 | ||||||||||||||||
a | 3.0 | 1.299985 | 0.421675 | 0.849733 | 1.107165 | 1.364596 | 1.525111 | 1.685626 | 3.0 | 0.736201 | 0.767068 | 0.236429 | 0.294610 | 0.352792 | 0.986087 | 1.619383 |
b | 2.0 | 0.426394 | 1.362554 | -0.537077 | -0.055342 | 0.426394 | 0.908130 | 1.389866 | 2.0 | 0.404096 | 0.596939 | -0.018004 | 0.193046 | 0.404096 | 0.615145 | 0.826195 |
#面向列的多函式應用
tips = pd.read_csv('examples/tips.csv')
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]
total_bill | tip | smoker | day | time | size | tip_pct | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | No | Sun | Dinner | 2 | 0.059447 |
1 | 10.34 | 1.66 | No | Sun | Dinner | 3 | 0.160542 |
2 | 21.01 | 3.50 | No | Sun | Dinner | 3 | 0.166587 |
3 | 23.68 | 3.31 | No | Sun | Dinner | 2 | 0.139780 |
4 | 24.59 | 3.61 | No | Sun | Dinner | 4 | 0.146808 |
5 | 25.29 | 4.71 | No | Sun | Dinner | 4 | 0.186240 |
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day smoker
Fri No 0.151650
Yes 0.174783
Sat No 0.158048
Yes 0.147906
Sun No 0.160113
Yes 0.187250
Thur No 0.160298
Yes 0.163863
Name: tip_pct, dtype: float64
grouped_pct.agg(['mean', 'std', peak_to_peak])
mean | std | peak_to_peak | ||
---|---|---|---|---|
day | smoker | |||
Fri | No | 0.151650 | 0.028123 | 0.067349 |
Yes | 0.174783 | 0.051293 | 0.159925 | |
Sat | No | 0.158048 | 0.039767 | 0.235193 |
Yes | 0.147906 | 0.061375 | 0.290095 | |
Sun | No | 0.160113 | 0.042347 | 0.193226 |
Yes | 0.187250 | 0.154134 | 0.644685 | |
Thur | No | 0.160298 | 0.038774 | 0.193350 |
Yes | 0.163863 | 0.039389 | 0.151240 |
#修改聚合名字
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])
foo | bar | ||
---|---|---|---|
day | smoker | ||
Fri | No | 0.151650 | 0.028123 |
Yes | 0.174783 | 0.051293 | |
Sat | No | 0.158048 | 0.039767 |
Yes | 0.147906 | 0.061375 | |
Sun | No | 0.160113 | 0.042347 |
Yes | 0.187250 | 0.154134 | |
Thur | No | 0.160298 | 0.038774 |
Yes | 0.163863 | 0.039389 |
functions = ['count', 'mean', 'max']
# Select several columns from a GroupBy with a list. The bare-tuple form
# grouped['a', 'b'] was deprecated in pandas 1.0 and raises in pandas 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
tip_pct | total_bill | ||||||
---|---|---|---|---|---|---|---|
count | mean | max | count | mean | max | ||
day | smoker | ||||||
Fri | No | 4 | 0.151650 | 0.187735 | 4 | 18.420000 | 22.75 |
Yes | 15 | 0.174783 | 0.263480 | 15 | 16.813333 | 40.17 | |
Sat | No | 45 | 0.158048 | 0.291990 | 45 | 19.661778 | 48.33 |
Yes | 42 | 0.147906 | 0.325733 | 42 | 21.276667 | 50.81 | |
Sun | No | 57 | 0.160113 | 0.252672 | 57 | 20.506667 | 48.17 |
Yes | 19 | 0.187250 | 0.710345 | 19 | 24.120000 | 45.35 | |
Thur | No | 45 | 0.160298 | 0.266312 | 45 | 17.113111 | 41.19 |
Yes | 17 | 0.163863 | 0.241255 | 17 | 19.190588 | 43.11 |
# Each (name, function) tuple gives the output column name and the
# aggregation that produces it.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]
# Use a list to pick several columns; the bare-tuple form
# grouped['a', 'b'] raises in pandas 2.0.
grouped[['tip_pct', 'total_bill']].agg(ftuples)
tip_pct | total_bill | ||||
---|---|---|---|---|---|
Durchschnitt | Abweichung | Durchschnitt | Abweichung | ||
day | smoker | ||||
Fri | No | 0.151650 | 0.000791 | 18.420000 | 25.596333 |
Yes | 0.174783 | 0.002631 | 16.813333 | 82.562438 | |
Sat | No | 0.158048 | 0.001581 | 19.661778 | 79.908965 |
Yes | 0.147906 | 0.003767 | 21.276667 | 101.387535 | |
Sun | No | 0.160113 | 0.001793 | 20.506667 | 66.099980 |
Yes | 0.187250 | 0.023757 | 24.120000 | 109.046044 | |
Thur | No | 0.160298 | 0.001503 | 17.113111 | 59.625081 |
Yes | 0.163863 | 0.001551 | 19.190588 | 69.808518 |
#對一個列或不同的列應用不同的函式
grouped.agg({'tip' : np.max, 'size' : 'sum'})
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
'size' : 'sum'})
tip_pct | size | |||||
---|---|---|---|---|---|---|
min | max | mean | std | sum | ||
day | smoker | |||||
Fri | No | 0.120385 | 0.187735 | 0.151650 | 0.028123 | 9 |
Yes | 0.103555 | 0.263480 | 0.174783 | 0.051293 | 31 | |
Sat | No | 0.056797 | 0.291990 | 0.158048 | 0.039767 | 115 |
Yes | 0.035638 | 0.325733 | 0.147906 | 0.061375 | 104 | |
Sun | No | 0.059447 | 0.252672 | 0.160113 | 0.042347 | 167 |
Yes | 0.065660 | 0.710345 | 0.187250 | 0.154134 | 49 | |
Thur | No | 0.072961 | 0.266312 | 0.160298 | 0.038774 | 112 |
Yes | 0.090014 | 0.241255 | 0.163863 | 0.039389 | 40 |
#apply:一般性的拆分應用合併
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` holding the largest ``column`` values.

    Rows are sorted ascending by ``column``, so the largest value is the last
    row of the result.

    ``tail(n)`` is used instead of the slice ``[-n:]``: the two agree for
    every non-zero ``n``, but ``[-0:]`` selects all rows whereas ``tail(0)``
    returns an empty frame, which is what "top 0" should mean.
    """
    return df.sort_values(by=column).tail(n)
top(tips,n=6)
total_bill | tip | smoker | day | time | size | tip_pct | |
---|---|---|---|---|---|---|---|
109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
tips.groupby('smoker').apply(top)
total_bill | tip | smoker | day | time | size | tip_pct | ||
---|---|---|---|---|---|---|---|---|
smoker | ||||||||
No | 88 | 24.71 | 5.85 | No | Thur | Lunch | 2 | 0.236746 |
185 | 20.69 | 5.00 | No | Sun | Dinner | 5 | 0.241663 | |
51 | 10.29 | 2.60 | No | Sun | Dinner | 2 | 0.252672 | |
149 | 7.51 | 2.00 | No | Thur | Lunch | 2 | 0.266312 | |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 | |
Yes | 109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 | |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 | |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 | |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
total_bill | tip | smoker | day | time | size | tip_pct | |||
---|---|---|---|---|---|---|---|---|---|
smoker | day | ||||||||
No | Fri | 94 | 22.75 | 3.25 | No | Fri | Dinner | 2 | 0.142857 |
Sat | 212 | 48.33 | 9.00 | No | Sat | Dinner | 4 | 0.186220 | |
Sun | 156 | 48.17 | 5.00 | No | Sun | Dinner | 6 | 0.103799 | |
Thur | 142 | 41.19 | 5.00 | No | Thur | Lunch | 5 | 0.121389 | |
Yes | Fri | 95 | 40.17 | 4.73 | Yes | Fri | Dinner | 4 | 0.117750 |
Sat | 170 | 50.81 | 10.00 | Yes | Sat | Dinner | 3 | 0.196812 | |
Sun | 182 | 45.35 | 3.50 | Yes | Sun | Dinner | 3 | 0.077178 | |
Thur | 197 | 43.11 | 5.00 | Yes | Thur | Lunch | 4 | 0.115982 |
result = tips.groupby('smoker')['tip_pct'].describe()
result
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
smoker | ||||||||
No | 151.0 | 0.159328 | 0.039910 | 0.056797 | 0.136906 | 0.155625 | 0.185014 | 0.291990 |
Yes | 93.0 | 0.163196 | 0.085119 | 0.035638 | 0.106771 | 0.153846 | 0.195059 | 0.710345 |
result.unstack('smoker')
smoker
count No 151.000000
Yes 93.000000
mean No 0.159328
Yes 0.163196
std No 0.039910
Yes 0.085119
min No 0.056797
Yes 0.035638
25% No 0.136906
Yes 0.106771
50% No 0.155625
Yes 0.153846
75% No 0.185014
Yes 0.195059
max No 0.291990
Yes 0.710345
dtype: float64
#禁止分組鍵
tips.groupby('smoker',group_keys=False).apply(top)
total_bill | tip | smoker | day | time | size | tip_pct | |
---|---|---|---|---|---|---|---|
88 | 24.71 | 5.85 | No | Thur | Lunch | 2 | 0.236746 |
185 | 20.69 | 5.00 | No | Sun | Dinner | 5 | 0.241663 |
51 | 10.29 | 2.60 | No | Sun | Dinner | 2 | 0.252672 |
149 | 7.51 | 2.00 | No | Thur | Lunch | 2 | 0.266312 |
232 | 11.61 | 3.39 | No | Sat | Dinner | 2 | 0.291990 |
109 | 14.31 | 4.00 | Yes | Sat | Dinner | 2 | 0.279525 |
183 | 23.17 | 6.50 | Yes | Sun | Dinner | 4 | 0.280535 |
67 | 3.07 | 1.00 | Yes | Sat | Dinner | 1 | 0.325733 |
178 | 9.60 | 4.00 | Yes | Sun | Dinner | 2 | 0.416667 |
172 | 7.25 | 5.15 | Yes | Sun | Dinner | 2 | 0.710345 |
#分位數和桶分析
frame = pd.DataFrame({'data1':np.ran