程式人生 > 資料聚合與分組運算

資料聚合與分組運算

#資料聚合與分組
import pandas as pd
import numpy as np
# Demo frame: two label columns (key1, key2) plus two columns of
# standard-normal noise.  No seed is set, so the numbers differ on every run.
df = pd.DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df
key1 key2 data1 data2
0 a one 1.364596 0.352792
1 a two 1.685626 0.236429
2 b one -0.537077 -0.018004
3 b two 1.389866 0.826195
4 a one 0.849733 1.619383
# Group the data1 column by the labels in key1.  Evaluating the result shows
# a SeriesGroupBy object rather than numbers; an aggregation is applied next.
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000261B3E2CEB8>
# Mean of data1 within each key1 group; the result is indexed by key1.
grouped.mean()
key1
a    1.299985
b    0.426394
Name: data1, dtype: float64
# Group by two keys at once; the result carries a (key1, key2) hierarchical index.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means
key1  key2
a     one     1.107165
      two     1.685626
b     one    -0.537077
      two     1.389866
Name: data1, dtype: float64
# Pivot the inner index level (key2) into columns, leaving key1 as the rows.
means.unstack()
key2 one two
key1
a 1.107165 1.685626
b -0.537077 1.389866
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
# Group keys can be any arrays of the right length; they need not be columns
# of the frame being grouped.
df['data1'].groupby([states, years]).mean()
California  2005    1.685626
            2006   -0.537077
Ohio        2005    1.377231
            2006    0.849733
Name: data1, dtype: float64
# Mean of every numeric column per key1 group.  numeric_only=True keeps the
# text column key2 out of the aggregation; without it pandas >= 2.0 raises
# TypeError instead of silently dropping the column.
df.groupby('key1').mean(numeric_only=True)
data1 data2
key1
a 1.299985 0.736201
b 0.426394 0.404096
# Both text columns are used as group keys here, so only data1 and data2 are
# averaged.
df.groupby(['key1','key2']).mean()
data1 data2
key1 key2
a one 1.107165 0.986087
two 1.685626 0.236429
b one -0.537077 -0.018004
two 1.389866 0.826195
# size() returns the number of rows in each group.
df.groupby(['key1','key2']).size()
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64
# Iterating over groups: a GroupBy object yields (group key, sub-frame) pairs.
for key, chunk in df.groupby('key1'):
    print(key)
    print(chunk)

a
  key1 key2     data1     data2
0    a  one  1.364596  0.352792
1    a  two  1.685626  0.236429
4    a  one  0.849733  1.619383
b
  key1 key2     data1     data2
2    b  one -0.537077 -0.018004
3    b  two  1.389866  0.826195
# With several keys, the first element of each pair is a tuple of key values.
for key_pair, chunk in df.groupby(['key1', 'key2']):
    print(key_pair)
    print(chunk)
('a', 'one')
  key1 key2     data1     data2
0    a  one  1.364596  0.352792
4    a  one  0.849733  1.619383
('a', 'two')
  key1 key2     data1     data2
1    a  two  1.685626  0.236429
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.537077 -0.018004
('b', 'two')
  key1 key2     data1     data2
3    b  two  1.389866  0.826195
# Collect the groups into a dict keyed by the key1 label.
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces['b']
key1 key2 data1 data2
2 b one -0.537077 -0.018004
3 b two 1.389866 0.826195
# Grouping along the columns
# The dtype of each column; used below as the grouping key for the columns.
df.dtypes
key1      object
key2      object
data1    float64
data2    float64
dtype: object
# Partition the columns by dtype.  groupby(..., axis=1) is deprecated since
# pandas 2.1, so group the dtype Series itself (indexed by column label) and
# use each group's labels to slice the frame.  Transposing is avoided because
# it would coerce every column of this mixed frame to object dtype.
grouped = df.dtypes.groupby(df.dtypes)
for dtype, members in grouped:
    group = df[members.index]
    print(dtype)
    print(group)
float64
      data1     data2
0  1.364596  0.352792
1  1.685626  0.236429
2 -0.537077 -0.018004
3  1.389866  0.826195
4  0.849733  1.619383
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
# Selecting a column or a subset of columns
df['data1'].groupby(df['key1'])
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000261B3E74FD0>
# Selecting with a list (double brackets) aggregates only data2 and returns it
# as a one-column table.
df.groupby(['key1', 'key2'])[['data2']].mean()
data2
key1 key2
a one 0.986087
two 0.236429
b one -0.018004
two 0.826195
# Grouping with a dict or a Series
people = pd.DataFrame(
    np.random.randn(5, 5),
    index='Joe Steve Wes Jim Travis'.split(),
    columns=list('abcde'),
)
# Blank out Wes's 'b' and 'c' entries so the examples contain missing values.
people.loc['Wes', ['b', 'c']] = np.nan
people
a b c d e
Joe 0.037796 1.631083 0.455609 -1.405327 0.495940
Steve 0.304393 0.326005 0.146350 0.075903 -0.263559
Wes -0.055827 NaN NaN 1.268622 -0.541199
Jim 2.034394 0.818811 0.333991 0.158734 1.187207
Travis 2.719235 -0.459516 -0.292250 0.158169 1.102169
# Grouping dict: column label -> group name.  'f' matches no column of
# `people` and is simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
# Sum the columns that share a group name.  groupby(..., axis=1) is deprecated
# since pandas 2.1, so group the transposed frame and transpose the result
# back (safe here because every column of `people` is float).
by_column = people.T.groupby(mapping)
by_column.sum().T
blue red
Joe -0.949718 2.164819
Steve 0.222253 0.366838
Wes 1.268622 -0.597026
Jim 0.492726 4.040412
Travis -0.134082 3.361889
# The same label -> group mapping expressed as a Series.
map_series = pd.Series(mapping)
map_series
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
# Count the non-missing values per row within each column group.  The frame
# is transposed instead of passing axis=1, which is deprecated since pandas 2.1.
people.T.groupby(map_series).count().T
blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3
# Grouping with a function
# len is applied to each index label, so rows are grouped by name length.
people.groupby(len).sum()
a b c d e
3 2.016363 2.449894 0.789601 0.022029 1.141948
5 0.304393 0.326005 0.146350 0.075903 -0.263559
6 2.719235 -0.459516 -0.292250 0.158169 1.102169
# A function can be combined with a list of labels in one key list: rows are
# grouped by name length and by the matching entry of key_list.
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()
a b c d e
3 one -0.055827 1.631083 0.455609 -1.405327 -0.541199
two 2.034394 0.818811 0.333991 0.158734 1.187207
5 one 0.304393 0.326005 0.146350 0.075903 -0.263559
6 two 2.719235 -0.459516 -0.292250 0.158169 1.102169
# Grouping by index level
# Two-level column index: country code ('cty') over tenor.
columns = pd.MultiIndex.from_tuples(
    [('US', 1), ('US', 3), ('US', 5), ('JP', 1), ('JP', 3)],
    names=['cty', 'tenor'],
)
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
cty US JP
tenor 1 3 5 1 3
0 -1.865861 0.145454 1.476689 0.798411 0.546048
1 1.887149 0.218613 0.951165 2.790860 -2.419909
2 -0.681806 -0.490238 -2.215909 0.375548 0.145144
3 0.501322 1.358101 -1.069453 1.916614 -0.018305
# To group by level, pass the level number or name via the `level` keyword.
# The frame is transposed so the column level becomes an index level, because
# groupby(..., axis=1) is deprecated since pandas 2.1; the count is transposed
# back afterwards.
hier_df.T.groupby(level='cty').count().T
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3
# Data aggregation
# Aggregation means any data transformation that produces scalar values from
# arrays.
df
key1 key2 data1 data2
0 a one 1.364596 0.352792
1 a two 1.685626 0.236429
2 b one -0.537077 -0.018004
3 b two 1.389866 0.826195
4 a one 0.849733 1.619383
# Regroup the whole frame by key1, then take the 0.9 quantile of data1 in
# each group.
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
key1
a    1.621420
b    1.197171
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` may be any object exposing ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# To use your own aggregation function, pass it to aggregate or agg.
# The numeric columns are selected first: peak_to_peak subtracts values, which
# is undefined for the string column key2, and pandas >= 2.0 raises instead of
# silently dropping such columns.
grouped[['data1', 'data2']].agg(peak_to_peak)
data1 data2
key1
a 0.835893 1.382954
b 1.926943 0.844199
# Summary statistics (count, mean, std, min, quartiles, max) for each group.
grouped.describe()
data1 data2
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
key1
a 3.0 1.299985 0.421675 0.849733 1.107165 1.364596 1.525111 1.685626 3.0 0.736201 0.767068 0.236429 0.294610 0.352792 0.986087 1.619383
b 2.0 0.426394 1.362554 -0.537077 -0.055342 0.426394 0.908130 1.389866 2.0 0.404096 0.596939 -0.018004 0.193046 0.404096 0.615145 0.826195
# Column-wise and multiple function application
tips = pd.read_csv('examples/tips.csv')
# Tip as a fraction of the total bill.
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
tips.head(6)
total_bill tip smoker day time size tip_pct
0 16.99 1.01 No Sun Dinner 2 0.059447
1 10.34 1.66 No Sun Dinner 3 0.160542
2 21.01 3.50 No Sun Dinner 3 0.166587
3 23.68 3.31 No Sun Dinner 2 0.139780
4 24.59 3.61 No Sun Dinner 4 0.146808
5 25.29 4.71 No Sun Dinner 4 0.186240
# Group the tips by day and smoker, then select tip_pct from the grouped
# frame.
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
# An aggregation can be named by string.
grouped_pct.agg('mean')
day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64
# A list of functions yields one result column per function; the plain
# function peak_to_peak is labelled with its own name.
grouped_pct.agg(['mean', 'std', peak_to_peak])
mean std peak_to_peak
day smoker
Fri No 0.151650 0.028123 0.067349
Yes 0.174783 0.051293 0.159925
Sat No 0.158048 0.039767 0.235193
Yes 0.147906 0.061375 0.290095
Sun No 0.160113 0.042347 0.193226
Yes 0.187250 0.154134 0.644685
Thur No 0.160298 0.038774 0.193350
Yes 0.163863 0.039389 0.151240
# Renaming the aggregated columns: pass (name, function) pairs.
# 'std' is given as a string because handing np.std to agg is deprecated
# since pandas 2.1 (it is currently mapped to the groupby std, and a future
# version is slated to call the NumPy function as given).
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])
foo bar
day smoker
Fri No 0.151650 0.028123
Yes 0.174783 0.051293
Sat No 0.158048 0.039767
Yes 0.147906 0.061375
Sun No 0.160113 0.042347
Yes 0.187250 0.154134
Thur No 0.160298 0.038774
Yes 0.163863 0.039389
# Apply the same three aggregations to two columns; the result has
# hierarchical columns (source column, then function name).
functions = ['count', 'mean', 'max']
# Several columns must be selected with a list: the bare-tuple form
# grouped['tip_pct', 'total_bill'] was removed in pandas 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
tip_pct total_bill
count mean max count mean max
day smoker
Fri No 4 0.151650 0.187735 4 18.420000 22.75
Yes 15 0.174783 0.263480 15 16.813333 40.17
Sat No 45 0.158048 0.291990 45 19.661778 48.33
Yes 42 0.147906 0.325733 42 21.276667 50.81
Sun No 57 0.160113 0.252672 57 20.506667 48.17
Yes 19 0.187250 0.710345 19 24.120000 45.35
Thur No 45 0.160298 0.266312 45 17.113111 41.19
Yes 17 0.163863 0.241255 17 19.190588 43.11
# (column name, function) pairs applied to both selected columns.
# 'var' is given as a string because passing np.var to agg is deprecated since
# pandas 2.1; the string form is the groupby (sample) variance.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
# List selection replaces the tuple form removed in pandas 2.0.
grouped[['tip_pct', 'total_bill']].agg(ftuples)
tip_pct total_bill
Durchschnitt Abweichung Durchschnitt Abweichung
day smoker
Fri No 0.151650 0.000791 18.420000 25.596333
Yes 0.174783 0.002631 16.813333 82.562438
Sat No 0.158048 0.001581 19.661778 79.908965
Yes 0.147906 0.003767 21.276667 101.387535
Sun No 0.160113 0.001793 20.506667 66.099980
Yes 0.187250 0.023757 24.120000 109.046044
Thur No 0.160298 0.001503 17.113111 59.625081
Yes 0.163863 0.001551 19.190588 69.808518
# Applying different functions to one or more columns: pass a dict mapping
# column name -> function (or list of functions).
# 'max' is given as a string because passing np.max to agg is deprecated since
# pandas 2.1.
grouped.agg({'tip': 'max', 'size': 'sum'})
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'],
             'size': 'sum'})
tip_pct size
min max mean std sum
day smoker
Fri No 0.120385 0.187735 0.151650 0.028123 9
Yes 0.103555 0.263480 0.174783 0.051293 31
Sat No 0.056797 0.291990 0.158048 0.039767 115
Yes 0.035638 0.325733 0.147906 0.061375 104
Sun No 0.059447 0.252672 0.160113 0.042347 167
Yes 0.065660 0.710345 0.187250 0.154134 49
Thur No 0.072961 0.266312 0.160298 0.038774 112
Yes 0.090014 0.241255 0.163863 0.039389 40
#apply:一般性的拆分應用合併
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows are returned in ascending order of ``column``, so the largest value
    comes last.

    Args:
        df: DataFrame to select rows from.
        n: Number of rows to keep; ``0`` yields an empty frame.
        column: Label of the column to rank by.

    Returns:
        A DataFrame holding the selected rows with their original index labels.
    """
    # tail(n) instead of [-n:]: the slice [-0:] would select every row when
    # n is 0, whereas tail(0) is empty.  For any other n the two are identical.
    return df.sort_values(by=column).tail(n)
# The six rows of the whole table with the highest tip_pct (top's default column).
top(tips,n=6)
total_bill tip smoker day time size tip_pct
109 14.31 4.00 Yes Sat Dinner 2 0.279525
183 23.17 6.50 Yes Sun Dinner 4 0.280535
232 11.61 3.39 No Sat Dinner 2 0.291990
67 3.07 1.00 Yes Sat Dinner 1 0.325733
178 9.60 4.00 Yes Sun Dinner 2 0.416667
172 7.25 5.15 Yes Sun Dinner 2 0.710345
# Call top() on each smoker group and stack the pieces; the group label
# becomes the outer level of the resulting index.
tips.groupby('smoker').apply(top)
total_bill tip smoker day time size tip_pct
smoker
No 88 24.71 5.85 No Thur Lunch 2 0.236746
185 20.69 5.00 No Sun Dinner 5 0.241663
51 10.29 2.60 No Sun Dinner 2 0.252672
149 7.51 2.00 No Thur Lunch 2 0.266312
232 11.61 3.39 No Sat Dinner 2 0.291990
Yes 109 14.31 4.00 Yes Sat Dinner 2 0.279525
183 23.17 6.50 Yes Sun Dinner 4 0.280535
67 3.07 1.00 Yes Sat Dinner 1 0.325733
178 9.60 4.00 Yes Sun Dinner 2 0.416667
172 7.25 5.15 Yes Sun Dinner 2 0.710345
# Keyword arguments given after the function are forwarded to top(): here the
# single row with the largest total_bill in each (smoker, day) group.
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
total_bill tip smoker day time size tip_pct
smoker day
No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857
Sat 212 48.33 9.00 No Sat Dinner 4 0.186220
Sun 156 48.17 5.00 No Sun Dinner 6 0.103799
Thur 142 41.19 5.00 No Thur Lunch 5 0.121389
Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750
Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812
Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178
Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
# Summary statistics of tip_pct for each smoker group (one row per group).
result = tips.groupby('smoker')['tip_pct'].describe()
result
count mean std min 25% 50% 75% max
smoker
No 151.0 0.159328 0.039910 0.056797 0.136906 0.155625 0.185014 0.291990
Yes 93.0 0.163196 0.085119 0.035638 0.106771 0.153846 0.195059 0.710345
# Reshape into a single Series indexed by (statistic, smoker).
result.unstack('smoker')
       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64
# Suppressing the group keys
# With group_keys=False the result keeps only the original row labels; the
# smoker label is not added as an index level.
tips.groupby('smoker',group_keys=False).apply(top)
total_bill tip smoker day time size tip_pct
88 24.71 5.85 No Thur Lunch 2 0.236746
185 20.69 5.00 No Sun Dinner 5 0.241663
51 10.29 2.60 No Sun Dinner 2 0.252672
149 7.51 2.00 No Thur Lunch 2 0.266312
232 11.61 3.39 No Sat Dinner 2 0.291990
109 14.31 4.00 Yes Sat Dinner 2 0.279525
183 23.17 6.50 Yes Sun Dinner 4 0.280535
67 3.07 1.00 Yes Sat Dinner 1 0.325733
178 9.60 4.00 Yes Sun Dinner 2 0.416667
172 7.25 5.15 Yes Sun Dinner 2 0.710345
#分位數和桶分析
frame = pd.DataFrame({'data1':np.ran