1. 程式人生 > >Python科學計算工具Pandas(九)—— 資料分組

Python科學計算工具Pandas(九)—— 資料分組

Python科學計算工具Pandas(九)—— 資料分組

分組統計 - groupby功能

  1. 根據某些條件將資料拆分成組
  2. 對每個組獨立應用函式
  3. 將結果合併到一個數據結構中

DataFrame在行(axis=0)或列(axis=1)上進行分組,將一個函式應用到各個分組並產生一個新值,然後函式執行結果被合併到最終的結果物件中。

df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)

分組的基本操作

分組

# Grouping basics
import numpy as np
import pandas as pd

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
print(df)
print('-----')

# groupby() does not hand back a DataFrame; it returns a dedicated GroupBy
# object. size() reports how many records fall into each group.
dfa = df.groupby('A')
print(dfa.size(), type(dfa))
print('========')

# Aggregating a grouped object yields a new DataFrame (or a Series when a
# single column is selected first).
# Grouping is row-wise (axis=0) by default, and the key may be a single column
# or a list of columns.
# numeric_only=True keeps the string column 'B' out of the sum so the result
# contains only 'C' and 'D'.
a = df.groupby('A').sum(numeric_only=True)
print(a, type(a))
b = df.groupby(['A','B']).mean()
print(b, type(b))
c = df.groupby('A')['D'].mean()
print(c, type(c))
     A      B         C         D
0  foo    one  2.479737 -2.368789
1  bar    one  1.028346  0.950277
2  foo    two  1.001758 -1.278156
3  bar  three -0.205714 -0.330909
4  foo    two  0.337572  1.256110
5  bar    two  0.244171 -0.820276
6  foo    one  0.554198  0.683419
7  foo  three -0.534419 -0.319840
-----
A
bar    3
foo    5
dtype: int64 <class 'pandas.core.groupby.DataFrameGroupBy'>
========
            C         D
A                      
bar  1.066804 -0.200907
foo  3.838847 -2.027256 <class 'pandas.core.frame.DataFrame'>
                  C         D
A   B                        
bar one    1.028346  0.950277
    three -0.205714 -0.330909
    two    0.244171 -0.820276
foo one    1.516967 -0.842685
    three -0.534419 -0.319840
    two    0.669665 -0.011023 <class 'pandas.core.frame.DataFrame'>
A
bar   -0.066969
foo   -0.405451
Name: D, dtype: float64 <class 'pandas.core.series.Series'>

分組是一個可迭代的物件

# 分組 - 可迭代物件

df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X')))
print('-----')


print(list(df.groupby('X')), '→ 可迭代物件,直接生成list\n')
print(list(df.groupby('X'))[0], '→ 以元祖形式顯示\n')

for n,g in df.groupby('X'):
    print(n)
    print(g, type(g))
    print('======')
# n是組名,g是分組後的Dataframe
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
<pandas.core.groupby.DataFrameGroupBy object at 0x000002AF2EE7C080> <class 'pandas.core.groupby.DataFrameGroupBy'>
-----
[('A',    X  Y
0  A  1
2  A  3), ('B',    X  Y
1  B  4
3  B  2)] → 可迭代物件,直接生成list

('A',    X  Y
0  A  1
2  A  3) → 以元祖形式顯示

A
   X  Y
0  A  1
2  A  3 <class 'pandas.core.frame.DataFrame'>
======
B
   X  Y
1  B  4
3  B  2 <class 'pandas.core.frame.DataFrame'>
======

選擇分組 .get_group()

# 提取分組後的某組

df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})

print(df)
print('-------')

print(df.groupby('X').get_group('A'))
print('-------')
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
-------
   X  Y
0  A  1
2  A  3

將分組轉化為字典 .groups

#  將分組轉化為字典

df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
print(df)
print('---------')

a = df.groupby('X')
print(a.groups,'\n')
print(a.groups['A'],'\n')
print(a.groups['A'][0])
#  字典的值為index
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
---------
{'A': Int64Index([0, 2], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64')} 

Int64Index([0, 2], dtype='int64') 

0

檢視分組裡的記錄數 .size()

#  .size()  檢視分組中的記錄的統計數目

df = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})

print(df)
print('====')

a = df.groupby('X')
print(a.size())
   X  Y
0  A  1
1  B  4
2  A  3
3  B  2
====
X
A    2
B    2
dtype: int64

多個列分組

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
grouped = df.groupby(['A','B']).groups
print(df)
print('---------')
print(grouped)
print('=====')
print(grouped[('foo', 'three')])
# 按照兩個列進行分組
     A      B         C         D
0  foo    one -0.539735  0.252334
1  bar    one  1.247811 -0.144133
2  foo    two -0.965486  0.042095
3  bar  three -0.158520 -0.667123
4  foo    two  1.283692  1.201100
5  bar    two -0.795091  0.368176
6  foo    one -0.263945  0.085682
7  foo  three  0.710263 -1.238407
---------
{('bar', 'one'): Int64Index([1], dtype='int64'), ('bar', 'three'): Int64Index([3], dtype='int64'), ('bar', 'two'): Int64Index([5], dtype='int64'), ('foo', 'one'): Int64Index([0, 6], dtype='int64'), ('foo', 'three'): Int64Index([7], dtype='int64'), ('foo', 'two'): Int64Index([2, 4], dtype='int64')}
=====
Int64Index([7], dtype='int64')

在其他軸上分組

df = pd.DataFrame({'data1':np.random.rand(2),
                  'data2':np.random.rand(2),
                  'key1':['a','b'],
                  'key2':['one','two']})
print(df)
print('------')
print(df.dtypes)
print('------')

for n,p in df.groupby(df.dtypes, axis=1):
    print(n)
    print(p)
    print('===')
# 按照值型別分列
      data1    data2 key1 key2
0  0.257623  0.81153    a  one
1  0.325821  0.78845    b  two
------
data1    float64
data2    float64
key1      object
key2      object
dtype: object
------
float64
      data1    data2
0  0.257623  0.81153
1  0.325821  0.78845
===
object
  key1 key2
0    a  one
1    b  two
===

通過字典或者Series分組

# 通過字典或者Series分組

df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'])
print(df)
print('-----')

mapping = {'a':'one','b':'one','c':'two','d':'two','e':'three'}
by_column = df.groupby(mapping, axis = 1)
print(by_column.sum())
print('-----')
# mapping中,a、b列對應的為one,c、d列對應的為two,以字典來分組

s = pd.Series(mapping)
print(s,'\n')
print(s.groupby(s).count())
# s中,index中a、b對應的為one,c、d對應的為two,以Series來分組
'''??????'''
    a   b   c   d
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
-----
   one  two
0    1    5
1    9   13
2   17   21
3   25   29
-----
a      one
b      one
c      two
d      two
e    three
dtype: object 

one      2
three    1
two      2
dtype: int64





'??????'

通過函式分組

# 通過函式分組

df = pd.DataFrame(np.arange(16).reshape(4,4),
                  columns = ['a','b','c','d'],
                 index = ['abc','bcd','aa','b'])
print(df,'\n')
print(df.groupby(len).sum())
# 按照字母長度分組
      a   b   c   d
abc   0   1   2   3
bcd   4   5   6   7
aa    8   9  10  11
b    12  13  14  15 

    a   b   c   d
1  12  13  14  15
2   8   9  10  11
3   4   6   8  10

分組中常見的函式

# 分組計算函式方法

s = pd.Series([1, 2, 3, 10, 20, 30], index = [1, 2, 3, 1, 2, 3])
grouped = s.groupby(level=0)  # 唯一索引用.groupby(level=0),將同一個index的分為一組
print(grouped)
print(grouped.first(),'→ first:非NaN的第一個值\n')
print(grouped.last(),'→ last:非NaN的最後一個值\n')
print(grouped.sum(),'→ sum:非NaN的和\n')
print(grouped.mean(),'→ mean:非NaN的平均值\n')
print(grouped.median(),'→ median:非NaN的算術中位數\n')
print(grouped.count(),'→ count:非NaN的值\n')
print(grouped.min(),'→ min、max:非NaN的最小值、最大值\n')
print(grouped.std(),'→ std,var:非NaN的標準差和方差\n')
print(grouped.prod(),'→ prod:非NaN的積\n')
<pandas.core.groupby.SeriesGroupBy object at 0x000002AF2F1B7278>
1    1
2    2
3    3
dtype: int64 → first:非NaN的第一個值

1    10
2    20
3    30
dtype: int64 → last:非NaN的最後一個值

1    11
2    22
3    33
dtype: int64 → sum:非NaN的和

1     5.5
2    11.0
3    16.5
dtype: float64 → mean:非NaN的平均值

1     5.5
2    11.0
3    16.5
dtype: float64 → median:非NaN的算術中位數

1    2
2    2
3    2
dtype: int64 → count:非NaN的值

1    1
2    2
3    3
dtype: int64 → min、max:非NaN的最小值、最大值

1     6.363961
2    12.727922
3    19.091883
dtype: float64 → std,var:非NaN的標準差和方差

1    10
2    40
3    90
dtype: int64 → prod:非NaN的積

多函式計算

# Applying several aggregation functions at once: agg()

df = pd.DataFrame({'a':[1,1,2,2],
                  'b':np.random.randint(100, size=4),
                  'c':np.random.randint(100, size=4),
                  'd':np.random.randint(100, size=4)})
print(df)

# Aggregations can be named with strings or passed as callables; string names
# are used here. A list applies every listed function to every column.
multi_stats = df.groupby('a').agg(['mean', 'sum'])
print(multi_stats)

# On a single selected column, use named aggregation (keyword = output column
# name). The older dict-of-renamers form, agg({'mean': ..., 'sum': ...}),
# triggers a deprecation FutureWarning and is rejected by newer pandas.
b_stats = df.groupby('a')['b'].agg(mean='mean', sum='sum')
print(b_stats)
   a   b   c   d
0  1  47   0  61
1  1  83  52   2
2  2  54  77  87
3  2  52  99  97
     b         c          d     
  mean  sum mean  sum  mean  sum
a                               
1   65  130   26   52  31.5   63
2   53  106   88  176  92.0  184
   mean  sum
a           
1    65  130
2    53  106


F:\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  # Remove the CWD from sys.path while we load stuff.

分組轉換

資料分組轉換 transform

# Group-wise transformation: transform

df = pd.DataFrame({'data1':np.random.randint(100, size=5),
                  'data2':np.random.randint(100, size=5),
                  'key1':list('aabba'),
                  'key2':['one','two','one','two','one']})

# Strings cannot be averaged, so restrict the mean to numeric columns;
# otherwise the remaining string key column makes mean() fail on current pandas.
k_mean = df.groupby('key1').mean(numeric_only=True)
print(df)
print(k_mean)

# Approach 1: group, then merge the per-group means back onto the rows.
# add_prefix('mean_') prepends a prefix to every column label.
merged = pd.merge(df, k_mean, left_on='key1', right_index=True).add_prefix('mean_')
print(merged)
print('============')

# Approach 2: transform broadcasts each group's mean back to the original
# row positions, so every data1/data2 cell is replaced by its group's mean.
key2_mean = df.groupby('key2').mean(numeric_only=True)  # mean per key2 group
print(key2_mean)
broadcast = df.groupby('key2')[['data1', 'data2']].transform('mean')
print(broadcast)
   data1  data2 key1 key2
0      7     98    a  one
1     77      3    a  two
2     50     73    b  one
3     74     23    b  two
4     21      9    a  one
      data1      data2
key1                  
a      35.0  36.666667
b      62.0  48.000000
   mean_data1_x  mean_data2_x mean_key1 mean_key2  mean_data1_y  mean_data2_y
0             7            98         a       one          35.0     36.666667
1            77             3         a       two          35.0     36.666667
4            21             9         a       one          35.0     36.666667
2            50            73         b       one          62.0     48.000000
3            74            23         b       two          62.0     48.000000
============
      data1  data2
key2              
one    26.0   60.0
two    75.5   13.0
   data1  data2
0   26.0     60
1   75.5     13
2   26.0     60
3   75.5     13
4   26.0     60

一般化Groupby方法:apply

# 一般化Groupby方法:apply

df = pd.DataFrame({'data1':np.random.randint(100, size=5),
                  'data2':np.random.randint(100, size=5),
                  'key1':list('aabba'),
                  'key2':['one','two','one','two','one']})

print(df.groupby('key1').apply(lambda x: x.describe()))
# apply直接執行其中的函式
# 這裡為匿名函式,描述性統計
print('=========================')

def f_df1(d, n):
    """Sort ``d`` by its index and return the leading ``n`` entries.

    Intended for ``groupby(...).apply(f_df1, n)``, where ``d`` is the
    sub-frame of one group.
    """
    ordered = d.sort_index()
    return ordered[:n]


def f_df2(d, k1):
    """Return the item of ``d`` selected by key ``k1`` (e.g. one column).

    Intended for ``groupby(...).apply(f_df2, k1)``, where ``d`` is the
    sub-frame of one group.
    """
    selected = d[k1]
    return selected
print(df.groupby('key1').apply(f_df1,2),'\n')
print(df.groupby('key1').apply(f_df2,'data2'))
print(type(df.groupby('key1').apply(f_df2,'data2')))
# f_df1函式:返回排序後的前n行資料
# f_df2函式:返回分組後表的k1列,結果為Series,層次化索引
# 直接執行f_df函式
# 引數直接寫在後面,也可以為.apply(f_df,n = 2))
                data1      data2
key1                            
a    count   3.000000   3.000000
     mean   39.666667  47.333333
     std    45.566801  33.306656
     min     4.000000  10.000000
     25%    14.000000  34.000000
     50%    24.000000  58.000000
     75%    57.500000  66.000000
     max    91.000000  74.000000
b    count   2.000000   2.000000
     mean   25.500000  18.500000
     std     3.535534  16.263456
     min    23.000000   7.000000
     25%    24.250000  12.750000
     50%    25.500000  18.500000
     75%    26.750000  24.250000
     max    28.000000  30.000000
=========================
        data1  data2 key1 key2
key1                          
a    0      4     10    a  one
     1     91     58    a  two
b    2     28      7    b  one
     3     23     30    b  two 

key1   
a     0    10
      1    58
      4    74
b     2     7
      3    30
Name: data2, dtype: int32
<class 'pandas.core.series.Series'>