Python之Pandas(5)
阿新 • • 發佈:2018-12-03
#資料分組 #根據某些條件將資料進行拆分成組 #每個組獨立應用函式 #將結果合併到一個數據結構中 import numpy as np import pandas as pd In [4]: #分組 df = pd.DataFrame({'A':['foo','bar','foo','bar','foo','foo','bar','bar'], 'B':['one','two','three','two','two','one','three','one'], 'C':np.random.randn(8), 'D':np.random.randn(8)}) In [5]: df Out[5]: A B C D 0 foo one 0.444202 1.406586 1 bar two -0.311666 1.196347 2 foo three 0.440234 0.949232 3 bar two -1.578572 2.464325 4 foo two -1.353510 0.773391 5 foo one 0.307378 -0.492570 6 bar three 1.446811 -2.350776 7 bar one -2.097978 0.296710 In [10]: df.groupby(['A','B']).mean() Out[10]: C D A B bar one -2.097978 0.296710 three 1.446811 -2.350776 two -0.945119 1.830336 foo one 0.375790 0.457008 three 0.440234 0.949232 two -1.353510 0.773391 In [26]: df.groupby(['A']).mean() Out[26]: C D A bar -0.635351 0.401652 foo -0.040424 0.659160 In [21]: list(df.groupby(['A']))[1] Out[21]: ('foo', A B C D 0 foo one 0.444202 1.406586 2 foo three 0.440234 0.949232 4 foo two -1.353510 0.773391 5 foo one 0.307378 -0.492570) In [24]: list(df.groupby(['A']))[0] Out[24]: ('bar', A B C D 1 bar two -0.311666 1.196347 3 bar two -1.578572 2.464325 6 bar three 1.446811 -2.350776 7 bar one -2.097978 0.296710) In [28]: df.groupby(['A','B']).size() Out[28]: A B bar one 1 three 1 two 2 foo one 2 three 1 two 1 dtype: int64 In [31]: df.groupby(['A']).groups Out[31]: {'bar': [1, 3, 6, 7], 'foo': [0, 2, 4, 5]} In [51]: s = pd.Series([1,2,3,10,20,30],index=[1,2,3,1,2,3]) grouped = s.groupby(level=0) print(s) print(grouped.first())#顯示非NaN的第一個值 print(grouped.last())#顯示非NaN的最後一個值 print(grouped.sum())#顯示非NaN的和 print(grouped.mean())#平均值 print(grouped.median())#中值 print(grouped.count())#計數 print(grouped.min())#最小 print(grouped.max())#最大 print(grouped.std())#標準差 print(grouped.var())#方差 print(grouped.prod())#積 1 1 2 2 3 3 1 10 2 20 3 30 dtype: int64 1 1 2 2 3 3 dtype: int64 1 10 2 20 3 30 dtype: int64 1 11 2 22 3 33 dtype: int64 1 5.5 2 11.0 3 16.5 dtype: float64 
1 5.5 2 11.0 3 16.5 dtype: float64 1 2 2 2 3 2 dtype: int64 1 1 2 2 3 3 dtype: int64 1 10 2 20 3 30 dtype: int64 1 6.363961 2 12.727922 3 19.091883 dtype: float64 1 40.5 2 162.0 3 364.5 dtype: float64 1 10 2 40 3 90 dtype: int64 In [53]: df = pd.DataFrame({'a':[1,1,2,2], 'b':np.random.rand(4), 'c':np.random.rand(4), 'd':np.random.rand(4)}) df Out[53]: a b c d 0 1 0.390076 0.664425 0.493986 1 1 0.534739 0.378918 0.813577 2 2 0.894389 0.680243 0.294173 3 2 0.741806 0.223494 0.160900 In [56]: print(df.groupby('a').agg(['mean',np.sum])) b c d mean sum mean sum mean sum a 1 0.462407 0.924815 0.521671 1.043342 0.653782 1.307563 2 0.818098 1.636195 0.451869 0.903737 0.227536 0.455073 In [58]: print(df.groupby('a')['b'].agg({'result1':np.mean, 'result2':np.sum})) result1 result2 a 1 0.462407 0.924815 2 0.818098 1.636195 In [61]: #小作業 df = pd.DataFrame({'A':['one','two','three','one','two','three','one','two'], 'B':['h','h','h','h','f','f','f','f'], 'C':[10,12,14,16,18,20,22,24], 'D':np.random.randn(8), 'E':np.random.rand(8)}) df Out[61]: A B C D E 0 one h 10 -1.188879 0.771559 1 two h 12 -0.414063 0.743417 2 three h 14 -0.241158 0.182954 3 one h 16 0.381358 0.100378 4 two f 18 -0.101517 0.291719 5 three f 20 -0.808872 0.007264 6 one f 22 -1.164982 0.351209 7 two f 24 -1.144294 0.831537 In [62]: df.groupby('A')['C','D'].mean() Out[62]: C D A one 16 -0.657501 three 17 -0.525015 two 18 -0.553291 In [63]: df.groupby(['A','B'])['D','E'].sum() Out[63]: D E A B one f -1.164982 0.351209 h -0.807521 0.871937 three f -0.808872 0.007264 h -0.241158 0.182954 two f -1.245811 1.123255 h -0.414063 0.743417 In [78]: print(dict(list(df.groupby('A')))) {'three': A B C D E 2 three h 14 -0.241158 0.182954 5 three f 20 -0.808872 0.007264, 'two': A B C D E 1 two h 12 -0.414063 0.743417 4 two f 18 -0.101517 0.291719 7 two f 24 -1.144294 0.831537, 'one': A B C D E 0 one h 10 -1.188879 0.771559 3 one h 16 0.381358 0.100378 6 one f 
22 -1.164982 0.351209} In [72]: df2 = df[['C','D']] df2['sum'] = df2.sum(axis = 1) df2 C:\Program Files\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from ipykernel import kernelapp as app Out[72]: C D sum 0 10 -1.188879 8.811121 1 12 -0.414063 11.585937 2 14 -0.241158 13.758842 3 16 0.381358 16.381358 4 18 -0.101517 17.898483 5 20 -0.808872 19.191128 6 22 -1.164982 20.835018 7 24 -1.144294 22.855706