Python之Pandas(2)
阿新 • • 發佈:2018-12-03
import numpy as np import pandas as pd In [36]: df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,index=['one','two','three'],columns = ['a','b','c','d']) #列索引 print(df['a']) print(df['b']) print(df['c']) print(df[['a','c']]) #行索引 .loc[] print(df.loc['one']) print(df.loc[['one','two']]) one 74.508548 two 33.914635 three 89.037458 Name: a, dtype: float64 one 34.484117 two 36.413632 three 15.947303 Name: b, dtype: float64 one 18.774667 two 93.554980 three 6.831067 Name: c, dtype: float64 a c one 74.508548 18.774667 two 33.914635 93.554980 three 89.037458 6.831067 a 74.508548 b 34.484117 c 18.774667 d 31.343584 Name: one, dtype: float64 a b c d one 74.508548 34.484117 18.774667 31.343584 two 33.914635 36.413632 93.554980 31.626610 In [27]: #切片 ''' iloc主要使用數字來索引資料,而不能使用字元型的標籤來索引資料。而loc則剛好相反,只能使用字元型標籤來索引資料,不能使用數字來索引資料,不過有特殊情況,當資料框dataframe的行標籤或者列標籤為數字,loc就可以用其來索引。 ''' df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,index=['one','two','three'],columns = ['a','b','c','d']) df.loc['one':'three'] Out[27]: a b c d one 95.553289 88.567488 5.964061 46.211595 two 34.463585 75.759024 59.007925 36.239487 three 39.914204 30.086838 20.285195 50.854264 In [30]: #判斷切片 df[df < 30] Out[30]: a b c d one NaN NaN 5.964061 NaN two NaN NaN NaN NaN three NaN NaN 20.285195 NaN In [35]: df[df[['a','b']] > 40] Out[35]: a b c d one 95.553289 88.567488 NaN NaN two NaN 75.759024 NaN NaN three NaN NaN NaN NaN In [37]: #先索引行,再索引列 print(df['a'].loc[['one','three']]) one 74.508548 three 89.037458 Name: a, dtype: float64
import numpy as np import pandas as pd In [6]: #轉置 新增 修改 刪除 對齊 排序 df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,columns=['a','b']) #檢視前五條 print(df.head()) #檢視後五條 print(df.tail()) #轉置 print(df.T) a b 0 87.848371 52.160364 1 56.494315 40.510311 2 4.676018 87.834623 3 50.521130 48.060117 4 79.608305 0.083539 a b 3 50.521130 48.060117 4 79.608305 0.083539 5 27.151838 77.630977 6 31.472300 76.632293 7 6.982855 56.655203 0 1 2 3 4 5 \ a 87.848371 56.494315 4.676018 50.521130 79.608305 27.151838 b 52.160364 40.510311 87.834623 48.060117 0.083539 77.630977 6 7 a 31.472300 6.982855 b 76.632293 56.655203 In [20]: #新增 修改 df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,columns=['a','b','c','d']) df['e'] = 10 df.loc[4] = 20 df[['a','c']] = 30 #刪除 del df['a'] df.drop('b',1) Out[20]: c d e 0 30 13.046410 10 1 30 61.349300 10 2 30 19.733048 10 3 30 6.011787 10 4 30 20.000000 20 In [29]: #對齊 df = pd.DataFrame(np.random.rand(16).reshape(4,4),columns=['a','b','c','d']) df1 = pd.DataFrame(np.random.rand(20).reshape(4,5),columns=['a','b','c','d','e']) print(df-df1) print(df+df1) a b c d e 0 -0.305116 0.454676 0.026236 -0.059819 NaN 1 -0.381073 -0.171703 -0.024976 -0.082708 NaN 2 0.143118 -0.043733 0.013950 0.828276 NaN 3 0.359673 -0.584039 -0.179544 -0.793191 NaN a b c d e 0 1.630681 1.343964 1.505061 0.488630 NaN 1 0.800414 1.428047 1.450315 0.714401 NaN 2 1.111159 0.115849 0.243141 0.830235 NaN 3 0.797655 1.219097 1.003082 0.949165 NaN In [58]: #排序 #按值 ascending 升序 降序 print(df1.sort_values(['a'],ascending=1)) print(df1.sort_values(by = 'a',ascending=0)) a b c d e 3 0.218991 0.901568 0.591313 0.871178 0.587248 2 0.484020 0.079791 0.114596 0.000979 0.875343 1 0.590744 0.799875 0.737645 0.398555 0.740345 0 0.967899 0.444644 0.739412 0.274224 0.782209 a b c d e 0 0.967899 0.444644 0.739412 0.274224 0.782209 1 0.590744 0.799875 0.737645 0.398555 0.740345 2 0.484020 0.079791 0.114596 0.000979 0.875343 3 0.218991 0.901568 0.591313 0.871178 0.587248 In [62]: 
#多列排序,先按照a排序,a相同時再按照c排序 df2 = pd.DataFrame({'a':[1,1,1,1,2,2,2,2], 'b':list(range(8)), 'c':list(range(8,0,-1))}) df2.sort_values(['a','c']) Out[62]: a b c 3 1 3 5 2 1 2 6 1 1 1 7 0 1 0 8 7 2 7 1 6 2 6 2 5 2 5 3 4 2 4 4 In [38]: #按照行的索引進行排序 ascending = False/True df1.sort_index(ascending=True) Out[38]: a b c d e 0 0.967899 0.444644 0.739412 0.274224 0.782209 1 0.590744 0.799875 0.737645 0.398555 0.740345 2 0.484020 0.079791 0.114596 0.000979 0.875343 3 0.218991 0.901568 0.591313 0.871178 0.587248 In [64]: #inplace 改變本身 df1.sort_index(inplace=1) df1 Out[64]: a b c d e 0 0.967899 0.444644 0.739412 0.274224 0.782209 1 0.590744 0.799875 0.737645 0.398555 0.740345 2 0.484020 0.079791 0.114596 0.000979 0.875343 3 0.218991 0.901568 0.591313 0.871178 0.587248 In [68]: #小作業 df = pd.DataFrame(np.random.randn(3,3)*100,columns=['v1','v2','v3'],index=['a','b','c']) df.sort_values(by = 'v2',ascending=1) Out[68]: v1 v2 v3 c 105.913696 -179.710097 120.055654 b -36.379390 -156.621727 -25.782565 a -67.891968 -41.681334 -30.058927 In [76]: df = pd.DataFrame(np.random.randn(5,2)*100,columns=['v1','v2'],index=['a','b','c','d','e']) df.T Out[76]: a b c d e v1 -56.708755 147.197783 91.484936 -41.872528 -17.752307 v2 -4.279827 33.372191 -27.462034 -103.270048 -132.076251 In [81]: df = pd.Series(np.arange(10),index=['a','b','c','d','e','f','g','h','i','j']) df.loc['a','b','c'] = 100 df Out[81]: a 100 b 100 c 100 d 3 e 4 f 5 g 6 h 7 i 8 j 9 dtype: int32 In [84]: df = pd.Series(np.arange(5),index=['a','b','c','d','e']) df1 = pd.Series(np.arange(5),index=['c','d','e','f','g']) df + df1 Out[84]: a NaN b NaN c 2.0 d 4.0 e 6.0 f NaN g NaN dtype: float64