Python之Pandas(4)
阿新 • • 發佈:2018-12-03
#Pandas具有全功能的,高效能記憶體中連線操作,與SQL關係資料庫非常相似
import numpy as np
import pandas as pd

In [18]:
#合併 連線 去重 替換
df1 = pd.DataFrame({'key':['K0','K1','K2','K3'],
                    'A' :['A0','A1','A2','A3'],
                    'B' :['B0','B1','B2','B3']})
df2 = pd.DataFrame({'key':['K0','K1','K2','K3'],
                    'C' :['C0','C1','C2','C3'],
                    'D' :['D0','D1','D2','D3']})
df3 = pd.DataFrame({'key1':['K0','K0','K1','K2'],
                    'key2':['K0','K1','K0','K1'],
                    'A' :['A0','A1','A2','A3'],
                    'B' :['B0','B1','B2','B3']})
df4 = pd.DataFrame({'key1':['K0','K1','K1','K2'],
                    'key2':['K0','K0','K0','K0'],
                    'C' :['C0','C1','C2','C3'],
                    'D' :['D0','D1','D2','D3']})

In [20]:
#合併 on='key'是參考鍵
print(df1)
print(df2)
print(pd.merge(df1,df2,on='key'))

    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3
    C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3

In [21]:
print(df3)
print(df4)
print(pd.merge(df3,df4,on=['key1','key2']))

    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2

In [23]:
#引數how 合併方式
#交集
print(pd.merge(df3,df4,on=['key1','key2'],how = 'inner'))
#並集
print(pd.merge(df3,df4,on=['key1','key2'],how = 'outer'))
#以df3為參考進行合併,資料缺失為NaN
print(pd.merge(df3,df4,on=['key1','key2'],how = 'left'))
#以df4為參考進行合併,資料缺失為NaN
print(pd.merge(df3,df4,on=['key1','key2'],how = 'right'))

    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
     A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3
    A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN
     A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3

In [28]:
#引數 left_on
#right_on left_index right_index 當鍵不為一個列時,可以單獨設定左鍵與右鍵
df1 = pd.DataFrame({'lkey':list('bbacaab'),
                    'data1':range(7)})
df2 = pd.DataFrame({'rkey':list('abd'),
                    'data2':range(3)})
print(df1)
print(df2)
print(pd.merge(df1,df2,left_on = 'lkey',right_on='rkey'))
#df1 以lkey為鍵,df2 以rkey為鍵

   data1 lkey
0      0    b
1      1    b
2      2    a
3      3    c
4      4    a
5      5    a
6      6    b
   data2 rkey
0      0    a
1      1    b
2      2    d
   data1 lkey  data2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a

In [43]:
df1 = pd.DataFrame({'key':list('abcdefg'),
                    'data':range(7)})
df2 = pd.DataFrame({'data1':range(100,105)},
                   index = list('abcde'))
print(df1)
print(df2)
print(pd.merge(df1,df2,left_on='key',right_index=True))
#df1 以key為鍵 df2 以index為鍵
#left_index 為True時,第一個df以index為鍵,預設為False
#right_index 為True時,第二個df以index為鍵,預設為False
#可以相互組合 left_on right_on left_index right_index
#left_on+right_on left_index+right_index right_on+left_index left_on+right_index

   data key
0     0   a
1     1   b
2     2   c
3     3   d
4     4   e
5     5   f
6     6   g
   data1
a    100
b    101
c    102
d    103
e    104
   data key  data1
0     0   a    100
1     1   b    101
2     2   c    102
3     3   d    103
4     4   e    104

In [52]:
#concat axis = 1 列+列 axis = 0 行+行
s1 = pd.Series([1,2,3])
s2 = pd.Series([2,3,4])
print(s1)
print(s2)
print(pd.concat([s1,s2]).sort_index())
print(pd.concat([s1,s2]))

0    1
1    2
2    3
dtype: int64
0    2
1    3
2    4
dtype: int64
0    1
0    2
1    2
1    3
2    3
2    4
dtype: int64
0    1
1    2
2    3
0    2
1    3
2    4
dtype: int64

In [58]:
#去重
s = pd.Series([1,2,3,4,5,6,7,8,9,1,2,3,4,5,6])
print(s)
print(s.duplicated()) #判斷是否重複
#去掉重複
print(s.drop_duplicates())

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9     1
10    2
11    3
12    4
13    5
14    6
dtype: int64
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9      True
10     True
11     True
12     True
13     True
14     True
dtype: bool
0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int64

In [62]:
#替換
s = pd.Series(list('ascaazsd'))
print(s)
print(s.replace('a','z'))

0    a
1    s
2    c
3    a
4    a
5    z
6    s
7    d
dtype: object
0    z
1    s
2    c
3    z
4    z
5    z
6    s
7    d
dtype: object