Python資料科學入門-(Pandas)筆記02
阿新 • • 發佈:2019-01-05
第四節 Pandas 入門
慕課網 Python 資料科學入門課程學習筆記
一. Series 序列
import numpy as np
import pandas as pd
s1 = pd.Series([1,2,3,4]) # 建立序列
s1
0 1
1 2
2 3
3 4
dtype: int64
s1.values
array([1, 2, 3, 4], dtype=int64)
s1.index
RangeIndex(start=0, stop=4, step=1)
s2 = pd.Series(np.arange(10 )) # 使用numpy的陣列方法建立
s2
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
s3 = pd.Series({'1':1, '2':2, '3':3}) # 使用字典建立序列
s3
1 1
2 2
3 3
dtype: int64
s4 = pd.Series([1,2 ,3,4],index=['A','B','C','D'])
s4
A 1
B 2
C 3
D 4
dtype: int64
序列的訪問、轉換、屬性
s4['A']
1
s4[s4>2]
C 3
D 4
dtype: int64
s4.to_dict() # 序列轉換為字典
{'A': 1, 'B': 2, 'C': 3, 'D': 4}
s4
A 1
B 2
C 3
D 4
dtype: int64
index_1 = ['A' , 'B', 'C', 'D', 'E']
s5 = pd.Series(s4,index=index_1)
s5
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64
pd.isnull(s5) # 判空 notnull() 判非空
A False
B False
C False
D False
E True
dtype: bool
s5.name = 'demo' # 給序列取名
s5.index.name = 'key value' # 給序列 index 取名
s5
key value
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
Name: demo, dtype: float64
s5.index
Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='key value')
s5.values
array([ 1., 2., 3., 4., nan])
---
二.Dataframe 入門
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link) # 開啟連結
# 讀取貼上板
df = pd.read_clipboard() # 先複製剛才開啟網頁的 表格的一部分
df
May | 2018 | May.1 | 2017 | Change | Programming | Language | Ratings | Change.1 | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | Java | 16.380% | +1.74% | NaN | NaN | NaN | NaN |
1 | 2 | 2 | C | 14.000% | +7.00% | NaN | NaN | NaN | NaN |
2 | 3 | 3 | C++ | 7.668% | +2.92% | NaN | NaN | NaN | NaN |
3 | 4 | 4 | Python | 5.192% | +1.64% | NaN | NaN | NaN | NaN |
4 | 5 | 5 | C# | 4.402% | +0.95% | NaN | NaN | NaN | NaN |
5 | 6 | 6 | Visual | Basic | .NET | 4.124% | +0.73% | NaN | NaN |
6 | 7 | 9 | change | PHP | 3.321% | +0.63% | NaN | NaN | NaN |
type(df)
pandas.core.frame.DataFrame
df.columns # 返回 表格列名
Index(['May', '2018', 'May.1', '2017', 'Change', 'Programming', 'Language',
'Ratings', 'Change.1'],
dtype='object')
df.Change # 返回 Change 列的資料
0 +1.74%
1 +7.00%
2 +2.92%
3 +1.64%
4 +0.95%
5 .NET
6 3.321%
Name: Change, dtype: object
df_filter = DataFrame(df,columns=['May','2018','Change']) # 過濾
df_filter
May | 2018 | Change | |
---|---|---|---|
0 | 1 | 1 | +1.74% |
1 | 2 | 2 | +7.00% |
2 | 3 | 3 | +2.92% |
3 | 4 | 4 | +1.64% |
4 | 5 | 5 | +0.95% |
5 | 6 | 6 | .NET |
6 | 7 | 9 | 3.321% |
df_filter['Change'] # 訪問某一列
0 +1.74%
1 +7.00%
2 +2.92%
3 +1.64%
4 +0.95%
5 .NET
6 3.321%
Name: Change, dtype: object
type(df_filter['Change'])
pandas.core.series.Series
新增列 和 值
df_new = DataFrame(df,columns=['Change','Sep 2019']) # 新增不存在的 列
df_new
Change | Sep 2019 | |
---|---|---|
0 | +1.74% | NaN |
1 | +7.00% | NaN |
2 | +2.92% | NaN |
3 | +1.64% | NaN |
4 | +0.95% | NaN |
5 | .NET | NaN |
6 | 3.321% | NaN |
df_new['Sep 2019'] = range(0,7) # 給沒有值的列賦值
df_new
Change | Sep 2019 | |
---|---|---|
0 | +1.74% | 0 |
1 | +7.00% | 1 |
2 | +2.92% | 2 |
3 | +1.64% | 3 |
4 | +0.95% | 4 |
5 | .NET | 5 |
6 | 3.321% | 6 |
df_new['Sep 2019'] = np.arange(0,7)
df_new
Change | Sep 2019 | |
---|---|---|
0 | +1.74% | 0 |
1 | +7.00% | 1 |
2 | +2.92% | 2 |
3 | +1.64% | 3 |
4 | +0.95% | 4 |
5 | .NET | 5 |
6 | 3.321% | 6 |
df_new['Sep 2019'] = pd.Series(np.arange(0,7))
df_new
Change | Sep 2019 | |
---|---|---|
0 | +1.74% | 0 |
1 | +7.00% | 1 |
2 | +2.92% | 2 |
3 | +1.64% | 3 |
4 | +0.95% | 4 |
5 | .NET | 5 |
6 | 3.321% | 6 |
df_new['Sep 2019'] = pd.Series([100,200],index=[1,2]) # 以 Series 賦值會按 index 對齊並覆蓋整列:只有 index 1、2 有值,其餘位置變為 NaN
df_new
Change | Sep 2019 | |
---|---|---|
0 | +1.74% | NaN |
1 | +7.00% | 100.0 |
2 | +2.92% | 200.0 |
3 | +1.64% | NaN |
4 | +0.95% | NaN |
5 | .NET | NaN |
6 | 3.321% | NaN |
三.深入理解Series和DataFrame
1.Series 和 DataFrame 對比
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
data = {'Country':['A', 'B', 'C'],
'Capital':['AA', 'BB','CC'],
'Population':[11111,22222,33333]}
Series
s1 = pd.Series(data['Country'])
s1
0 A
1 B
2 C
dtype: object
Dataframe
df = pd.DataFrame(data)
df
Country | Capital | Population | |
---|---|---|---|
0 | A | AA | 11111 |
1 | B | BB | 22222 |
2 | C | CC | 33333 |
df_torows = df.iterrows()
for row in df_torows:
print(row)
print(type(row)) # 型別: tuple
(0, Country A
Capital AA
Population 11111
Name: 0, dtype: object)
for row in df.iterrows():
print(type(row[0]),type(row[1]))
break
通過 幾個 Series 建立 Dataframe
s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Population'])
df_new = pd.DataFrame([s1,s2,s3],index=['Country', 'Capital', 'Capital'])
df_new # 可以看到 輸出的 Dataframe 倒過來了(注意:上一行 index 的第三個標籤誤寫為 'Capital',本意應為 'Population')
0 | 1 | 2 | |
---|---|---|---|
Country | A | B | C |
Capital | AA | BB | CC |
Capital | 11111 | 22222 | 33333 |
df
Country | Capital | Population | |
---|---|---|---|
0 | A | AA | 11111 |
1 | B | BB | 22222 |
2 | C | CC | 33333 |
# 轉置操作
df_new = df_new.T
df_new
Country | Capital | Capital | |
---|---|---|---|
0 | A | AA | 11111 |
1 | B | BB | 22222 |
2 | C | CC | 33333 |
2.Dataframe IO
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
True
df = pd.read_clipboard() # 讀取貼上板內容
df
Format Type | Data Description | Reader | Writer | |
---|---|---|---|---|
0 | text | CSV | read_csv | to_csv |
1 | text | JSON | read_json | to_json |
2 | text | HTML | read_html | to_html |
3 | text | Local clipboard | read_clipboard | to_clipboard |
4 | binary | MS Excel | read_excel | to_excel |
5 | binary | HDF5 Format | read_hdf | to_hdf |
6 | binary | Feather Format | read_feather | to_feather |
7 | binary | Msgpack | read_msgpack | to_msgpack |
8 | binary | Stata | read_stata | to_stata |
9 | binary | SAS | read_sas | None |
10 | binary | Python Pickle Format | read_pickle | to_pickle |
11 | SQL | SQL | read_sql | to_sql |
12 | SQL | Google Big Query | read_gbq | to_gbq |
df.to_csv('df1.csv',index=False) # 寫檔案 index=False 去掉index 預設不去
!ls
Dataframe IO.ipynb
Dataframe.ipynb
Series.ipynb
df1.csv
深入理解Series和DataFrame.ipynb
!more df1.csv
Format Type,Data Description,Reader,Writer
text,CSV,read_csv,to_csv
text,JSON,read_json,to_json
text,HTML,read_html,to_html
text,Local clipboard,read_clipboard,to_clipboard
binary,MS Excel,read_excel,to_excel
binary,HDF5 Format,read_hdf,to_hdf
binary,Feather Format,read_feather,to_feather
binary,Msgpack,read_msgpack,to_msgpack
binary,Stata,read_stata,to_stata
binary,SAS,read_sas,
binary,Python Pickle Format,read_pickle,to_pickle
SQL,SQL,read_sql,to_sql
SQL,Google Big Query,read_gbq,to_gbq
df2 = pd.read_csv('df1.csv')
df2
Format Type | Data Description | Reader | Writer | |
---|---|---|---|---|
0 | text | CSV | read_csv | to_csv |
1 | text | JSON | read_json | to_json |
2 | text | HTML | read_html | to_html |
3 | text | Local clipboard | read_clipboard | to_clipboard |
4 | binary | MS Excel | read_excel | to_excel |
5 | binary | HDF5 Format | read_hdf | to_hdf |
6 | binary | Feather Format | read_feather | to_feather |
7 | binary | Msgpack | read_msgpack | to_msgpack |
8 | binary | Stata | read_stata | to_stata |
9 | binary | SAS | read_sas | NaN |
10 | binary | Python Pickle Format | read_pickle | to_pickle |
11 | SQL | SQL | read_sql | to_sql |
12 | SQL | Google Big Query | read_gbq | to_gbq |
df.to_json() # 轉換為 json 結構 反過來也行
'{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":。。。
3.Selecting and Indexing
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
!ls
Dataframe IO.ipynb
Dataframe.ipynb
Selecting and Indexing.ipynb
Series.ipynb
data.csv
df1.csv
深入理解Series和DataFrame.ipynb
read_data = pd.read_csv('data.csv')
read_data.shape # 行 列數
(20, 8)
read_data.head(4) # 返回前4行
A | B | C | D | E | F | G | H | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | Java | 16.38% | 1.74% | 21 | Apex | 0.90% |
1 | 2 | 2 | C | 14.00% | 7.00% | 22 | PL/SQL | 0.90% |
2 | 3 | 3 | C++ | 7.67% | 2.92% | 23 | Transact-SQL | 0.88% |
3 | 4 | 4 | Python | 5.19% | 1.64% | 24 | Ada | 0.87% |
read_data.tail(3) # 返回後3行
A | B | C | D | E | F | G | H | |
---|---|---|---|---|---|---|---|---|
17 | 18 | 10 | Perl | 0.91% | -1.69% | 38 | Alice | 0.47% |
18 | 19 | 13 | Swift | 0.91% | -1.37% | 39 | Lua | 0.42% |
19 | 20 | 31 | Scala | 0.90% | 0.18% | 40 | Fortran | 0.42% |
sub_data = read_data[['A', 'B','C']] # 返回某些列
sub_data.head(3)
A | B | C | |
---|---|---|---|
0 | 1 | 1 | Java |
1 | 2 | 2 | C |
2 | 3 | 3 | C++ |
sub_data.iloc[3:6, :] # 切片(iloc 基於整數位置,和行名列名無關):取位置 3 到 5 的行(不含 6),對列不過濾
A | B | C | |
---|---|---|---|
3 | 4 | 4 | Python |
4 | 5 | 5 | C# |
5 | 6 | 6 | Visual Basic .NET |
read_data.loc[10:13, : 'D'] # loc 基於 label 過濾,切片兩端都包含(行 10 到 13、列從開頭到 'D')
A | B | C | D | |
---|---|---|---|---|
10 | 11 | 14 | R | 1.18% |
11 | 12 | 18 | Delphi/Object Pascal | 1.01% |
12 | 13 | 8 | Assembly language | 1.00% |
13 | 14 | 16 | Go | 0.97% |
四.Reindexing Series and DataFrame
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
1.series reindex
shift + Tab 鍵可檢視函式說明
s1 = Series([1,2,3,4],index=['A','B','C','D'])
s1
A 1
B 2
C 3
D 4
dtype: int64
s1.reindex(index=['A','B','C','D','E'],fill_value=10) # 重新設定 index
A 1
B 2
C 3
D 4
E 10
dtype: int64
s2 = Series(['A','B','C'],index=[1,3,6])
s2
1 A
3 B
6 C
dtype: object
s2.reindex(index=range(8))
0 NaN
1 A
2 NaN
3 B
4 NaN
5 NaN
6 C
7 NaN
dtype: object
s2.reindex(index=range(8),method='ffill') # method='ffill':缺失的 index 用前一個有效值填充(最前面沒有前值的仍為 NaN)
0 NaN
1 A
2 A
3 B
4 B
5 B
6 C
7 C
dtype: object
2.reindex dataframe
# index 中故意 漏下 C
df1 = DataFrame(np.random.rand(25).reshape([5,5]),index=['A','B','D','E','F'], columns=['c1', 'c2', 'c3', 'c4', 'c5'])
df1
c1 | c2 | c3 | c4 | c5 | |
---|---|---|---|---|---|
A | 0.123618 | 0.348567 | 0.119156 | 0.380952 | 0.379118 |
B | 0.476492 | 0.254976 | 0.629318 | 0.728708 | 0.747153 |
D | 0.965314 | 0.424126 | 0.913850 | 0.092063 | 0.196096 |
E | 0.960760 | 0.866313 | 0.226766 | 0.865781 | 0.465341 |
F | 0.982832 | 0.340850 | 0.725084 | 0.519617 | 0.889651 |
# C 行填充了 NaN 若改 列 也一樣
df1.reindex(index=['A','B','C','D','E','F'])
c1 | c2 | c3 | c4 | c5 | |
---|---|---|---|---|---|
A | 0.123618 | 0.348567 | 0.119156 | 0.380952 | 0.379118 |
B | 0.476492 | 0.254976 | 0.629318 | 0.728708 | 0.747153 |
C | NaN | NaN | NaN | NaN | NaN |
D | 0.965314 | 0.424126 | 0.913850 | 0.092063 | 0.196096 |
E | 0.960760 | 0.866313 | 0.226766 | 0.865781 | 0.465341 |
F | 0.982832 | 0.340850 | 0.725084 | 0.519617 | 0.889651 |
3.利用 reindex() 選取某些值
s1
A 1
B 2
C 3
D 4
dtype: int64
s1.reindex(index=['A','B']) # Series 的reindex()
A 1
B 2
dtype: int64
df1.reindex(index=['A','B']) # Dataframe 的reindex()
c1 | c2 | c3 | c4 | c5 | |
---|---|---|---|---|---|
A | 0.123618 | 0.348567 | 0.119156 | 0.380952 | 0.379118 |
B | 0.476492 | 0.254976 | 0.629318 | 0.728708 | 0.747153 |
4.刪除
s1.drop('A')
B 2
C 3
D 4
dtype: int64
# axis=0 表示 A 代表 表格的行的index,axis=1 表示指定的是列的index
df1.drop('A',axis=0)
c1 | c2 | c3 | c4 | c5 | |
---|---|---|---|---|---|
B | 0.476492 | 0.254976 | 0.629318 | 0.728708 | 0.747153 |
D | 0.965314 | 0.424126 | 0.913850 | 0.092063 | 0.196096 |
E | 0.960760 | 0.866313 | 0.226766 | 0.865781 | 0.465341 |
F | 0.982832 | 0.340850 | 0.725084 | 0.519617 | 0.889651 |
五.NaN —— Not a Number
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
NaN in NumPy
n = np.nan
type(n)
float
m = 1
m + n # nan
nan
NaN in Series
s1 = Series([1,2,np.nan,3,4],index=['A', 'B', 'C', 'D', 'E'])
s1
A 1.0
B 2.0
C NaN
D 3.0
E 4.0
dtype: float64
s1.isnull()
A False
B False
C True
D False
E False
dtype: bool
s1.dropna() # 刪除 NaN
A 1.0
B 2.0
D 3.0
E 4.0
dtype: float64
NaN in DataFrame
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan, np.nan, np.nan]])
dframe
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 2.0 | 3.0 |
1 | NaN | 5.0 | 6.0 |
2 | 7.0 | NaN | 9.0 |
3 | NaN | NaN | NaN |
# 不寫 預設0 表示對行操作 只要該行有nan就刪
# how='any/all' 該行 部分/全部 是 NaN時刪除
df1 = dframe.dropna(axis=0, how='any')
df1
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 2.0 | 3.0 |
# thresh=2 引數表示 該行/列至少要有 2 個非 NaN 的值才保留,否則刪除
df2 = dframe.dropna(thresh=2)
df2
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 2.0 | 3.0 |
1 | NaN | 5.0 | 6.0 |
2 | 7.0 | NaN | 9.0 |
# 值為 NaN的改為 1
df3 = dframe.fillna(value=1)
df3
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 2.0 | 3.0 |
1 | 1.0 | 5.0 | 6.0 |
2 | 7.0 | 1.0 | 9.0 |
3 | 1.0 | 1.0 | 1.0 |
# 用字典為每一列指定各自的填充值:列 0 的 NaN 改為 0,列 1 改為 1,列 2 改為 2(字典中不存在的列 3 會被忽略)
df3 = dframe.fillna(value={0:0, 1:1,2:2,3:3})
df3
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 2.0 | 3.0 |
1 | 0.0 | 5.0 | 6.0 |
2 | 7.0 | 1.0 | 9.0 |
3 | 0.0 | 1.0 | 2.0 |
六、多級Index
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
#二級 index
s1 = Series(np.random.randn(6),index=[['1','1','1','2','2','2'],
['a','b','c','a','b','c']])
s1
1 a 0.301681
b -1.596626
c -0.261337
2 a 0.739900
b 0.299108
c 0.074713
dtype: float64
s1['1']
a 0.301681
b -1.596626
c -0.261337
dtype: float64
s1['1']['a']
0.3016807350048885
s1[:,'a'] # 二級的 a 也返回了
1 0.301681
2 0.739900
dtype: float64
二級 index Series 與 DataFrame互相轉換
# 二級的Series 轉置 為dataframe
df1 = s1.unstack()
df1
a | b | c | |
---|---|---|---|
1 | 0.301681 | -1.596626 | -0.261337 |
2 | 0.739900 | 0.299108 | 0.074713 |
# 二級 Series 轉換為 DataFrame
df2 = DataFrame([s1['1'],s1['2']])
df2
a | b | c | |
---|---|---|---|
0 | 0.301681 | -1.596626 | -0.261337 |
1 | 0.739900 | 0.299108 | 0.074713 |
# DataFrame 轉換為多級 index 的Series
s2 = df1.unstack()
s2
a 1 0.301681
2 0.739900
b 1 -1.596626
2 0.299108
c 1 -0.261337
2 0.074713
dtype: float64
s2 = df1.T.unstack()
s2
1 a 0.301681
b -1.596626
c -0.261337
2 a 0.739900
b 0.299108
c 0.074713
dtype: float64
多級index DataFrame
df = DataFrame(np.arange(16).reshape(4,4),
index=[['a','a','b','b'],[1,2,1,2]],
columns=[['c','c','d','d'],[5,5,6,7]])
df
c | d | ||||
---|---|---|---|---|---|
5 | 5 | 6 | 7 | ||
a | 1 | 0 | 1 | 2 | 3 |
2 | 4 | 5 | 6 | 7 | |
b | 1 | 8 | 9 | 10 | 11 |
2 | 12 | 13 | 14 | 15 |
df['d']
6 | 7 | ||
---|---|---|---|
a | 1 | 2 | 3 |
2 | 6 | 7 | |
b | 1 | 10 | 11 |
2 | 14 | 15 |
df['d'][6]
a 1 2
2 6
b 1 10
2 14
Name: 6, dtype: int32
---
七、Mapping 和 Replace
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df1 = DataFrame({"城市":["北京","上海","廣州"],"人口":[1000,2000,3000]})
df1
城市 | 人口 | |
---|---|---|
0 | 北京 | 1000 |
1 | 上海 | 2000 |
2 | 廣州 | 3000 |
# 普通方法 給DataFrame 新增列
# 如果 df1 的index 不是順序的 0,1,2...,將不能正確賦值,需要知道 index
df1['GDP'] = Series([1000,2000,1500])
df1
城市 | 人口 | GDP | |
---|---|---|---|
0 | 北京 | 1000 | 1000 |
1 | 上海 | 2000 | 2000 |
2 | 廣州 | 3000 | 1500 |
map() 方法給DataFrame新增列
# map方法(字典) 新增新列,
#這樣就無需關心 按順序對應,以及index匹配問題
gdp_map = {"北京":100,"廣州":300,"上海":200}
df1['GDP'] = df1['城市'].map(gdp_map)
df1
城市 | 人口 | GDP | |
---|---|---|---|
0 | 北京 | 1000 | 100 |
1 | 上海 | 2000 | 200 |
2 | 廣州 | 3000 | 300 |
df2 = DataFrame({"城市":["北京","上海","廣州"],"人口":[1000,2000,3000]},
index=['A','B','C'])
df2
城市 | 人口 | |
---|---|---|
A | 北京 | 1000 |
B | 上海 | 2000 |
C | 廣州 | 3000 |
# index 不是預設的 需要指定index,否則為 NaN
df2['GDP'] = Series([1000,2000,1500])
df2
城市 | 人口 | GDP | |
---|---|---|---|
A | 北京 | 1000 | NaN |
B | 上海 | 2000 | NaN |
C | 廣州 | 3000 | NaN |
# 指定 index
df2['GDP'] = Series([1000,2000,1500],index=['A','B','C'])
df2
城市 | 人口 | GDP | |
---|---|---|---|
A | 北京 | 1000 | 1000 |
B | 上海 | 2000 | 2000 |
C | 廣州 | 3000 | 1500 |
replace in series
s1 = Series(np.arange(6))
s1
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int32
s1.replace(1,np.nan)
0 0.0
1 NaN
2 2.0
3 3.0
4 4.0
5 5.0
dtype: float64
s1.replace([1,2,3],[10,20,30])
0 0
1 10
2 20
3 30
4 4
5 5
dtype: int64