機器學習之路--Matplotlib
阿新 • • 發佈:2018-10-21
清晰 scatter tom 進行 ngs 對比 日期 posit 需要
1、繪製折線圖
在pandas裏面有一種數據類型為datetime,可以用pd.to_datetime將不規範的日期統一改為:xxxx-xx-xx
import pandas as pd
import numpy as np

# Load the unemployment-rate table from the CSV file.
a = pd.read_csv('UNRATE.csv')
# Convert the DATE column to pandas datetime values so that irregularly
# written dates are normalised to a single representation.
a['DATE'] = pd.to_datetime(a['DATE'])
# Show the first 12 rows.
print(a.head(12))
折線圖
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

a = pd.read_csv('UNRATE.csv')
# Keep only the first 12 rows of the table.
b = a[0:12]
# Line chart: DATE on the x axis, VALUE on the y axis.
plt.plot(b['DATE'], b['VALUE'])
plt.show()
這樣就能繪製出一個折線圖了
如果橫坐標的文字寫不下怎麼辦?我們可以將文字豎著寫,或者指定一個旋轉角度
# Rotate the x-axis tick labels; 45 means 45 degrees, as in mathematics.
plt.xticks(rotation=45)
一般情況下還要寫明橫坐標與縱坐標各表達什麼,並加上標題
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

a = pd.read_csv('UNRATE.csv')  # load the file
b = a[0:12]  # take the first 12 rows of the data
plt.plot(b['DATE'], b['VALUE'])  # x-axis data, y-axis data
plt.xticks(rotation=90)  # rotate the x tick labels by 90 degrees
plt.xlabel('Month')  # x-axis label
plt.ylabel('Unemployment Rate')  # y-axis label
plt.title('Monthly Unemployment Trends, 1948')  # chart title
plt.show()  # display the figure
輸出:
import pandas as pd
import matplotlib.pyplot as plt

# Build the `unrate` table used by this and the following snippets: the same
# CSV as above, with DATE parsed as datetime so the `.dt` accessor is available.
unrate = pd.read_csv('UNRATE.csv')
unrate['DATE'] = pd.to_datetime(unrate['DATE'])
# Derive the month number of every row from its date.
unrate['MONTH'] = unrate['DATE'].dt.month

fig = plt.figure(figsize=(6, 3))  # size of the figure
# Draw two lines on the same figure so rows 0-11 and rows 12-23 can be
# compared; `c` sets the line colour.
plt.plot(unrate[0:12]['MONTH'], unrate[0:12]['VALUE'], c='red')
plt.plot(unrate[12:24]['MONTH'], unrate[12:24]['VALUE'], c='blue')
plt.show()
fig = plt.figure(figsize=(10, 6))
colors = ['red', 'blue', 'green', 'orange', 'black']
# Draw five lines in one figure, one per consecutive block of 12 rows,
# told apart by colour.
for i, color in enumerate(colors):
    start_index = i * 12
    end_index = (i + 1) * 12
    subset = unrate[start_index:end_index]
    plt.plot(subset['MONTH'], subset['VALUE'], c=color)
plt.show()
fig = plt.figure(figsize=(10, 6))
colors = ['red', 'blue', 'green', 'orange', 'black']
for i, color in enumerate(colors):
    start_index = i * 12
    end_index = (i + 1) * 12
    subset = unrate[start_index:end_index]
    label = str(1948 + i)
    plt.plot(subset['MONTH'], subset['VALUE'], c=color, label=label)
# legend() adds the legend built from the `label` of each line. `loc` is the
# position of the legend inside the chart: 'best' lets matplotlib choose a
# suitable place, and a specific position can be given instead.
# Run help(plt.legend) to see the available positions.
plt.legend(loc='best')
plt.show()
輸出:
最終版:
fig = plt.figure(figsize=(10, 6))
colors = ['red', 'blue', 'green', 'orange', 'black']
for i, color in enumerate(colors):
    # Row range covered by this line.
    start_index = i * 12
    end_index = (i + 1) * 12
    subset = unrate[start_index:end_index]
    # Legend entry for this line.
    label = str(1948 + i)
    plt.plot(subset['MONTH'], subset['VALUE'], c=color, label=label)
plt.legend(loc='upper left')  # put the legend in the upper-left corner
plt.xlabel('Month, Integer')  # x-axis label
plt.ylabel('Unemployment Rate, Percent')  # y-axis label
plt.title('Monthly Unemployment Trends, 1948-1952')  # chart title
plt.show()
輸出:
3、條形圖與散點圖
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
        'Fandango_Ratingvalue', 'Fandango_Stars']
norm_reviews = reviews[cols]

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
            'Fandango_Ratingvalue', 'Fandango_Stars']
# Height of each bar: the numeric columns of the row labelled 0.
# `.loc` replaces the `.ix` indexer, which no longer exists in pandas.
bar_heights = norm_reviews.loc[0, num_cols].values
# Five bars spaced 1 apart; 0.75 is the distance of the first bar from the
# origin.
bar_positions = arange(5) + 0.75

fig, ax = plt.subplots()
# 0.5 is the width of each bar. align='edge' makes bar_positions the left
# edge of each bar, which is what the 0.75 offset assumes (the default
# alignment is 'center').
ax.bar(bar_positions, bar_heights, 0.5, align='edge')
plt.show()
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
            'Fandango_Ratingvalue', 'Fandango_Stars']
# `.loc` replaces the `.ix` indexer, which no longer exists in pandas.
bar_heights = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)

fig, ax = plt.subplots()
# With align='edge' each 0.5-wide bar spans [k + 0.75, k + 1.25], so it is
# centred on the integer tick k + 1 set below.
ax.bar(bar_positions, bar_heights, 0.5, align='edge')
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=45)

ax.set_xlabel('Rating Source')  # x-axis label
ax.set_ylabel('Average Rating')  # y-axis label
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')  # title
plt.show()
輸出:
當然,也可以將柱形圖變為橫著的
import matplotlib.pyplot as plt
from numpy import arange

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
            'Fandango_Ratingvalue', 'Fandango_Stars']
# `.loc` replaces the `.ix` indexer, which no longer exists in pandas.
bar_widths = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1, 6)

fig, ax = plt.subplots()
# The only real change from the vertical chart: bar() becomes barh().
# align='edge' keeps each 0.5-thick bar centred on its integer tick.
ax.barh(bar_positions, bar_widths, 0.5, align='edge')

ax.set_yticks(tick_positions)
ax.set_yticklabels(num_cols)
ax.set_ylabel('Rating Source')
ax.set_xlabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()
輸出:
散點圖:
fig, ax = plt.subplots()
# scatter() draws a scatter plot: Fandango ratings on x, Rotten Tomatoes
# user ratings on y.
ax.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])
ax.set_xlabel('Fandango')
ax.set_ylabel('Rotten Tomatoes')
plt.show()
輸出:
畫兩個散點圖:
fig = plt.figure(figsize=(5, 10))
# Two subplots stacked vertically (2 rows, 1 column).
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

# Top: Fandango on x against Rotten Tomatoes on y.
ax1.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])
ax1.set_xlabel('Fandango')
ax1.set_ylabel('Rotten Tomatoes')

# Bottom: the same data with the axes swapped.
ax2.scatter(norm_reviews['RT_user_norm'], norm_reviews['Fandango_Ratingvalue'])
ax2.set_xlabel('Rotten Tomatoes')
ax2.set_ylabel('Fandango')
plt.show()
輸出:
用fig設置參數,ax做實際畫圖的操作
4、直方圖與盒圖
求數據的頻數,並可視化
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
        'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
print(norm_reviews[:5])  # show the first rows of the data

# Frequency of every distinct rating value, sorted by rating in ascending
# order.
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()
fandango_distribution = fandango_distribution.sort_index()
imdb_distribution = norm_reviews['IMDB_norm'].value_counts()
imdb_distribution = imdb_distribution.sort_index()
# Each printed row is "value count"; e.g. a rating of 4.3 that occurs six
# times is shown as: 4.3    6
print(fandango_distribution)
print(imdb_distribution)  # frequencies of the other column

fig, ax = plt.subplots()
ax.hist(norm_reviews['Fandango_Ratingvalue'])  # draw the histogram
# Variants:
#   ax.hist(norm_reviews['Fandango_Ratingvalue'], bins=20)
#       bins=20 means the histogram has 20 bars in total
#   ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5), bins=20)
#       range is the interval covered on the x axis
plt.show()
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
        'Fandango_Ratingvalue']
norm_reviews = reviews[cols]

# (column to plot, subplot title), in top-to-bottom order.
panels = [
    ('Fandango_Ratingvalue', 'Distribution of Fandango Ratings'),
    ('RT_user_norm', 'Distribution of Rotten Tomatoes Ratings'),
    ('Metacritic_user_nom', 'Distribution of Metacritic Ratings'),
    ('IMDB_norm', 'Distribution of IMDB Ratings'),
]

fig = plt.figure(figsize=(5, 20))
for row, (column, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(4, 1, row)
    ax.hist(norm_reviews[column], bins=20, range=(0, 5))
    ax.set_title(title)
    # Same y-axis interval on every subplot.
    ax.set_ylim(0, 50)
plt.show()
輸出:(在ml裏run一下,太長了)
盒圖(箱形圖,以四分位數展示數據分布,可用來找中位數):
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
        'Fandango_Ratingvalue']
norm_reviews = reviews[cols]

fig, ax = plt.subplots()
# Box plot of one column.
ax.boxplot(norm_reviews['RT_user_norm'])
ax.set_xticklabels(['Rotten Tomatoes'])
ax.set_ylim(0, 5)
plt.show()
輸出:
這樣,就可以清晰的看到中位數的位置以及大致的數據區間
也可以在一張圖上放入多張盒圖,這樣就可以區分各個屬性的特征了
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt

reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
        'Fandango_Ratingvalue']
norm_reviews = reviews[cols]

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm',
            'Fandango_Ratingvalue']
fig, ax = plt.subplots()
# Passing the 2-D array of all numeric columns draws one box per column.
ax.boxplot(norm_reviews[num_cols].values)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_ylim(0, 5)
plt.show()
輸出:
5、閑的蛋疼系列:
可以將坐標軸去掉:
# Remove the border lines of the x and y axes by hiding every spine.
for spine in ax.spines.values():
    spine.set_visible(False)
可以去掉坐標軸上的刻度線:
# Hide the tick marks on all four sides. The flags must be booleans: the
# string "off" is truthy, so it does not hide anything.
ax.tick_params(bottom=False, top=False, left=False, right=False)
6、最後的一些方法
*****一般在作圖時,為了讓圖表達得清晰,盡量讓各個子圖排在一行或兩行
fig = plt.figure(figsize=(12, 12)) # figsize is the parameter to tune here
在作圖時的顏色可以用自己定義的顏色
# Color
import pandas as pd
import matplotlib.pyplot as plt

women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']

# Custom colours. Note the format: an (R, G, B) tuple with every component
# divided by 255.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))

for sp, category in enumerate(major_cats):
    ax = fig.add_subplot(2, 2, sp + 1)
    # The color for each line is assigned here.
    ax.plot(women_degrees['Year'], women_degrees[category],
            c=cb_dark_blue, label='Women')
    ax.plot(women_degrees['Year'], 100 - women_degrees[category],
            c=cb_orange, label='Men')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    # Booleans are required here; the string "off" would not hide the ticks.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

# Called once after the loop, so the legend is drawn on the last subplot.
plt.legend(loc='upper right')
plt.show()
如果要改變線的寬度,可以在plot中加上linewidth參數:
# linewidth is the parameter that changes the thickness of the line.
ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]],
        c=cb_dark_blue, label='Women', linewidth=10)
ax.plot(women_degrees['Year'], 100 - women_degrees[major_cats[sp]],
        c=cb_orange, label='Men', linewidth=10)
最後附上此例的完整版(其中示範了如何在圖中某一坐標處標註文字):
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt

women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology',
             'Physical Sciences', 'Math and Statistics']

cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)


def _draw_gender_gap_panel(ax, category):
    """Draw the Women / Men (100 - Women) lines for one category on `ax`."""
    ax.plot(women_degrees['Year'], women_degrees[category],
            c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[category],
            c=cb_orange, label='Men', linewidth=3)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    # Booleans are required here; the string "off" would not hide the ticks.
    ax.tick_params(bottom=False, top=False, left=False, right=False)


# First figure: six panels in one row, with a legend.
fig = plt.figure(figsize=(18, 3))
for sp, category in enumerate(stem_cats):
    ax = fig.add_subplot(1, 6, sp + 1)
    _draw_gender_gap_panel(ax, category)
# Called once after the loop, so the legend is drawn on the last panel.
plt.legend(loc='upper right')
plt.show()

# Second figure: same panels, but the lines are named with text placed
# directly on the first and last panel instead of a legend.
fig = plt.figure(figsize=(18, 3))
for sp, category in enumerate(stem_cats):
    ax = fig.add_subplot(1, 6, sp + 1)
    _draw_gender_gap_panel(ax, category)
    # The if statement selects which panels get the text labels.
    if sp == 0:
        ax.text(2005, 87, 'Men')  # write 'Men' at coordinates (2005, 87)
        ax.text(2002, 8, 'Women')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')
plt.show()
輸出:
機器學習之路--Matplotlib