1. 程式人生 > >機器學習之路--Matplotlib

機器學習之路--Matplotlib

清晰 scatter tom 進行 ngs 對比 日期 posit 需要

1.繪制折線圖

在pandas裏面有一種數據類型為datetime,可以用 pd.to_datetime 將不規範的日期統一轉換為 xxxx-xx-xx 的格式:

import pandas as pd
import numpy as np
# Load the CSV and normalise the DATE column to pandas datetime values.
a = pd.read_csv('UNRATE.csv')
a['DATE'] = pd.to_datetime(a['DATE'])
print(a.head(12))

折線圖

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
a = pd.read_csv('UNRATE.csv')
b = a[0:12]      # keep the first 12 rows
# Plot VALUE against DATE as a line chart, then display it.
plt.plot(b['DATE'], b['VALUE'])
plt.show()

這樣就能繪制出一個折線圖了

如果橫坐標寫不下怎麽辦?我們可以將文字豎著寫或者指定一個角度

plt.xticks(rotation = 45)   # 45 means an angle of 45 degrees (same convention as in mathematics)

一般情況下要寫橫坐標與縱坐標要表達什麽,還有標題

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
a = pd.read_csv('UNRATE.csv')     # load the data file
b = a[0:12]      # keep the first 12 rows
plt.plot(b['DATE'], b['VALUE'])     # x-axis and y-axis data
plt.xticks(rotation=90)     # rotate the x tick labels by 90 degrees
plt.xlabel('Month')     # x-axis label
plt.ylabel('Unemployment Rate')     # y-axis label
plt.title('Monthly Unemployment Trends, 1948')     # chart title
plt.show()     # display the figure

輸出:技術分享圖片

# The earlier snippets load this file as ``a``; this section refers to the
# same data as ``unrate``, so load it here under that name.
unrate = pd.read_csv('UNRATE.csv')
unrate['DATE'] = pd.to_datetime(unrate['DATE'])   # the .dt accessor needs datetime values
unrate['MONTH'] = unrate['DATE'].dt.month

# Two 12-row slices drawn on the same figure so they can be compared.
fig = plt.figure(figsize=(6,3))     # figure size
plt.plot(unrate[0:12]['MONTH'], unrate[0:12]['VALUE'], c='red')     # c sets the line colour
plt.plot(unrate[12:24]['MONTH'], unrate[12:24]['VALUE'], c='blue')
plt.show()

# Five consecutive 12-row slices on one figure, distinguished by colour.
fig = plt.figure(figsize=(10,6))
colors = ['red', 'blue', 'green', 'orange', 'black']
for i in range(5):
    start_index = i*12
    end_index = (i+1)*12
    subset = unrate[start_index:end_index]
    plt.plot(subset['MONTH'], subset['VALUE'], c=colors[i])
plt.show()

# Same chart again, with a legend entry per line.
fig = plt.figure(figsize=(10,6))
colors = ['red', 'blue', 'green', 'orange', 'black']
for i in range(5):
    start_index = i*12
    end_index = (i+1)*12
    subset = unrate[start_index:end_index]
    label = str(1948 + i)
    plt.plot(subset['MONTH'], subset['VALUE'], c=colors[i], label=label)
# legend() adds the legend; loc chooses its position. 'best' lets Matplotlib
# pick a suitable spot; run help(plt.legend) for the other accepted values.
plt.legend(loc='best')
plt.show()

輸出:技術分享圖片

最終版:

fig = plt.figure(figsize=(10,6))
colors = ['red', 'blue', 'green', 'orange', 'black']
for i in range(5):
    start_index = i*12
    end_index = (i+1)*12
    subset = unrate[start_index:end_index]     # the 12-row slice for this line
    label = str(1948 + i)       # legend text for this line
    plt.plot(subset['MONTH'], subset['VALUE'], c=colors[i], label=label)
plt.legend(loc='upper left')       # put the legend in the upper-left corner
plt.xlabel('Month, Integer')       # x-axis label
plt.ylabel('Unemployment Rate, Percent')   # y-axis label
plt.title('Monthly Unemployment Trends, 1948-1952')      # chart title

plt.show()

輸出:技術分享圖片

3、條形圖與散點圖

import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
norm_reviews = reviews[cols]
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']

# .ix was removed from pandas; .loc selects row label 0 and the listed columns.
bar_heights = norm_reviews.loc[0, num_cols].values     # height of each bar
bar_positions = arange(5) + 0.75     # first bar starts 0.75 from the origin; five bars spaced 1 apart
fig, ax = plt.subplots()
# 0.5 is the bar width. align='edge' keeps bar_positions as the left edge of
# each bar (the default became 'center' in Matplotlib 2.0).
ax.bar(bar_positions, bar_heights, 0.5, align='edge')
plt.show()

num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
bar_heights = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1,6)
fig, ax = plt.subplots()

# Bars span [0.75, 1.25], [1.75, 2.25], ... so the ticks at 1..5 sit at their centres.
ax.bar(bar_positions, bar_heights, 0.5, align='edge')
ax.set_xticks(tick_positions)
ax.set_xticklabels(num_cols, rotation=45)

ax.set_xlabel('Rating Source')     # x-axis label
ax.set_ylabel('Average Rating')     # y-axis label
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')   # chart title
plt.show()

輸出:技術分享圖片

當然,也可以將柱形圖變為橫著的

import matplotlib.pyplot as plt
from numpy import arange
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']

# norm_reviews is the DataFrame built in the bar-chart snippet above.
# .ix was removed from pandas; .loc selects row label 0 and the listed columns.
bar_widths = norm_reviews.loc[0, num_cols].values
bar_positions = arange(5) + 0.75
tick_positions = range(1,6)
fig, ax = plt.subplots()
# The only change from the vertical chart is bar -> barh. align='edge' keeps
# bar_positions as the lower edge of each bar so the ticks at 1..5 sit at the
# bar centres (the default became 'center' in Matplotlib 2.0).
ax.barh(bar_positions, bar_widths, 0.5, align='edge')

ax.set_yticks(tick_positions)
ax.set_yticklabels(num_cols)
ax.set_ylabel('Rating Source')
ax.set_xlabel('Average Rating')
ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
plt.show()

輸出:技術分享圖片

散點圖:

fig, ax = plt.subplots()
# scatter() draws a scatter plot: Fandango rating on x, Rotten Tomatoes user rating on y.
ax.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])
ax.set_xlabel('Fandango')
ax.set_ylabel('Rotten Tomatoes')
plt.show()

輸出:

技術分享圖片

畫兩個散點圖:

# Two stacked scatter plots of the same pair of columns with the axes swapped.
fig = plt.figure(figsize=(5,10))
ax1 = fig.add_subplot(2,1,1)
ax2 = fig.add_subplot(2,1,2)
ax1.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])
ax1.set_xlabel('Fandango')
ax1.set_ylabel('Rotten Tomatoes')
ax2.scatter(norm_reviews['RT_user_norm'], norm_reviews['Fandango_Ratingvalue'])
ax2.set_xlabel('Rotten Tomatoes')
ax2.set_ylabel('Fandango')
plt.show()

輸出:

技術分享圖片

用fig設置參數,ax做實際畫圖的操作

4、柱形圖與盒圖

求數據的頻數,並可視化

import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
print(norm_reviews[:5])      # show the first rows
fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()       # count occurrences of each value
fandango_distribution = fandango_distribution.sort_index()     # sort by rating value, ascending

imdb_distribution = norm_reviews['IMDB_norm'].value_counts()
imdb_distribution = imdb_distribution.sort_index()

print(fandango_distribution)    # frequency table, e.g. 4.3 occurring 6 times prints as "4.3     6"
print(imdb_distribution)        # frequency table for the other column
fig, ax = plt.subplots()
ax.hist(norm_reviews['Fandango_Ratingvalue'])       # draw the histogram
# Variants:
#   ax.hist(norm_reviews['Fandango_Ratingvalue'], bins=20)                  # bins=20 -> 20 bars in total
#   ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5), bins=20)    # range sets the x-axis interval
plt.show()
import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]

# Four histograms stacked vertically, all with the same bins, range and y-limits.
fig = plt.figure(figsize=(5,20))
ax1 = fig.add_subplot(4,1,1)
ax2 = fig.add_subplot(4,1,2)
ax3 = fig.add_subplot(4,1,3)
ax4 = fig.add_subplot(4,1,4)
ax1.hist(norm_reviews['Fandango_Ratingvalue'], bins=20, range=(0, 5))
ax1.set_title('Distribution of Fandango Ratings')
ax1.set_ylim(0, 50)    # fix the y-axis interval for this subplot

ax2.hist(norm_reviews['RT_user_norm'], 20, range=(0, 5))
ax2.set_title('Distribution of Rotten Tomatoes Ratings')
ax2.set_ylim(0, 50)

ax3.hist(norm_reviews['Metacritic_user_nom'], 20, range=(0, 5))
ax3.set_title('Distribution of Metacritic Ratings')
ax3.set_ylim(0, 50)

ax4.hist(norm_reviews['IMDB_norm'], 20, range=(0, 5))
ax4.set_title('Distribution of IMDB Ratings')
ax4.set_ylim(0, 50)

plt.show()

輸出:(在ml裏run一下,太長了)

盒圖(四分圖,找中位數):

import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
# Box plot of a single column, with the y-axis fixed to 0-5.
fig, ax = plt.subplots()
ax.boxplot(norm_reviews['RT_user_norm'])
ax.set_xticklabels(['Rotten Tomatoes'])
ax.set_ylim(0, 5)
plt.show()

輸出:

技術分享圖片

這樣,就可以清晰的看到中位數的位置以及大致的數據區間

也可以在一張圖上放入多張盒圖,這樣就可以區分各個屬性的特征了

import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
reviews = pd.read_csv('fandango_scores.csv')
cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
norm_reviews = reviews[cols]
num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
# Passing the 2-D array of the selected columns draws one box per column.
fig, ax = plt.subplots()
ax.boxplot(norm_reviews[num_cols].values)
ax.set_xticklabels(num_cols, rotation=90)
ax.set_ylim(0,5)
plt.show()

輸出:

技術分享圖片

5、閑的蛋疼系列:

可以將坐標軸去掉:

# Hide every border line (spine) of the axes.
for border in ax.spines.values():
    border.set_visible(False)

可以去掉坐標軸上的刻度線(鋸齒):

# tick_params expects booleans here; the string "off" is truthy, so in current
# Matplotlib it would leave the ticks visible.
ax.tick_params(bottom=False, top=False, left=False, right=False)

6、最後的一些方法

*****一般在做圖時為了讓圖中表達的清晰,讓圖盡量在一行或兩行

fig = plt.figure(figsize=(12, 12))   # tune the figsize argument to adjust the figure size

在作圖時的顏色可以用自己定義的顏色

# Custom colours
import pandas as pd
import matplotlib.pyplot as plt

women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']


# Custom colours are RGB tuples with each channel scaled to the 0-1 range.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))

for sp in range(0,4):
    ax = fig.add_subplot(2,2,sp+1)
    # The color for each line is assigned here.
    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c=cb_dark_blue, label='Women')
    ax.plot(women_degrees['Year'], 100-women_degrees[major_cats[sp]], c=cb_orange, label='Men')
    for key,spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0,100)
    ax.set_title(major_cats[sp])
    # Booleans are required; the string "off" is truthy and would keep the ticks.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

plt.legend(loc='upper right')
plt.show()

如果要改變線的寬度,可以在 ax.plot 中加入 linewidth 參數(以下兩行用來替換上面循環中的兩個 ax.plot 調用):

# linewidth sets the thickness of the plotted line. These two calls belong
# inside the ``for sp in ...`` loop, in place of its ax.plot calls.
ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c=cb_dark_blue, label='Women', linewidth=10)
ax.plot(women_degrees['Year'], 100-women_degrees[major_cats[sp]], c=cb_orange, label='Men', linewidth=10)

最終附上一波此例完整版:(其中有在圖中某一坐標上標出此點名稱):

import pandas as pd
import numpy as np
from numpy import arange
import matplotlib.pyplot as plt
women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']
# Custom colours as RGB tuples with each channel scaled to the 0-1 range.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

# First figure: six subplots in one row, with a legend.
fig = plt.figure(figsize=(18, 3))

for sp in range(0, 6):
    ax = fig.add_subplot(1, 6, sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[stem_cats[sp]], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[stem_cats[sp]], c=cb_orange, label='Men', linewidth=3)
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(stem_cats[sp])
    # Booleans are required; the string "off" is truthy and would keep the ticks.
    ax.tick_params(bottom=False, top=False, left=False, right=False)
plt.legend(loc='upper right')
plt.show()

# Second figure: same subplots, but the lines are labelled with text placed
# directly on the first and last subplot instead of a legend.
fig = plt.figure(figsize=(18, 3))

for sp in range(0, 6):
    ax = fig.add_subplot(1, 6, sp + 1)
    ax.plot(women_degrees['Year'], women_degrees[stem_cats[sp]], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[stem_cats[sp]], c=cb_orange, label='Men', linewidth=3)
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(stem_cats[sp])
    ax.tick_params(bottom=False, top=False, left=False, right=False)

    if sp == 0:            # only annotate the subplots selected by the if/elif
        ax.text(2005, 87, 'Men')    # write "Men" at data coordinates (2005, 87)
        ax.text(2002, 8, 'Women')
    elif sp == 5:
        ax.text(2005, 62, 'Men')
        ax.text(2001, 35, 'Women')
plt.show()

輸出:

技術分享圖片


機器學習之路--Matplotlib