1. 程式人生 > >Python視覺化庫matplotlib庫各種圖demo

Python視覺化庫matplotlib庫各種圖demo

關聯分析、數值比較:散點圖、曲線圖
分佈分析:直方圖、密度圖
涉及分類的分析:柱狀圖、箱式圖

核密度估計(Kernel density estimation),是一種用於估計概率密度函式的非引數方法,採用平滑的峰值函式(“核”)來擬合觀察到的資料點,從而對真實的概率分佈曲線進行模擬。
https://en.wikipedia.org/wiki/Kernel_density_estimation

核密度函式的基本想法是,在不知道某一事物概率分佈的情況下,如果某一個數在觀察中出現了,可以認為這個數的概率密度很大,和這個數比較近的數的概率密度也會比較大,而那些離這個數遠的數的概率密度會比較小。

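給定獨立同分布的n個樣本點 $x_1, x_2, \dots, x_n$,核密度估計為:

$$\hat{f}_h(x) = \frac{1}{n}\sum_{i=1}^{n} K_h(x - x_i) = \frac{1}{nh}\sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)$$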


K為核函式(非負、積分為1,符合概率密度性質,並且均值為0);有多種核函式,常用的是Gaussian KDE。h>0是一個平滑引數,稱作頻寬(bandwidth),也叫視窗。
針對每個樣本點,用K去擬合上述想象的遠小近大的概率密度。對每一個觀察數擬合出的多個概率密度分佈函式取平均。如果某些數是比較重要的,則可以取加權平均。總而言之,核密度估計通過核函式(如高斯)將每個資料點的資料+頻寬當作核函式的引數,得到N個核函式,再線性疊加就形成了核密度的估計函式,歸一化後得核密度概率密度函式。

匯入資料,基本處理

import os  # create the working directory
import shutil  # file operations (kept from the original import list)
import urllib.request  # download the dataset archive
import zipfile  # unpack the downloaded archive

import pandas as pd  # load the CSV into a DataFrame

# Working directory for the archive and the extracted CSV files.
# exist_ok=True makes re-running the cell safe; os.system('mkdir ...') never
# raises, so the original try/except fallback could not trigger.
os.makedirs('bike_data', exist_ok=True)

data_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'  # remote dataset URL
zipname = 'bike_data/Bike-Sharing-Dataset.zip'  # local path of the downloaded archive
urllib.request.urlretrieve(data_source, zipname)  # download the archive

# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile(zipname, 'r') as zip_ref:
    zip_ref.extractall('bike_data')

daily_path = 'bike_data/day.csv'
daily_data = pd.read_csv(daily_path)  # read the daily CSV file
daily_data['dteday'] = pd.to_datetime(daily_data['dteday'])  # parse date strings into datetimes
drop_list = ['instant', 'season', 'yr', 'mnth', 'holiday', 'workingday', 'weathersit', 'atemp', 'hum']  # columns not used below
daily_data.drop(drop_list, inplace = True, axis = 1)  # inplace=True modifies daily_data directly
daily_data.head()  # peek at the first rows

這裡寫圖片描述

配置引數

from __future__ import division, print_function # 引入3.x版本的除法和列印
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
# 在notebook中顯示繪圖結果
%matplotlib inline

# 設定一些全域性的資源引數,可以進行個性化修改
import matplotlib
# 設定圖片尺寸 14" x 7"
# rc: resource configuration
matplotlib.rc('figure', figsize = (14, 7))
# 設定字型 14
matplotlib.rc('font', size = 14)
# 不顯示頂部和右側的座標線
matplotlib.rc('axes.spines', top = False, right = False)
# 不顯示網格
matplotlib.rc('axes', grid = False)
# 設定背景顏色是白色
matplotlib.rc('axes', facecolor = 'white')

關聯分析

散點圖,分析變數關係

from matplotlib import font_manager  # font configuration helpers
# Font description: family 'SimHei', size 14.
# NOTE(review): fontP is not referenced by any of the visible plotting code.
fontP = font_manager.FontProperties(family='SimHei', size=14)

# Reusable scatter-plot helper.
def scatterplot(x_data, y_data, x_label, y_label, title):
    """Draw y_data against x_data as a scatter plot on a new figure.

    x_label, y_label and title are applied to the axes. Nothing is returned.
    """
    _, axes = plt.subplots()

    # Marker size, colour and transparency.
    # Colour chart reference: http://www.114la.com/other/rgb.htm
    marker_style = dict(s=10, color='#539caf', alpha=0.9)
    axes.scatter(x_data, y_data, **marker_style)

    # Title and axis captions.
    axes.set(title=title, xlabel=x_label, ylabel=y_label)

# Scatter plot of check-outs against temperature.
scatterplot(
    x_data=daily_data['temp'],
    y_data=daily_data['cnt'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Number of Check Outs vs Temperature',
)

這裡寫圖片描述

# Linear regression of check-outs on temperature.
import statsmodels.api as sm  # ordinary least squares
from statsmodels.stats.outliers_influence import summary_table  # per-observation summary
y = daily_data['cnt']
x = sm.add_constant(daily_data['temp'])  # add the constant term: y = k*x + b
regr = sm.OLS(y, x)  # ordinary least squares model
res = regr.fit()  # see also res.model.endog
# Pull the fitted data out of the model. Per the original notes: st is the
# summary table, data the detailed values, ss2 the column names; alpha=0.05.
st, data, ss2 = summary_table(res, alpha=0.05)
fitted_values = data[:, 2]  # per the original note, equivalent to res.fittedvalues

# Reusable line-plot helper.
def lineplot(x_data, y_data, x_label, y_label, title):
    """Draw y_data against x_data as a single line on a new figure.

    x_label, y_label and title are applied to the axes. Nothing is returned.
    """
    _, axes = plt.subplots()

    # lw is the line width, alpha the opacity.
    line_style = dict(lw=2, color='#539caf', alpha=1)
    axes.plot(x_data, y_data, **line_style)

    # Title and axis captions.
    axes.set(title=title, xlabel=x_label, ylabel=y_label)

# Plot the fitted regression line.
lineplot(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)

這裡寫圖片描述

帶置信區間的曲線圖,評估曲線擬合結果

# Lower and upper bounds of the confidence interval taken from the
# summary_table output (columns 4 and 5); with alpha=0.05 this is the 95% band.
predict_mean_ci_low = data[:, 4]
predict_mean_ci_upp = data[:, 5]

# Confidence-interval DataFrame: x values with their lower/upper bounds,
# ordered by x_data.
CI_df = pd.DataFrame({
    'x_data': daily_data['temp'],
    'low_CI': predict_mean_ci_low,
    'upper_CI': predict_mean_ci_upp,
}).sort_values('x_data')

# Line plot with a shaded confidence band.
def lineplotCI(x_data, y_data, sorted_x, low_CI, upper_CI, x_label, y_label, title):
    """Draw a fitted line plus a filled band between low_CI and upper_CI.

    The line uses x_data/y_data; the band is filled over sorted_x, which the
    caller in this file supplies already ordered. Nothing is returned.
    """
    _, axes = plt.subplots()

    # Fitted curve.
    axes.plot(x_data, y_data, lw=1, color='#539caf', alpha=1, label='Fit')
    # Confidence band, filled in the order given by sorted_x.
    band_style = dict(color='#539caf', alpha=0.4, label='95% CI')
    axes.fill_between(sorted_x, low_CI, upper_CI, **band_style)

    # Title and axis captions.
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)

    # Legend built from the label= arguments; loc='best' picks the position.
    axes.legend(loc='best')

# Plot the fitted line together with its confidence band.
lineplotCI(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    sorted_x=CI_df['x_data'],
    low_CI=CI_df['low_CI'],
    upper_CI=CI_df['upper_CI'],
    x_label='Normalized temperature (C)',
    y_label='Check outs',
    title='Line of Best Fit for Number of Check Outs vs Temperature',
)

這裡寫圖片描述

雙座標曲線圖,曲線擬合不滿足置信閾值時,考慮增加獨立變數;分析不同尺度多變數的關係

# Line plot with two y axes sharing one x axis.
def lineplot2y(x_data, x_label, y1_data, y1_color, y1_label, y2_data, y2_color, y2_label, title):
    """Plot y1_data on the left axis and y2_data on a twin right axis.

    Each y label is drawn in the colour of its series. Nothing is returned.
    """
    _, left_ax = plt.subplots()
    left_ax.plot(x_data, y1_data, color=y1_color)
    # Captions for the left axis, the shared x axis and the figure.
    left_ax.set_title(title)
    left_ax.set_xlabel(x_label)
    left_ax.set_ylabel(y1_label, color=y1_color)

    # Second axes object sharing the same x axis.
    right_ax = left_ax.twinx()
    right_ax.plot(x_data, y2_data, color=y2_color)
    right_ax.set_ylabel(y2_label, color=y2_color)
    # Make the right-hand axis line visible for the second scale.
    right_ax.spines['right'].set_visible(True)

# Check-outs and wind speed over time on two y axes.
lineplot2y(
    x_data=daily_data['dteday'],
    x_label='Day',
    y1_data=daily_data['cnt'],
    y1_color='#539caf',
    y1_label='Check outs',
    y2_data=daily_data['windspeed'],
    y2_color='#7663b0',
    y2_label='Normalized windspeed',
    title='Check Outs and Windspeed Over Time',
)

這裡寫圖片描述

分佈分析

直方圖,粗略區間計算

# Histogram helper.
def histogram(data, x_label, y_label, title):
    """Draw a 20-bin histogram of data on a new figure.

    Returns the value of ax.hist(); the caller in this file reads element 0
    as the per-bin counts and element 1 as the bin boundaries.
    """
    _, axes = plt.subplots()
    hist_result = axes.hist(data, color='#539caf', bins=20)  # bins sets the number of bins
    axes.set(ylabel=y_label, xlabel=x_label, title=title)
    return hist_result

# Histogram of registered check-outs.
res = histogram(
    data=daily_data['registered'],
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)
res[0]  # count in each bin
res[1]  # x coordinates of the bin boundaries

這裡寫圖片描述

堆疊直方圖,比較兩個分佈

# Two histograms overlaid on a shared set of bins.
def overlaid_histogram(data1, data1_name, data1_color, data2, data2_name, data2_color, x_label, y_label, title, max_nbins=10):
    """Overlay the histograms of data1 and data2 on a new figure.

    Both samples use the same bin edges, spanning the combined range of the
    two inputs in max_nbins equal-width bins. data2 is drawn on top with
    partial transparency. Nothing is returned.
    """
    # Combined data range so the two histograms line up bin for bin.
    lo = min(np.min(data1), np.min(data2))
    hi = max(np.max(data1), np.max(data2))
    if lo == hi:
        # Degenerate range (all values equal): widen it so bins have non-zero width.
        lo, hi = lo - 0.5, hi + 0.5
    # linspace yields exactly max_nbins + 1 edges; np.arange with a float
    # step can produce one edge more or fewer because of rounding.
    bins = np.linspace(lo, hi, max_nbins + 1)

    # Create the plot
    _, ax = plt.subplots()
    ax.hist(data1, bins = bins, color = data1_color, alpha = 1, label = data1_name)
    ax.hist(data2, bins = bins, color = data2_color, alpha = 0.75, label = data2_name)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc = 'best')

# Compare the distributions of registered and casual check-outs.
overlaid_histogram(
    data1=daily_data['registered'],
    data1_name='Registered',
    data1_color='#539caf',
    data2=daily_data['casual'],
    data2_name='Casual',
    data2_color='#7663b0',
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Check Outs By Type',
)

這裡寫圖片描述

密度圖,精細刻畫概率密度

# Estimate the probability density.
from scipy.stats import gaussian_kde
data = daily_data['registered']
# Kernel density estimate: https://en.wikipedia.org/wiki/Kernel_density_estimation
# bw_method sets the bandwidth factor through the public API (larger means a
# smoother curve), replacing the override of covariance_factor and the call
# to the private _compute_covariance().
density_est = gaussian_kde(data, bw_method=0.3)
# data is unordered; x_data is an increasing grid used for plotting.
x_data = np.arange(min(data), max(data), 200)

# Density-estimate curve helper.
def densityplot(x_data, density_est, x_label, y_label, title):
    """Plot density_est evaluated at x_data as a line on a new figure.

    density_est must be callable on x_data. Nothing is returned.
    """
    _, axes = plt.subplots()
    density_values = density_est(x_data)
    axes.plot(x_data, density_values, color='#539caf', lw=2)  # lw is the line width
    axes.set(ylabel=y_label, xlabel=x_label, title=title)

# Density curve of registered check-outs.
densityplot(
    x_data=x_data,
    density_est=density_est,
    x_label='Check outs',
    y_label='Frequency',
    title='Distribution of Registered Check Outs',
)
# Original note: type(density_est) is scipy.stats.kde.gaussian_kde

這裡寫圖片描述

分類組間分析

組間定量比較,分組粒度,組間聚類

柱狀圖,一級類間均值方差比較

# Mean and standard deviation of total check-outs per weekday.
# String aggregation names select pandas' own mean/std; passing the
# np.mean / np.std callables is deprecated in recent pandas releases.
mean_total_co_day = daily_data[['weekday', 'cnt']].groupby('weekday').agg(['mean', 'std'])
mean_total_co_day.columns = mean_total_co_day.columns.droplevel()  # flatten to single-level columns

# Bar chart with error bars.
def barplot(x_data, y_data, error_data, x_label, y_label, title):
    """Draw centred bars of y_data at x_data with error bars on a new figure.

    error_data gives the error-bar size for each bar (the caller in this file
    passes the per-group standard deviation). Nothing is returned.
    """
    _, axes = plt.subplots()
    # Bars.
    axes.bar(x_data, y_data, color='#539caf', align='center')
    # Error bars; ls='none' removes the line joining neighbouring bars.
    error_style = dict(color='#297083', ls='none', lw=5)
    axes.errorbar(x_data, y_data, yerr=error_data, **error_style)
    axes.set(ylabel=y_label, xlabel=x_label, title=title)

# Mean total check-outs per weekday with standard-deviation error bars.
barplot(
    x_data=mean_total_co_day.index.values,
    y_data=mean_total_co_day['mean'],
    error_data=mean_total_co_day['std'],
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)

這裡寫圖片描述

堆積柱狀圖,多級類間相對佔比比較

# Mean registered and casual check-outs per weekday.
mean_by_reg_co_day = daily_data.groupby('weekday')[['registered', 'casual']].mean()
# Share of registered and casual check-outs per weekday.
mean_by_reg_co_day['total'] = mean_by_reg_co_day['registered'].add(mean_by_reg_co_day['casual'])
mean_by_reg_co_day['reg_prop'] = mean_by_reg_co_day['registered'].div(mean_by_reg_co_day['total'])
mean_by_reg_co_day['casual_prop'] = mean_by_reg_co_day['casual'].div(mean_by_reg_co_day['total'])


# Stacked bar chart.
def stackedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw the series in y_data_list as stacked bars on a new figure.

    y_data_list, y_data_names and colors are parallel sequences, one entry
    per series. Each series is drawn on top of the sum of all series before
    it, so any number of series stacks correctly. Nothing is returned.
    """
    _, ax = plt.subplots()
    # Running top of the stack. The first series starts at zero, and each
    # later one starts where the previous ones ended. The original indexed
    # y_data_list[1 - i], which is only correct for exactly two series.
    bottom = np.zeros(len(x_data))
    for heights, color, name in zip(y_data_list, colors, y_data_names):
        heights = np.asarray(heights, dtype=float)
        ax.bar(x_data, heights, color = color, bottom = bottom, align = 'center', label = name)
        bottom = bottom + heights
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc = 'upper right')  # legend position

# Proportion of registered vs casual check-outs per weekday.
stackedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['reg_prop'], mean_by_reg_co_day['casual_prop']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Proportion of check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)

這裡寫圖片描述

分組柱狀圖,多級類間絕對數值比較
(按每一類遍歷,先畫第一類的x_data各柱子,再畫第二類的x_data各柱子)

# Grouped bar chart.
def groupedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw the series in y_data_list as side-by-side bars on a new figure.

    y_data_list, y_data_names and colors are parallel sequences, one entry
    per series. Each group of bars is centred on its x position and spans a
    total width of 0.8. x_data must be numeric. Nothing is returned.
    """
    _, ax = plt.subplots()
    # Total width occupied by one group of bars.
    total_width = 0.8
    n_series = len(y_data_list)
    if n_series:
        # Width of a single bar.
        ind_width = total_width / n_series
        # Centre offset of each bar within its group. Integer arange scaled by
        # the bar width gives exactly n_series offsets, with no float-step rounding.
        alteration = (np.arange(n_series) - (n_series - 1) / 2) * ind_width
        # As an array, so adding the offset also works for plain lists.
        positions = np.asarray(x_data)

        # Draw the series one at a time, spread out horizontally.
        for offset, heights, color, name in zip(alteration, y_data_list, colors, y_data_names):
            ax.bar(positions + offset, heights, color = color, label = name, width = ind_width)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc = 'upper right')

# Absolute registered vs casual check-outs per weekday, side by side.
groupedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['registered'], mean_by_reg_co_day['casual']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    x_label='Day of week',
    y_label='Check outs',
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
)

這裡寫圖片描述

箱式圖,多級類間資料分析比較,柱狀圖+堆疊直方圖

# Group the values by category; the box plot is then drawn from the groups.
days = np.unique(daily_data['weekday'])  # sorted distinct weekday values
# One array of 'cnt' values per weekday, in the order of days.
bp_data = [daily_data[daily_data['weekday'] == day]['cnt'].values for day in days]

# Box-plot helper.
def boxplot(x_data, y_data, base_color, median_color, x_label, y_label, title):
    """Draw one box per entry of y_data on a new figure.

    x_data supplies the tick labels, one per box. Nothing is returned.

    NOTE(review): as written, base_color colours the median line, box edge
    and caps, while median_color fills the box and colours the whiskers.
    Confirm this mapping is intended given the parameter names.
    """
    _, axes = plt.subplots()

    box_style = dict(
        # Fill the boxes with colour.
        patch_artist=True,
        # Median line colour.
        medianprops={'color': base_color},
        # Box colours: 'color' is the edge, 'facecolor' the fill.
        boxprops={'color': base_color, 'facecolor': median_color},
        # Whisker colour.
        whiskerprops={'color': median_color},
        # Whisker cap colour.
        capprops={'color': base_color},
    )
    axes.boxplot(y_data, **box_style)

    # Label the boxes with the entries of x_data.
    axes.set_xticklabels(x_data)
    axes.set(xlabel=x_label, ylabel=y_label, title=title)

# Distribution of total check-outs per weekday.
boxplot(
    x_data=days,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='Day of week',
    y_label='Check outs',
    title='Total Check Outs By Day of Week (0 = Sunday)',
)

這裡寫圖片描述