1. 程式人生 > >python資料分析學習筆記七

python資料分析學習筆記七

第七章 訊號處理與時間序列

(需要統計學知識)

1 statsmodels 子庫

示例程式碼如下

import pkgutil as pu
import pydoc
import statsmodels as sm

# statmodels版本號
print("statmodels version", sm.__version__)


def clean(astr):
    s = astr
    # remove multiple spaces
    s = ' '.join(s.split())
    s = s.replace('=', '')

    return s


def print_desc(prefix, pkg_path):     print("pkg_path", pkg_path)     '''     :param prefix: 模組名稱     :param pkg_path:模組包路徑     :return:     '''     for pkg in pu.iter_modules(path=pkg_path):         '''Yields (module_loader,模組          name,子庫名          ispkg)是否'''         print("pkg", pkg)         name = prefix +
"." + pkg[1]         # 輸出子庫名,幫助文件資訊         if pkg[2] == True:             try:                 docstr = pydoc.plain(pydoc.render_doc(name))                 docstr = clean(docstr)                 start = docstr.find("DESCRIPTION")                 docstr = docstr[start: start + 140]                 print
(name, docstr)             except:                 continue print_desc("statsmodels", sm.__path__)

執行結果如下:

statmodels version 0.8.0rc1

statsmodels.base

statsmodels.compat

statsmodels.datasets

statsmodels.discrete

statsmodels.distributions

statsmodels.duration

statsmodels.emplike

statsmodels.formula

statsmodels.genmod

statsmodels.graphics

statsmodels.imputation

statsmodels.interface

statsmodels.iolib

statsmodels.miscmodels

statsmodels.multivariate

statsmodels.nonparametric DESCRIPTION Foran overview of this module, see docs/source/nonparametric.rst PACKAGE CONTENTS_kernel_base _smoothers_lowess api bandwidths

statsmodels.regression

statsmodels.resampling

statsmodels.robust

statsmodels.sandbox

statsmodels.src

statsmodels.stats

statsmodels.tools

statsmodels.tsa

2 移動平均值

示例程式碼如下:

import matplotlib.pyplot as plt
import statsmodels.api as sm
from pandas.stats.moments import rolling_mean

data_loader = sm.datasets.sunspots.load_pandas()
df = data_loader.data

year_range = df['YEAR'].values
plt.plot(year_range, df["SUNACTIVITY"].values, label="Original")
plt.plot(year_range, rolling_mean(df, 11)["SUNACTIVITY"].values, label="SMA 11")
plt.plot(year_range, rolling_mean(df, 22)["SUNACTIVITY"].values, label="SMA 22")
plt.legend()
plt.show()

方法二:

import matplotlib.pyplot as plt
import statsmodels.api as sm
from pandas.stats.moments import rolling_mean
import pandas.core.generic

data_loader = sm.datasets.sunspots.load_pandas()
df = data_loader.data
df11 = data_loader.data.rolling(window=11, center=False).mean()
df22 = data_loader.data.rolling(window=22, center=False).mean()
year_range = df['YEAR'].values
plt.plot(year_range, df["SUNACTIVITY"].values, label="Original")
# plt.plot(year_range, rolling_mean(df, window=11, center=False)["SUNACTIVITY"].values, label="SMA 11")
# plt.plot(year_range, rolling_mean(df, window=22, center=False)["SUNACTIVITY"].values, label="SMA 22")
plt.plot(year_range, df11["SUNACTIVITY"].values, label="SMA 11")
plt.plot(year_range, df22["SUNACTIVITY"].values, label="SMA 22")
plt.legend()
plt.show()

執行結果如下:

3 視窗函式

 是定義在一個區間(視窗)上的函式,超出定義,函式值取零,可以用來分析頻譜,設計濾波器

import matplotlib.pyplot as plt
import statsmodels.api as sm
from pandas.stats.moments import rolling_window
import pandas as pd

data_loader = sm.datasets.sunspots.load_pandas()
# 從尾部取150df = data_loader.data.tail(150)

df = pd.DataFrame({"SUNACTIVITY": df['SUNACTIVITY'].values}, index=df['YEAR'])
ax = df.plot()


def plot_window(win_type):
    # df2 = rolling_window(df, 22, win_type)
    df2 = df.rolling(window=22, win_type=win_type).mean()
    df2.columns = [win_type]
    # 顯示原始資料
    df2.plot(ax=ax)


# 矩形視窗
plot_window("boxcar")
# 三角形視窗
plot_window("triang")
# 布萊克曼視窗
plot_window("blackman")
# 漢寧窗
plot_window("hanning")
# 巴特萊特窗
plot_window("bartlett")
# 顯示網格
plt.grid()
plt.show()

執行結果如下:

4 協整的定義

示例程式碼如下:

import statsmodels.api as sm
from pandas.stats.moments import rolling_window
import pandas as pd
import statsmodels.tsa.stattools as ts
import numpy as np


# 用來計算ADF統計量
def calc_adf(x, y):
    result = sm.OLS(x, y).fit()
    return ts.adfuller(result.resid)


# 太陽黑子資料載入到numpy陣列
data_loader = sm.datasets.sunspots.load_pandas()
data = data_loader.data.values
N = len(data)

# 計算正弦值,求出該值與自身的協整關係
t = np.linspace(-2 * np.pi, 2 * np.pi, N)
sine = np.sin(np.sin(t))
print('Self ADF', calc_adf(sine, sine))

# 給正弦波新增噪音
noise = np.random.normal(0, .01, N)
print('ADF sine with noise', calc_adf(sine, sine + noise))

# 生成一個幅值和偏移量更大的餘弦波,並混入噪音
cosine = 100 * np.cos(t) + 10
print('ADF sine vs cosine with noise', calc_adf(sine, cosine + noise))
# 正弦與太陽黑子
print('Sine vs sunspots', calc_adf(sine, data))

執行結果如下:

Self ADF (2.1752959320935576e-16,

 0.95853208606005602,

0,

308,

 {'10%': -2.5717944160060719,

'1%': -3.4517611601803702,

'5%': -2.8709700936076912},

 -21598.896016765088)

ADF sine with noise (-11.80572756306368,

 9.1062110841151392e-22,

 2,

306,

{'10%': -2.5718274501260199,

'1%': -3.4519023023726696,

 '5%': -2.8710320399170537},

 -1857.1417094083959)

ADF sine vs cosine with noise(-6.9222457355201135,

1.1386106445203264e-09,

 16,

292,

{'10%': -2.5720714378870331,

 '1%': -3.4529449243622383,

'5%': -2.8714895534256861},

-10180.957513197414)

Sine vs sunspots (-6.7242691810700963,

 3.4210811915549913e-09,

 16,

292,

{'10%': -2.5720714378870331,

'1%': -3.4529449243622383,

'5%': -2.8714895534256861},

-1102.5867415291168)

5 自相關

資料集內部的相關性,可以用來指明趨勢

示例程式碼如下:

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pandas.tools.plotting import autocorrelation_plot

# 讀入測試資料
data_loader = sm.datasets.sunspots.load_pandas()
data = data_loader.data['SUNACTIVITY'].values

# 計算自相關值
y = data - np.mean(data)
norm = np.sum(y ** 2)
correlated = np.correlate(y, y, mode='full') / norm
res = correlated[len(correlated) / 2:]

# 關聯度最高值的索引
print(np.argsort(res)[-5:])
# 結果為[ 9 11 10  1  0]

# 繪製圖形
plt.plot(res)
plt.grid(True)
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.show()

# 使用pandas繪製
autocorrelation_plot(data)
plt.show()

執行結果如下:

使用pandas繪製

6 自迴歸模型

可用於預測時間序列將來的值

一元線性迴歸的計算公式(因變數y與自變數x之間的線性關係)

β0和β1為模型的引數

ε誤差項一個期望值為0的隨機變數

迴歸方程

E(y)=β0+β1x

自加歸模型的數學公式

示例程式碼如下:

from scipy.optimize import leastsq
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np


# 搭建模型程式碼
def model(p, x1, x10):
    p1, p10 = p
    return p1 * x1 + p10 * x10


def error(p, data, x1, x10):
    return data - model(p, x1, x10)


# 給引數表賦值
def fit(data):
    p0 = [.5, 0.5]
    params = leastsq(error, p0, args=(data[10:], data[9:-1], data[:-10]))[0]
    return params


# 載入資料
data_loader = sm.datasets.sunspots.load_pandas()
sunspots = data_loader.data['SUNACTIVITY'].values
print(sunspots)

# 訓練模型
cutoff = .9 * len(sunspots)
params = fit(sunspots[:cutoff])
print('Params', params)

# 取得各個指標的值
pred = params[0] * sunspots[cutoff - 1:-1] + params[1] * sunspots[cutoff - 10:-10]
actual = sunspots[cutoff:]
print('Root mean square error', np.sqrt(np.mean(actual - pred) ** 2))
print('Mean absolute error ', np.mean(np.abs(actual - pred)))
print('Mean absolute percentage error', 100 * np.mean(np.abs(actual - pred) / actual))
mid = (actual + pred) / 2
print('Symmetric Mean absolute percentage error', 100 * np.mean(np.abs(actual - pred) / mid))
print('Cofficient of determination', 1 - ((actual - pred) ** 2).sum() / ((actual - actual.mean()) ** 2).sum())
year_range = data_loader.data['YEAR'].values[cutoff:]

# 繪製圖像
# 太陽黑子活動數值
plt.plot(year_range, actual, 'o', label='Sunspots')
# 預測值
plt.plot(year_range, pred, 'x', label='Prediction')
plt.grid(True)
plt.xlabel('YEAR')
plt.ylabel('SUNACTIVITY')
plt.legend()
plt.show()

執行結果如下:

Params [ 0.67172672  0.33626295]

Root mean square error 1.02884848439

Mean absolute error  17.6515446503

Mean absolute percentage error60.7817800736

Symmetric Mean absolute percentage error34.9843386176

Cofficient of determination 0.799940292779

7 ARMA模型

由自迴歸模型和移動平均模型結合而成,用於時間序列的預測

示例程式碼如下:

import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime

# 載入資料
data_loader = sm.datasets.sunspots.load_pandas()
df = data_loader.data

# 擬合數據
years = df['YEAR'].values.astype(int)
str(years[0])
df.index = pd.Index(sm.tsa.datetools.dates_from_range(str(years[0]), str(years[-1])))

del df['YEAR']
# 預測資料
model = sm.tsa.ARMA(df, (10, 1)).fit()
prediction = model.predict('1975', str(years[-1]), dynamic=True)

# 繪製資料
# 太陽黑子活動資料
df['1975':].plot()
# 預測資料
prediction.plot(style='--', label='Prediction')
plt.grid(True)
plt.legend()
plt.show()

執行結果如下:

8 生成周期訊號

示例程式碼如下:

from scipy.optimize import leastsq
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np


# 搭建模型
def model(p, t):
    C, p1, f1, phi1, p2, f2, phi2, p3, f3, phi3 = p
    return C + p1 * np.sin(f1 * t + phi1) + p2 * np.sin(f2 * t + phi2) + p3 * np.sin(f3 * t + phi3)


def error(p, y, t):
    return y - model(p, t)


# 給引數表賦值
def fit(y, t):
    p0 = [y.mean(), 0, 2 * np.pi / 11, 0, 0, 2 * np.pi / 22, 0, 0, 2 * np.pi / 100, 0]
    params = leastsq(error, p0, args=(y, t))[0]
    return params


# 載入資料
data_loader = sm.datasets.sunspots.load_pandas()
sunspots = data_loader.data["SUNACTIVITY"].values
years = data_loader.data["YEAR"].values

# 訓練模型
cutoff = .9 * len(sunspots)
params = fit(sunspots[:cutoff], years[:cutoff])
print("Params", params)

# 取得各個指標的值
pred = model(params, years[cutoff:])
actual = sunspots[cutoff:]
print("Root mean square error", np.sqrt(np.mean((actual - pred) ** 2)))
print("Mean absolute error", np.mean(np.abs(actual - pred)))
print("Mean absolute percentage error", 100 * np.mean(np.abs(actual - pred) / actual))
mid = (actual + pred) / 2
print("Symmetric Mean absolute percentage error", 100 * np.mean(np.abs(actual - pred) / mid))
print("Coefficient of determination", 1 - ((actual - pred) ** 2).sum() / ((actual - actual.mean()) ** 2).sum())
year_range = data_loader.data["YEAR"].values[cutoff:]

# 繪製圖形
plt.plot(year_range, actual, 'o', label="Sunspots")
plt.plot(year_range, pred, 'x', label="Prediction")
plt.grid(True)
plt.xlabel("YEAR")
plt.ylabel("SUNACTIVITY")
plt.legend()
plt.show()

執行結果如下:

Params [ 47.18800156  28.89947466  0.56827281   6.51174621   4.55214731

  0.29372076 -14.3092358 -18.16524066   0.06574835  -4.37789397]

Root mean square error 59.5619988827

Mean absolute error 44.581532168  #平均絕結誤差

Mean absolute percentage error65.1643378479

Symmetric Mean absolute percentage error78.4479302694

Coefficient of determination-0.363528934815  #判定係數應儘量接近於1

9 傅量葉分析

FFT (fast fourier transform)快速傅立葉變換

示例程式碼如下:

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.fftpack import rfft
from scipy.fftpack import fftshift

# 載入資料
data_loader = sm.datasets.sunspots.load_pandas()
sunspots = data_loader.data["SUNACTIVITY"].values

# 建立一個正弦波
t = np.linspace(-2 * np.pi, 2 * np.pi, len(sunspots))
mid = np.ptp(sunspots) / 2
sine = mid + mid * np.sin(np.sin(t))

sine_fft = np.abs(fftshift(rfft(sine)))
# 最大振幅的相應索引
print("Index of max sine FFT", np.argsort(sine_fft)[-5:])

# 對太陽黑子進行 fft
transformed = np.abs(fftshift(rfft(sunspots)))
print("Indices of max sunspots FFT", np.argsort(transformed)[-5:])

# 太陽黑子活動資料和sine函式
plt.subplot(311)
plt.plot(sunspots, label="Sunspots")
plt.plot(sine, lw=2, label="Sine")
plt.grid(True)
plt.legend()

# 傅立葉變換後的太陽黑子活動資料
plt.subplot(312)
plt.plot(transformed, label="Transformed Sunspots")
plt.grid(True)
plt.legend()

# 傅立葉變換後的sine函式
plt.subplot(313)
plt.plot(sine_fft, lw=2, label="Transformed Sine")
plt.grid(True)
plt.legend()

plt.show()

執行結果如下:

Index of max sine FFT [160 157 166 158 154]

Indices of max sunspots FFT [205 212 215209 154]

10 譜分析

功率頻譜

示例程式碼如下:

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.fftpack import rfft
from scipy.fftpack import fftshift

# 載入資料
data_loader = sm.datasets.sunspots.load_pandas()
sunspots = data_loader.data["SUNACTIVITY"].values

# 建立一個正弦波
t = np.linspace(-2 * np.pi, 2 * np.pi, len(sunspots))
mid = np.ptp(sunspots) / 2
sine = mid + mid * np.sin(np.sin(t))

# 對正弦波進行FFT
sine_fft = np.abs(fftshift(rfft(sine)))
# 最大振幅的相應索引
print("Index of max sine FFT", np.argsort(sine_fft)[-5:])

# 對太陽黑子進行 fft
transformed = fftshift(rfft(sunspots))
print("Indices of max sunspots FFT", np.argsort(transformed)[-5:])

# 太陽黑子活動資料和sine函式
plt.subplot(311)
plt.plot(sunspots, label="Sunspots")
plt.plot(sine, lw=2, label="Sine")
plt.grid(True)
plt.legend()

# 繪製功率頻譜
plt.subplot(312)
plt.plot(transformed ** 2, label="Power Spectrum")
plt.grid(True)
plt.legend()

# 相位譜,正弦函式的起始角
plt.subplot(313)
plt.plot(np.angle(transformed), label="Phase Spectrum")
plt.grid(True)
plt.legend()

plt.show()

執行結果如下:

11 濾波

是一種訊號處理技術,可以對訊號的某些部分進行刪減或抑制,可以對高頻或是低頻進行濾波

中值濾波

Wiener濾波

Detrend濾波

示例程式碼如下:

import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.signal import medfilt
from scipy.signal import wiener
from scipy.signal import detr