1. 程式人生 > >Python時間序列分析

Python時間序列分析

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima_model import ARIMA #ARIMA模型
import patsy
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures,load_robot_execution_failures
from tsfresh import extract_features,extract_relevant_features,select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


"""
#TIMES 2018 Oct 12 10/12/2018 2018-10-12 2018/10/12
rng = pd.date_range("2018-10-12",periods=10,freq="D") #D 天(3D 三天);M 月;H 小時
#print(rng)

#將時間作為索引
time_index = pd.Series(np.random.randn(20),index=pd.date_range("2018-10-12",periods=20,freq="D")) #預設為D增加
#print(time_index)
#print(time_index["2018-10-12"])

#過濾
time_index_tr = time_index.truncate(before="2018-10-20") #before2018-10-20之前的資料都不要;after是之後的資料都不要
#print(time_index_tr)

#時間戳
time_stamp = pd.Timestamp("2018-10-12 19")
#print(time_stamp) #2018-10-12 19:00:00

time_delta = pd.Timestamp("2018-10-12 19") + pd.Timedelta("1 day")
#print(time_delta) #2018-10-13 19:00:00

#時間週期
time_period = pd.period_range("2018-10-12","2018-12-12",freq="M")
#print(time_period) #這是包括2018-10-12
"""

"""
#資料重取樣
rng = pd.date_range("1/1/2011",periods=30,freq="3D")
ts = pd.Series(np.random.randn(len(rng)),index=rng)
print(ts)
#降取樣
ts_dec_re = ts.resample("M").sum()
print(ts_dec_re)

#升取樣(ffill空值取前面的值,bfill空值取後面的值,interpolate線性取值)
ts_asc_re = ts.resample("D").interpolate("linear")
print(ts_asc_re)
"""

"""
#滑動視窗
df = pd.Series(np.random.randn(600),index=pd.date_range("2018-1-1",freq="D",periods=600)) #資料的生成
print(df.head()) #預設列印5行
r = df.rolling(window=10)
print(r.mean()) #mean median(中位數) std(標準差) skew(傾斜度) sum var
"""

"""
#平穩性(差分)
fr = pd.read_csv("filename")
fr["diff_1"] = fr["column_name"].diff(1) #column_name這列資料的一階差分
fr["diff_2"] = fr["diff_1"].diff(1) #diff_1這列資料的一階差分
"""

"""
#ARIMA(差分自迴歸移動平均模型)
#AR自迴歸,p為自迴歸項,MA為移動平均,q為移動平均項數,d為時間序列成為平穩時所做的差分次數
#原理:將非平穩的時間序列轉化為平穩時間序列,然後僅對因變數的滯後值以及隨機誤差項的現值和滯後值進行迴歸建立模型
#步驟:平穩化(可以確定d);p q的確定;呼叫ARIMA模型

#股票預測案列(做迴歸)
stockFile = "T10yr.csv"
stock = pd.read_csv(stockFile,index_col=0,parse_dates=[0]) #0列做索引
#print(stock.head())

#按Close列進行分析
stock_week = stock["Close"].resample("W-MON").mean() #Close列按照每一週的均值進行重取樣
stock_train = stock_week["2000":"2015"] #訓練資料

#plt.plot(stock_train)
#plt.show() #資料的波動比較大,需要差分來平穩化

stock_diff = stock_train.diff(1)
stock_diff = stock_diff.dropna() #去除空值

#plt.plot(stock_diff)
#plt.show() #一階差分基本上可以平穩化

#畫ACF(確定q)
#acf = plot_acf(stock_diff,lags=20)
#acf.show() #q=1
#pacf = plot_pacf(stock_diff,lags=20)
#pacf.show() #p=1

model = ARIMA(stock_train,order=(1,1,1),freq="W-MON")  #order=(1,1,1) p d q
result = model.fit()
#print(result.summary())
pred = result.predict("2015-05-04","2016-04-18",dynamic=True,typ="levels") #"2015-05-04"(開始值)必須是資料集中已有的值,否則報錯
#print(pred)

plt.plot(pred)
plt.plot(stock_train)
#plt.show()
"""

"""
#時間序列的分類任務(tsfresh庫做特徵的提取和過濾)
download_robot_execution_failures()
df,y = load_robot_execution_failures() #匯入tsfresh官網上例子資料
print(df.head())

#特徵提取
extraction_settings = ComprehensiveFCParameters() #例項化
X = extract_features(df,column_id="id",column_sort="time",
                     default_fc_parameters=extraction_settings,impute_function=impute)
print(X.head()) #列印提取完的特徵

#特徵過濾
X_filtered = extract_relevant_features(df,y,column_id="id",column_sort="time",default_fc_parameters=extraction_settings)

#切分資料集
x_train,x_test,x_filtered_train,x_filtered_test,y_train,y_test = train_test_split(X,X_filtered,y,test_size=0.4)

#訓練 預測
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)
print(classification_report(y_test,dtc.predict(x_test)))
"""

# Load the data set and replace every missing cell with 0.
train = (
    pd.read_csv("train_1.csv")
    .fillna(0)
)
# print(train.head())
# The values load as floats, but every one has the form "x.0": the
# fractional part carries no information, so convert them to integers.

# Convert column by column, skipping the first column; downcast="integer"
# asks pandas to downcast each column to an integer dtype.
for col in train.columns[1:]:
    train[col] = pd.to_numeric(
        train[col],
        downcast="integer",
    )
# print(train.head())