プログラミング備忘録

初級プログラマ。python、DL勉強中

sarimax

import pandas  as pd
import statsmodels.api as sm

import matplotlib
import matplotlib.pyplot as plt


### pandas setting
#pd.get_option("display.max_columns")
#pd.set_option('display.max_columns', 5)
pd.set_option("display.max_colwidth", 80)

### pyplot setting 

plt.style.use('ggplot')
#font = {'family' : 'meiryo'}
#matplotlib.rc('font', **font)

plt.rcParams["figure.figsize"] = [10,10]


df=pd.read_csv("./data/tokyo-2016.csv")
df["TIMESTAMP"]=pd.to_datetime(df["DATE"])
#print(df["TIMESTAMP"])

### グループ化
df_sum=df.groupby("TIMESTAMP")["POWER"].sum().reset_index()
df_sum=df_sum.sort_values("TIMESTAMP")
df_sum= df_sum.set_index(['TIMESTAMP'])
#print(df_sum)

### plot
#df_sum.plot(alpha=0.6)
#plt.title("timeplot")
#plt.savefig("./timelog/plot.png")

### トレンド、季節性、ノイズ
res = sm.tsa.seasonal_decompose(df_sum)
#res.plot()
#plt.title("trend")
#plt.savefig("./timelog/trend.png")

observed=res.observed["POWER"]
seasonal=res.seasonal["POWER"]
trend=res.trend["POWER"]
resid=res.resid["POWER"]

df_new=pd.DataFrame(index=[])
df_new["observed"]=observed
df_new["trend"]=trend
df_new["seasonal"]=seasonal
df_new["resid"]=resid

#print(df_new)


df_new["check"]=df_new["trend"]+df_new["seasonal"]+df_new["resid"]
#print(df_new)

### 単位値検定
# トレンド項あり、定数項あり
ct = sm.tsa.stattools.adfuller(df_sum["POWER"], regression="ct")
# トレンド項なし、定数項あり
c = sm.tsa.stattools.adfuller(df_sum["POWER"], regression="c")
# トレンド項なし、定数項なし
nc = sm.tsa.stattools.adfuller(df_sum["POWER"], regression="nc")
# 確認
print("diff-0: ", ct[1],c[1],nc[1])


df_diff1=df_sum.diff(1)
diff = df_diff1.dropna(inplace=True)
# トレンド項あり、定数項あり
ct = sm.tsa.stattools.adfuller(df_diff1["POWER"], regression="ct")
# トレンド項なし、定数項あり
c = sm.tsa.stattools.adfuller(df_diff1["POWER"], regression="c")
# トレンド項なし、定数項なし
nc = sm.tsa.stattools.adfuller(df_diff1["POWER"], regression="nc")
# 確認
print("diff-1: ", ct[1],c[1],nc[1])

#df_diff1.plot()
#plt.title("diff1")
#plt.savefig("./timelog/diff1.png")

#print(df_sum)

df_train= df_sum[df_sum.index < "2016-12"]
print(df_train)
df_test = df_sum[df_sum.index >= "2016-12"]
print(df_test)

def paramselect(df_train):
    max_p = 3 #AR
    max_q = 3 #MA
    max_d = 2 #diff
    max_sp = 1 #seasonal AR
    max_sq = 1 #seasonal MA
    max_sd = 1 #seasonal diff

    pattern = max_p*(max_d + 1)*(max_q + 1)*(max_sp + 1)*(max_sq + 1)*(max_sd + 1)

    modelSelection = pd.DataFrame(index=range(pattern), columns=["model", "aic"])

    # 自動SARIMA選択
    num = 0

    for p in range(1, max_p + 1):
        for d in range(0, max_d + 1):
            for q in range(0, max_q + 1):
                for sp in range(0, max_sp + 1):
                    for sd in range(0, max_sd + 1):
                        for sq in range(0, max_sq + 1):
                            sarima = sm.tsa.SARIMAX(
                                df_train, order=(p,d,q),
                                seasonal_order=(sp,sd,sq,12),
                                enforce_stationarity = False,
                                enforce_invertibility = False
                            ).fit()
                            modelSelection.ix[num]["model"] =  \
                                "order=(" + str(p) + ","+ str(d) + ","+ str(q) + "), \
                                season=("+ str(sp) + ","+ str(sd) + "," + str(sq) + ")"
                            modelSelection.ix[num]["aic"] = sarima.aic
                            num = num + 1

    print(modelSelection.sort_values(by='aic').head())

### parameter select
#paramselect(df_train)

### SARIMAX model
model = sm.tsa.SARIMAX(df_train, order=(3,0,3), seasonal_order=(1,1,1,12),
                        enforce_stationarity=False,enforce_invertibility=False).fit()
ljungbox_result = sm.stats.diagnostic.acorr_ljungbox(model.resid, lags=10)
df_ljungbox_result = pd.DataFrame({"p-value":ljungbox_result[1]})
df_tmp=df_ljungbox_result[df_ljungbox_result["p-value"] <  0.05]
print(df_tmp)


### pred(overfitting)
pred=model.predict('2016-05-01', '2016-11-30')
plt.figure()
plt.plot(df_train, label="train" ,color="blue")
plt.plot(pred, "r", label="pred" ,color="red")
plt.legend()
plt.title("sarimax_overfit")
plt.savefig("./timelog/sarimax_overfit.png")


### pred
pred=model.predict('2016-11-30', '2016-12-31')
plt.figure()
plt.plot(df_train, label="train" ,color="blue")
plt.plot(pred, "r", label="pred" ,color="red")
plt.legend()
plt.title("sarimax_train")
plt.savefig("./timelog/sarimax_pred.png")


### predcheck
pred=model.predict('2016-11-30', '2016-12-31')
plt.figure()
plt.plot(df_test, label="test" ,color="blue")
plt.plot(pred, "r", label="pred" ,color="red")
plt.legend()
plt.title("sarimax_test")
plt.savefig("./timelog/sarimax_check.png")

参考