import pandas as pd
import statsmodels.api as sm
import matplotlib
import matplotlib.pyplot as plt
pd.set_option("display.max_colwidth", 80)
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = [10,10]
df=pd.read_csv("./data/tokyo-2016.csv")
df["TIMESTAMP"]=pd.to_datetime(df["DATE"])
df_sum=df.groupby("TIMESTAMP")["POWER"].sum().reset_index()
df_sum=df_sum.sort_values("TIMESTAMP")
df_sum= df_sum.set_index(['TIMESTAMP'])
res = sm.tsa.seasonal_decompose(df_sum)
observed=res.observed["POWER"]
seasonal=res.seasonal["POWER"]
trend=res.trend["POWER"]
resid=res.resid["POWER"]
df_new=pd.DataFrame(index=[])
df_new["observed"]=observed
df_new["trend"]=trend
df_new["seasonal"]=seasonal
df_new["resid"]=resid
df_new["check"]=df_new["trend"]+df_new["seasonal"]+df_new["resid"]
ct = sm.tsa.stattools.adfuller(df_sum["POWER"], regression="ct")
c = sm.tsa.stattools.adfuller(df_sum["POWER"], regression="c")
nc = sm.tsa.stattools.adfuller(df_sum["POWER"], regression="nc")
print("diff-0: ", ct[1],c[1],nc[1])
df_diff1=df_sum.diff(1)
diff = df_diff1.dropna(inplace=True)
ct = sm.tsa.stattools.adfuller(df_diff1["POWER"], regression="ct")
c = sm.tsa.stattools.adfuller(df_diff1["POWER"], regression="c")
nc = sm.tsa.stattools.adfuller(df_diff1["POWER"], regression="nc")
print("diff-1: ", ct[1],c[1],nc[1])
df_train= df_sum[df_sum.index < "2016-12"]
print(df_train)
df_test = df_sum[df_sum.index >= "2016-12"]
print(df_test)
def paramselect(df_train):
max_p = 3
max_q = 3
max_d = 2
max_sp = 1
max_sq = 1
max_sd = 1
pattern = max_p*(max_d + 1)*(max_q + 1)*(max_sp + 1)*(max_sq + 1)*(max_sd + 1)
modelSelection = pd.DataFrame(index=range(pattern), columns=["model", "aic"])
num = 0
for p in range(1, max_p + 1):
for d in range(0, max_d + 1):
for q in range(0, max_q + 1):
for sp in range(0, max_sp + 1):
for sd in range(0, max_sd + 1):
for sq in range(0, max_sq + 1):
sarima = sm.tsa.SARIMAX(
df_train, order=(p,d,q),
seasonal_order=(sp,sd,sq,12),
enforce_stationarity = False,
enforce_invertibility = False
).fit()
modelSelection.ix[num]["model"] = \
"order=(" + str(p) + ","+ str(d) + ","+ str(q) + "), \
season=("+ str(sp) + ","+ str(sd) + "," + str(sq) + ")"
modelSelection.ix[num]["aic"] = sarima.aic
num = num + 1
print(modelSelection.sort_values(by='aic').head())
model = sm.tsa.SARIMAX(df_train, order=(3,0,3), seasonal_order=(1,1,1,12),
enforce_stationarity=False,enforce_invertibility=False).fit()
ljungbox_result = sm.stats.diagnostic.acorr_ljungbox(model.resid, lags=10)
df_ljungbox_result = pd.DataFrame({"p-value":ljungbox_result[1]})
df_tmp=df_ljungbox_result[df_ljungbox_result["p-value"] < 0.05]
print(df_tmp)
pred=model.predict('2016-05-01', '2016-11-30')
plt.figure()
plt.plot(df_train, label="train" ,color="blue")
plt.plot(pred, "r", label="pred" ,color="red")
plt.legend()
plt.title("sarimax_overfit")
plt.savefig("./timelog/sarimax_overfit.png")
pred=model.predict('2016-11-30', '2016-12-31')
plt.figure()
plt.plot(df_train, label="train" ,color="blue")
plt.plot(pred, "r", label="pred" ,color="red")
plt.legend()
plt.title("sarimax_train")
plt.savefig("./timelog/sarimax_pred.png")
pred=model.predict('2016-11-30', '2016-12-31')
plt.figure()
plt.plot(df_test, label="test" ,color="blue")
plt.plot(pred, "r", label="pred" ,color="red")
plt.legend()
plt.title("sarimax_test")
plt.savefig("./timelog/sarimax_check.png")
参考