import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels as sm
%matplotlib inline
# Read Data
dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m')
data = pd.read_csv('data/AirPassengers.csv', parse_dates=['Month'], index_col='Month',date_parser=dateparse)
ts = data['#Passengers']
sns.tsplot(ts)
# Stationary Test
import statsmodels.api as smapi
stationary = smapi.tsa.stattools.adfuller(ts)
print('=> original data stationary:', stationary, sep='\n')
# P > 0.05, not stationary
# Try difference on the series to make it stationary
ts_diff1 = ts.diff().dropna()
stationary = sm.api.tsa.stattools.adfuller(ts_diff1)
print('Diff 1:', stationary, sep='\n')
# P ~= 0.05 and the test statistics belongs to (5%, 10%)
# Thus, if the critical level we choose is 10%, the d=1,
# otherwise, we need do difference again
# Supposing we choose critical level = 5%
ts_diff2 = ts_diff1.diff().dropna()
stationary = sm.api.tsa.stattools.adfuller(ts_diff2)
print('Diff 2:', stationary, sep='\n')
# P < 0.05, so the d = 2
# Ploting the ACF & PACF to calculate q & p for MA & AR
sm.graphics.tsaplots.plot_acf(ts_diff2, lags=20)
sm.graphics.tsaplots.plot_pacf(ts_diff2, lags=20)
# Both ACF & PACF < 0, so the q = p = 0
# Here we got the model ARIMA(0,2,0)
# compare the fitted model with original data
# if do not add this, statsmodel will report error
ts = ts.astype(float)
model = sm.tsa.arima_model.ARIMA(ts, order=(0,2,0))
fitted = model.fit(disp=-1)
fitted.plot_predict(start=ts.index[3],
end=ts.index[-1] + relativedelta(months=0), alpha=0.1)
fitted.pvalues
Taking back on the process, where did we go wrong?
Notice that we d=1, it has already approximate to stationary, but we still do another difference again. Is this appropriate or not?
How about making d=1, and try to fit new model? Following section will do this.
# Plot ACF & PACF for ts_diff1
#plt.plot(ts_diff1)
from statsmodels.graphics import tsaplots
plt.figure()
tsaplots.plot_acf(ts_diff1, lags=20)
tsaplots.plot_pacf(ts_diff1, lags=20)
# Obviously, the p = q = 1
# Thus, we will fit model ARIMA(1,1,1)
ts = ts.astype(float)
model = sm.tsa.arima_model.ARIMA(ts, order=(1,1,1))
fitted = model.fit(disp=-1)
fitted.plot_predict(start=ts.index[1],
end=ts.index[-1] + relativedelta(months=0), alpha=0.1)
fitted.pvalues
Although the new model is better, but still not good enougth.
It's noted that the var after diff=1 still goes large, so how about do a log before diff?
ts = ts.astype(float)
tslog = np.log(ts)
tslog_diff1 = tslog.diff().dropna()
plt.plot(tslog_diff1)
plt.figure()
tsaplots.plot_acf(tslog_diff1, lags=20)
tsaplots.plot_pacf(tslog_diff1, lags=20)
# p = q = 1
# fitting ARIMA(1,1,1) after log
model = sm.tsa.arima_model.ARIMA(tslog, order=(1,1,1))
fitted = model.fit(disp=-1)
fitted.plot_predict(start=tslog.index[1],
end=tslog.index[-1] + relativedelta(months=0), alpha=0.1)
fitted.pvalues
Look great, but the seasonal seems not expressed good! How about try add p or q, or both?
model = sm.tsa.arima_model.ARIMA(tslog, order=(2,1,2))
fitted = model.fit(disp=-1)
fitted.plot_predict(start=tslog.index[1],
end=tslog.index[-1] + relativedelta(months=0), alpha=0.1)
fitted.pvalues