SARIMAX Project
Project investigating Portland transit ridership using the SARIMAX algorithm
In [ ]:
# The purpose of this notebook is to play around with the SARIMAX algorithm in Python.
# I was following this page for data and steps: https://www.kaggle.com/code/yemi99/time-series-forecasting-with-sarimax-model
# These articles have some good information about the algorithm:
# https://medium.com/swlh/a-brief-introduction-to-arima-and-sarima-modeling-in-python-87a58d375def
# https://www.statsmodels.org/stable/examples/notebooks/generated/statespace_sarimax_stata.html
# This page has information about the python function: https://www.statsmodels.org/dev/generated/statsmodels.tsa.statespace.sarimax.SARIMAX.html
In [ ]:
# My GitHub page for this project is here: https://github.com/aabromowitz/SarimaxProject
In [ ]:
# S = seasonal
# AR = Autoregressive
# I = Integrated
# MA = Moving average
# X = exogenous
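As a hedged side note (not from the original walkthrough), here is how those pieces map onto the statsmodels constructor used later in this notebook. The synthetic series and the specific orders are placeholders for illustration only:
In [ ]:
# Hedged sketch: mapping the SARIMAX acronym onto the statsmodels constructor.
# The synthetic series and the orders below are purely illustrative.
import numpy as np
import pandas as pd
import statsmodels.api as sm

idx = pd.date_range('2000-01-01', periods=48, freq='MS')
y = pd.Series(100 + np.arange(48) + 10 * np.sin(2 * np.pi * np.arange(48) / 12), index=idx)
sketch = sm.tsa.statespace.SARIMAX(
    y,
    order=(1, 1, 1),               # (p, d, q): the AR, I, and MA parts
    seasonal_order=(1, 1, 1, 12),  # (P, D, Q, s): the Seasonal part, with period s=12
    exog=None,                     # the eXogenous regressors (none in this sketch)
)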
In [1]:
# Importing
import pandas as pd
import numpy as np
import statsmodels
from math import sqrt
import matplotlib.pyplot as plt
import statsmodels.api as sm
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA  # the old statsmodels.tsa.arima_model path is deprecated
from pandas.plotting import autocorrelation_plot
from pandas.tseries.offsets import DateOffset
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
In [2]:
# Cleaning up data
data = pd.read_csv(r'C:\Users\aabro\OneDrive\Desktop\PersonalCodingProjects\Sarimax\portland-oregon-average-monthly-.csv')
data.columns = ['Month', 'Riders']
data.drop(114, axis=0, inplace=True)
data['Riders'] = data['Riders'].astype('int')
data['Month'] = pd.to_datetime(data['Month'])
data.set_index('Month', inplace=True)
General EDA¶
In [ ]:
# I wanted to mark off this section in the notebook, since it might be useful for later analysis I do with data.
In [2]:
# Pull in data from the folder on my computer: C:\Users\aabro\OneDrive\Desktop\PersonalCodingProjects\Sarimax
data = pd.read_csv(r'C:\Users\aabro\OneDrive\Desktop\PersonalCodingProjects\Sarimax\portland-oregon-average-monthly-.csv')
In [ ]:
# Check the top rows
data.head()
In [4]:
# The second column needs to be renamed
data.columns = ['Month', 'Riders']
In [7]:
data.head() # that's better
Out[7]:
|   | Month | Riders |
|---|---|---|
| 0 | 1960-01 | 648 |
| 1 | 1960-02 | 646 |
| 2 | 1960-03 | 639 |
| 3 | 1960-04 | 654 |
| 4 | 1960-05 | 630 |
In [8]:
data.shape
Out[8]:
(115, 2)
In [10]:
data.tail() # need to deal with the last row
Out[10]:
|     | Month | Riders |
|-----|---|---|
| 110 | 1969-03 | 1419 |
| 111 | 1969-04 | 1432 |
| 112 | 1969-05 | 1394 |
| 113 | 1969-06 | 1327 |
| 114 | Portland Oregon average monthly bus ridership ... | n=114 |
In [11]:
data.drop(114, axis=0, inplace=True)
In [12]:
data.tail() # that's better
Out[12]:
|     | Month | Riders |
|-----|---|---|
| 109 | 1969-02 | 1425 |
| 110 | 1969-03 | 1419 |
| 111 | 1969-04 | 1432 |
| 112 | 1969-05 | 1394 |
| 113 | 1969-06 | 1327 |
In [13]:
data.dtypes # datatypes shouldn't be "object"
Out[13]:
Month     object
Riders    object
dtype: object
In [15]:
# Set the datatypes to int and datetime
data['Riders'] = data['Riders'].astype('int')
data['Month'] = pd.to_datetime(data['Month'])
In [16]:
data.dtypes # no longer "object"
Out[16]:
Month     datetime64[ns]
Riders    int32
dtype: object
In [17]:
# We need to set the month as the index. This is important in time series.
data.set_index('Month', inplace=True)
In [18]:
data.head() # It looks like the Riders header is over the Month header. My guess is the Month column is the index for the dataframe.
Out[18]:
| Month | Riders |
|---|---|
| 1960-01-01 | 648 |
| 1960-02-01 | 646 |
| 1960-03-01 | 639 |
| 1960-04-01 | 654 |
| 1960-05-01 | 630 |
In [19]:
data.describe()
Out[19]:
|       | Riders |
|-------|---|
| count | 114.000000 |
| mean  | 1120.543860 |
| std   | 270.888317 |
| min   | 613.000000 |
| 25%   | 885.000000 |
| 50%   | 1158.000000 |
| 75%   | 1340.000000 |
| max   | 1558.000000 |
Visualize the data in plots¶
In [3]:
# Plot the riders over time
data.plot(figsize=(12,6))
Out[3]:
<Axes: xlabel='Month'>
In [4]:
# Split the data into its Systematic (Seasonal and Non-Seasonal) and Non-Systematic components by using the seasonal_decompose function
results = seasonal_decompose(data['Riders']);
results.plot();
In [ ]:
# This page has a nice description of what the seasonal_decompose function is trying to do: https://machinelearningmastery.com/decompose-time-series-data-trend-seasonality/
# Systematic: Components of the time series that have consistency or recurrence and can be described and modeled.
# Non-Systematic: Components of the time series that cannot be directly modeled.
# A given time series is thought to consist of three systematic components including level, trend, seasonality, and one non-systematic component called noise.
# Level: The average value in the series.
# Trend: The increasing or decreasing value in the series.
# Seasonality: The repeating short-term cycle in the series.
# Noise: The random variation in the series.
# You can assume that the model is additive (data = level + trend + seasonality + noise) or multiplicative (data = level * trend * seasonality * noise).
# I will try plotting both just to see what it looks like.
In [5]:
# EXTRA: Additive model
resultsAdd = seasonal_decompose(data['Riders'], model = 'additive');
resultsAdd.plot(); # This looks the same as above, so apparently that is the default for the seasonal_decompose function
In [6]:
# EXTRA: Multiplicative model
resultsMult = seasonal_decompose(data['Riders'], model = 'multiplicative');
resultsMult.plot(); # Actually, this looks a lot more similar to the Additive model than I was expecting.
In [7]:
# Augmented Dickey Fuller test (ADF Test) is a common statistical test used to test whether a given Time series is stationary or not.
# Here is some nice background info about it, and how to call it in Python: https://www.jdeconomics.com/python-tutorials/stationarity-in-python
def adfuller_test(riders):
    result = adfuller(riders)
    labels = ['ADF Test Statistic', 'p-value', '#Lags used', 'Number of Observations used']
    for value, label in zip(result, labels):
        print(label + ' : ' + str(value))
    if result[1] <= 0.05:
        print('strong evidence against the null hypothesis(Ho), reject the null hypothesis, data is stationary')
    else:
        print('weak evidence against the null hypothesis(Ho), data is not stationary')
In [8]:
# Do an ADF Test and output results
adfuller_test(data['Riders'])
ADF Test Statistic : -1.5365971444531592
p-value : 0.5153358797821737
#Lags used : 12
Number of Observations used : 101
weak evidence against the null hypothesis(Ho), data is not stationary
In [9]:
# Plot the data, with indicators for the years
ax = data.plot(figsize=(12,6))
xcoords = ['1960-01-01', '1961-01-01','1962-01-01', '1963-01-01', '1964-01-01', '1965-01-01',
'1966-01-01', '1967-01-01', '1968-01-01', '1969-01-01']
for xc in xcoords:
    plt.axvline(x=xc, color='black', linestyle='--')
In [10]:
# Plotting the Autocorrelation and Partial Autocorrelation.
# This page has some info about these two terms: https://statisticsbyjim.com/time-series/autocorrelation-partial-autocorrelation/
# Autocorrelation is the correlation between two values in a time series.
# The partial autocorrelation function is similar to the ACF,
# except that it displays only the correlation between two observations that the shorter lags between those observations do not explain.
testing = data['Riders'].dropna()
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(testing, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(testing, lags=40, ax=ax2)
# Since the points in the Autocorrelation plot are tapering off slowly, this indicates that there is a trend to the data.
# However, the points are going pretty steadily down instead of waving, which indicates not much seasonality.
# I thought the plots above looked pretty seasonal though.
# Since the first 2 points in the partial autocorrelation plot are above the shaded part,
# this indicates fitting a 2nd order autoregressive model
# This page explains autoregressive models: https://sjmiller8182.github.io/LearningTS/build/autoregressive_models.html
# The interpretation of AR(2) is
# "the value at time t is a linear combination of two values at the two previous time periods plus a random noise component".
In [11]:
# Add a 12-month difference (value minus the value 12 months earlier) to the data
data['Seasonal_1st_diff'] = data['Riders'] - data['Riders'].shift(12)
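Small hedged aside: pandas has a one-step equivalent of this shift-and-subtract, which can be sanity-checked against the column just created:
In [ ]:
# Hedged aside: subtracting a 12-period shift is the same as pandas' diff(12).
print(data['Seasonal_1st_diff'].equals(data['Riders'].diff(12)))  # expect True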
In [12]:
# Check to make sure the shift was added correctly
data.head(20)
Out[12]:
| Month | Riders | Seasonal_1st_diff |
|---|---|---|
| 1960-01-01 | 648 | NaN |
| 1960-02-01 | 646 | NaN |
| 1960-03-01 | 639 | NaN |
| 1960-04-01 | 654 | NaN |
| 1960-05-01 | 630 | NaN |
| 1960-06-01 | 622 | NaN |
| 1960-07-01 | 617 | NaN |
| 1960-08-01 | 613 | NaN |
| 1960-09-01 | 661 | NaN |
| 1960-10-01 | 695 | NaN |
| 1960-11-01 | 690 | NaN |
| 1960-12-01 | 707 | NaN |
| 1961-01-01 | 817 | 169.0 |
| 1961-02-01 | 839 | 193.0 |
| 1961-03-01 | 810 | 171.0 |
| 1961-04-01 | 789 | 135.0 |
| 1961-05-01 | 760 | 130.0 |
| 1961-06-01 | 724 | 102.0 |
| 1961-07-01 | 704 | 87.0 |
| 1961-08-01 | 691 | 78.0 |
In [13]:
# Check if the data is stationary when looking at the change between years
adfuller_test(data['Seasonal_1st_diff'].dropna())
# It still says that the data is not stationary, but the p value is closer to .05
ADF Test Statistic : -2.4697405635319685
p-value : 0.12301141534048127
#Lags used : 3
Number of Observations used : 98
weak evidence against the null hypothesis(Ho), data is not stationary
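Since the seasonal difference alone still fails the test, one hedged next step (not taken in the original walkthrough) would be to stack a regular first difference on top of it and re-test; in SARIMAX terms this corresponds to using d=1 together with D=1:
In [ ]:
# Hedged sketch: combine a regular first difference with the seasonal difference
# and re-run the ADF test on the result.
combined_diff = data['Riders'].diff().diff(12).dropna()
adfuller_test(combined_diff)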
In [14]:
# Plot the season difference
data['Seasonal_1st_diff'].plot()
Out[14]:
<Axes: xlabel='Month'>
In [15]:
# Add the year lines again
ax = data['Seasonal_1st_diff'].plot(figsize=(12,6), linewidth=3, fontsize=20)
xcoords = ['1960-01-01', '1961-01-01','1962-01-01', '1963-01-01', '1964-01-01', '1965-01-01',
'1966-01-01', '1967-01-01', '1968-01-01', '1969-01-01']
for xc in xcoords:
    plt.axvline(x=xc, color='black', linestyle='--')
In [16]:
# Plot the autocorrelation
# This page explains what Panda's autocorrelation_plot function does: https://blog.finxter.com/pandas-plotting-autocorrelation/
autocorrelation_plot(data['Riders'])
plt.show()
# Since it has some values above the dotted lines at +/- 0.25, it indicates there is a trend.
In [19]:
# EXTRA: What does the autocorrelation_plot look like for the seasonal diff?
testing2 = data['Seasonal_1st_diff'].dropna()
autocorrelation_plot(testing2)
plt.show()
# Still a drop at the beginning, but a little bit more random looking
In [17]:
# EXTRA: What do the ACF and PACF plots from before look like for the seasonal diff?
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(testing2, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(testing2, lags=40, ax=ax2)
# The trend is still there
In [18]:
# Try drawing a lag plot
# Here is an explanation of the lag_plot in Python: https://pythontic.com/pandas/plotting/lag%20plot
pd.plotting.lag_plot(data['Riders'])
Out[18]:
<Axes: xlabel='y(t)', ylabel='y(t + 1)'>
In [20]:
# EXTRA: Try a lag plot with the seasonal shift data
pd.plotting.lag_plot(data['Seasonal_1st_diff'])
# It looks like a less tight correlation than the original data has
Out[20]:
<Axes: xlabel='y(t)', ylabel='y(t + 1)'>
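One more hedged variation that isn't in the linked tutorial: pandas' lag_plot takes a lag argument, so plotting against the 12-month lag should show the seasonal correlation directly:
In [ ]:
# Hedged sketch: lag plot at the seasonal lag of 12 months.
pd.plotting.lag_plot(data['Riders'], lag=12)
plt.show()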
Use SARIMAX Model¶
In [21]:
# Determine what p, d, and q should be for the arima model
stepwise_fit = auto_arima(data['Riders'], trace=True)
# It looks like p = 1, d = 1, q = 3
# One of the articles that I had looked at before for general information about the SARIMAX algorithm talked about p, d, and q:
# https://medium.com/swlh/a-brief-introduction-to-arima-and-sarima-modeling-in-python-87a58d375def
# p = autoregressive (AR) order: determines how many previous data points will be used
# d: how many times the data is to be differenced
# q = moving average (MA) order: determines how many previous forecast errors will be used
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=1235.799, Time=0.52 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=1236.989, Time=0.04 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=1238.600, Time=0.09 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=1238.277, Time=0.13 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=1236.254, Time=0.04 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=1234.903, Time=0.24 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=1233.580, Time=0.16 sec
 ARIMA(0,1,3)(0,0,0)[0] intercept   : AIC=1235.475, Time=0.19 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=1234.436, Time=0.21 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=1232.851, Time=0.43 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=inf, Time=0.46 sec
 ARIMA(1,1,4)(0,0,0)[0] intercept   : AIC=1234.568, Time=0.50 sec
 ARIMA(0,1,4)(0,0,0)[0] intercept   : AIC=1236.809, Time=0.23 sec
 ARIMA(2,1,4)(0,0,0)[0] intercept   : AIC=inf, Time=0.54 sec
 ARIMA(1,1,3)(0,0,0)[0]             : AIC=1238.599, Time=0.16 sec

Best model: ARIMA(1,1,3)(0,0,0)[0] intercept
Total fit time: 4.056 seconds
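Worth noting: the search above never tried seasonal terms, because auto_arima's seasonal period m defaults to 1. A hedged variant (slower to run) that lets it search seasonal orders at a 12-month period would be:
In [ ]:
# Hedged sketch: repeat the stepwise search with a 12-month seasonal period.
# seasonal=True is pmdarima's default, but seasonal terms are only searched when m > 1.
stepwise_fit_seasonal = auto_arima(data['Riders'], seasonal=True, m=12, trace=True)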
In [22]:
# Look at the metrics for the chosen ARIMA model
stepwise_fit.summary()
Out[22]:
| Dep. Variable: | y | No. Observations: | 114 |
|---|---|---|---|
| Model: | SARIMAX(1, 1, 3) | Log Likelihood | -610.425 |
| Date: | Sun, 31 Dec 2023 | AIC | 1232.851 |
| Time: | 19:56:25 | BIC | 1249.215 |
| Sample: | 01-01-1960 | HQIC | 1239.491 |
|  | - 06-01-1969 |  |  |
| Covariance Type: | opg |  |  |
|  | coef | std err | z | P>\|z\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| intercept | 13.2103 | 6.858 | 1.926 | 0.054 | -0.230 | 26.651 |
| ar.L1 | -0.9968 | 0.033 | -29.786 | 0.000 | -1.062 | -0.931 |
| ma.L1 | 0.9849 | 0.105 | 9.338 | 0.000 | 0.778 | 1.192 |
| ma.L2 | -0.3962 | 0.137 | -2.889 | 0.004 | -0.665 | -0.127 |
| ma.L3 | -0.3983 | 0.104 | -3.829 | 0.000 | -0.602 | -0.194 |
| sigma2 | 2854.6463 | 493.872 | 5.780 | 0.000 | 1886.675 | 3822.618 |
| Ljung-Box (L1) (Q): | 0.04 | Jarque-Bera (JB): | 6.17 |
|---|---|---|---|
| Prob(Q): | 0.85 | Prob(JB): | 0.05 |
| Heteroskedasticity (H): | 1.84 | Skew: | 0.57 |
| Prob(H) (two-sided): | 0.06 | Kurtosis: | 2.96 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [28]:
# Make the SARIMAX model
# NOTE: It makes sense to use a SARIMAX model here because we noticed seasonality in the plots,
# even though the auto_arima search above settled on a model with no seasonal terms.
model = sm.tsa.statespace.SARIMAX(data['Riders'], order=(1,1,3), seasonal_order = (1,1,3,12))
results = model.fit()
C:\Users\aabro\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
C:\Users\aabro\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
C:\Users\aabro\anaconda3\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:866: UserWarning: Too few observations to estimate starting parameters for seasonal ARMA. All parameters except for variances will be set to zeros.
  warn('Too few observations to estimate starting parameters%s.'
C:\Users\aabro\anaconda3\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
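The ValueWarnings are just about the index frequency. A hedged way to avoid them (assuming the monthly-start frequency statsmodels inferred is correct, which it is here) is to declare it explicitly before fitting:
In [ ]:
# Hedged sketch: set the monthly-start frequency explicitly so statsmodels
# doesn't have to infer it. This addresses the ValueWarning, not the
# convergence warning.
data.index.freq = 'MS'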
In [31]:
# Plot predicted vs actual for the last couple years
data['forecast'] = results.predict(start=90, end=114, dynamic=True)
data[['Riders', 'forecast']].plot(figsize=(12,8), linewidth=3, fontsize=20)
# The predicted values seem to be much higher than the actual values.
# There does appear to be a drop-off after 1967, though, so that makes some sense.
Out[31]:
<Axes: xlabel='Month'>
In [32]:
# Create roughly 2 years (23 months) of future dates
future_dates = [data.index[-1]+ DateOffset(months=x) for x in range(0,24)]
future_dates_df = pd.DataFrame(index=future_dates[1:], columns=data.columns)
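As an aside, statsmodels can build the out-of-sample index itself; a hedged sketch using get_forecast, which also returns confidence intervals (steps=23 chosen to match the future dates above):
In [ ]:
# Hedged sketch: get_forecast constructs the future index on its own and
# provides confidence intervals alongside the point forecasts.
fc = results.get_forecast(steps=23)
fc_mean = fc.predicted_mean  # point forecasts
fc_ci = fc.conf_int()        # lower/upper bounds per month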
In [33]:
# See what the last dates are
future_dates_df.tail()
# Looks like the forecast dates go to May 1971
Out[33]:
|  | Riders | Seasonal_1st_diff | forecast |
|---|---|---|---|
| 1971-01-01 | NaN | NaN | NaN |
| 1971-02-01 | NaN | NaN | NaN |
| 1971-03-01 | NaN | NaN | NaN |
| 1971-04-01 | NaN | NaN | NaN |
| 1971-05-01 | NaN | NaN | NaN |
In [34]:
# Combine the datasets
future_data = pd.concat([data, future_dates_df])
future_data
C:\Users\aabro\AppData\Local\Temp\ipykernel_35400\1093377636.py:1: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
  future_data = pd.concat([data, future_dates_df])
Out[34]:
|  | Riders | Seasonal_1st_diff | forecast |
|---|---|---|---|
| 1960-01-01 | 648 | NaN | NaN |
| 1960-02-01 | 646 | NaN | NaN |
| 1960-03-01 | 639 | NaN | NaN |
| 1960-04-01 | 654 | NaN | NaN |
| 1960-05-01 | 630 | NaN | NaN |
| ... | ... | ... | ... |
| 1971-01-01 | NaN | NaN | NaN |
| 1971-02-01 | NaN | NaN | NaN |
| 1971-03-01 | NaN | NaN | NaN |
| 1971-04-01 | NaN | NaN | NaN |
| 1971-05-01 | NaN | NaN | NaN |
137 rows × 3 columns
In [35]:
# Plotting the original data with the forecasted data
future_data['forecast'] = results.predict(start=113, end=128, dynamic=True)
future_data[['Riders', 'forecast']].plot(figsize=(12,8), linewidth=3, fontsize=20)
Out[35]:
<Axes: >
EXTRA: Play around with varying the parameters to the SARIMAX model¶
In [52]:
# Calculate error for the predicted results at the end
pred = results.predict(start=90, end=113, dynamic=True)
diff = (pred - data.iloc[90:114]['Riders'])
rse = sqrt((diff**2).sum())
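A quick hedged cross-check on that metric: rse here is the root of the *sum* of squared errors, while the more familiar RMSE is the root of the *mean*. The sklearn import at the top of the notebook gives the latter directly:
In [ ]:
# Hedged aside: RMSE (root of the mean squared error) over the same
# 24-point window, via the mean_squared_error import from earlier.
rmse = sqrt(mean_squared_error(data.iloc[90:114]['Riders'], pred))
print(rmse)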
In [66]:
# What is the error for the original parameter set
rse
Out[66]:
1077.1551721453272
In [ ]:
# See what the best parameters for sarimax are
best_rse = 1000000
best_p1 = 0
best_q1 = 0
best_p2 = 0
best_q2 = 0
for p1 in range(5):
    for q1 in range(5):
        for p2 in range(5):
            for q2 in range(5):
                print(f'Current parameters: p1 = {p1}, q1 = {q1}, p2 = {p2}, q2 = {q2}')
                model = sm.tsa.statespace.SARIMAX(data['Riders'], order=(p1,1,q1), seasonal_order=(p2,1,q2,12))
                results = model.fit()
                pred = results.predict(start=90, end=113, dynamic=True)
                diff = (pred - data.iloc[90:114]['Riders'])
                rse = sqrt((diff**2).sum())
                if (rse < best_rse):
                    best_p1 = p1
                    best_q1 = q1
                    best_p2 = p2
                    best_q2 = q2
                    best_rse = rse
                    print(f'New best RSE = {rse}')
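One hedged tweak for anyone re-running this search: the repeated fits emit a lot of warnings and can occasionally fail outright, so silencing and guarding each fit is worthwhile. A sketch of one guarded fit (reusing the loop variables as they stand after the search):
In [ ]:
# Hedged sketch: a quiet, failure-tolerant version of one fit from the loop above.
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    try:
        quiet_fit = sm.tsa.statespace.SARIMAX(
            data['Riders'], order=(p1, 1, q1), seasonal_order=(p2, 1, q2, 12)
        ).fit(disp=False)
    except Exception as e:
        print(f'Fit failed for p1={p1}, q1={q1}, p2={p2}, q2={q2}: {e}')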
In [71]:
print(f'Final parameters: p1 = {best_p1}, q1 = {best_q1}, p2 = {best_p2}, q2 = {best_q2}. Best RSE = {best_rse}')
Final parameters: p1 = 4, q1 = 1, p2 = 0, q2 = 0. Best RSE = 687.0577539896823
EXTRA: Use these new parameters to look at the plots¶
In [72]:
# Plotting the two forecasts
model2 = sm.tsa.statespace.SARIMAX(data['Riders'], order=(4,1,1), seasonal_order = (0,1,0,12))
results2 = model2.fit()
data['forecast2'] = results2.predict(start=90, end=114, dynamic=True)
data[['Riders', 'forecast', 'forecast2']].plot(figsize=(12,8), linewidth=3, fontsize=20)
# Forecast2 is a little bit less extreme than Forecast
C:\Users\aabro\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
C:\Users\aabro\anaconda3\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
Out[72]:
<Axes: xlabel='Month'>
In [73]:
# Plot the future forecasts from forecast and forecast2
future_data['forecast2'] = results2.predict(start=113, end=128, dynamic=True)
future_data[['Riders', 'forecast', 'forecast2']].plot(figsize=(12,8), linewidth=3, fontsize=20)
# I don't like how the forecast2 prediction seems to be decreasing, even though there has been a clear upward trend from 1960 to 1967.
Out[73]:
<Axes: >
In [ ]:
# I think the next steps would be to get some different data, see if SARIMAX is an appropriate model for the data, and then use the algorithm for prediction.