This notebook contains a SARIMAX (Seasonal ARIMA with eXogenous variables) analysis that predicts future ice cream sales. This analysis is performed on fabricated data I created.
Imports¶
In [1]:
from matplotlib import pyplot
import statsmodels.api as sm
import pandas as pd
from pandas.plotting import autocorrelation_plot
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings('ignore')
Ingest data¶
In [2]:
df = pd.read_excel('Ice Cream sales.xlsx')
df['Month Year']=pd.to_datetime(df['Month Year'])
df.set_index('Month Year', inplace=True)
df.head()
Out[2]:
| Sales | |
|---|---|
| Month Year | |
| 2010-01-01 | 2225 |
| 2010-02-01 | 1573 |
| 2010-03-01 | 1959 |
| 2010-04-01 | 2730 |
| 2010-05-01 | 4924 |
Plot data¶
In [3]:
df.plot()
Out[3]:
<Axes: xlabel='Month Year'>
Check for stationary¶
In [4]:
result=adfuller(df['Sales'])
labels = ['ADF test statistic','p value','# lags used','Number of observations used']
for value,label in zip(result,labels):
print(label+' : '+str(value) )
if result[1] <= 0.05:
print("Data is stationary.")
else:
print("Data is non-stationary.")
ADF test statistic : -2.8959521919599487 p value : 0.04582073132373142 # lags used : 11 Number of observations used : 108 Data is stationary.
Calculate highest autocorrelation lag¶
In [19]:
# Calculate autocorrelation
num_observations = len(df['Sales'])
data = np.asarray(df['Sales'])
mean = np.mean(data)
c0 = np.sum((data - mean) ** 2) / float(num_observations)
def r(h):
return ((data[:num_observations - h] - mean) * (data[h:] - mean)).sum() / float(num_observations) / c0
autocorr = [r(h) for h in range(num_observations)]
# Find the lag with the highest autocorrelation (excluding lag 0)
best_autocorr_lag = np.argmax(np.abs(autocorr[1:]))+1
highest_autocorr_value = autocorr[best_autocorr_lag]
# Print autocorrelations
print(f"Highest autocorrelation is at lag {best_autocorr_lag} with a value of: {round(highest_autocorr_value,3)} \n")
for i, value in enumerate(autocorr):
if i == 0 or i > best_autocorr_lag:
continue
print(f"Lag {i}: {round(value,3)}")
Highest autocorrelation is at lag 12 with a value of: 0.804 Lag 1: 0.696 Lag 2: 0.406 Lag 3: -0.039 Lag 4: -0.45 Lag 5: -0.68 Lag 6: -0.8 Lag 7: -0.637 Lag 8: -0.408 Lag 9: -0.009 Lag 10: 0.385 Lag 11: 0.654 Lag 12: 0.804
In [6]:
autocorrelation_plot(df['Sales'])
Out[6]:
<Axes: xlabel='Lag', ylabel='Autocorrelation'>
Create lagged column and access stationary of it¶
In [7]:
lagged_column = str(best_autocorr_lag) + " period lag"
df[lagged_column]=df['Sales']-df['Sales'].shift(best_autocorr_lag)
df.tail()
Out[7]:
| Sales | 12 period lag | |
|---|---|---|
| Month Year | ||
| 2019-08-01 | 6885 | -1119.0 |
| 2019-09-01 | 7095 | 2607.0 |
| 2019-10-01 | 5312 | -816.0 |
| 2019-11-01 | 2940 | 2.0 |
| 2019-12-01 | 2198 | -190.0 |
In [8]:
result=adfuller(df[lagged_column].dropna())
labels = ['ADF test statistic','p value','# lags used','Number of observations used']
for value,label in zip(result,labels):
print(label+' : '+str(value) )
if result[1] <= 0.05:
print("Data is stationary.")
else:
print("Data is non-stationary.")
ADF test statistic : -3.5817798917489236 p value : 0.006119064416567114 # lags used : 12 Number of observations used : 95 Data is stationary.
Create autocorrelation charts and determine p and q values¶
In [9]:
# Create plots
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
sm.graphics.tsa.plot_acf(df[lagged_column].dropna(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
sm.graphics.tsa.plot_pacf(df[lagged_column].dropna(), lags=40, ax=ax2)
plt.show()
In [10]:
# Calculate ACF and PACF values
acf_values, acf_confint = sm.tsa.acf(df[lagged_column].dropna(), nlags=40, alpha=0.05)
pacf_values, pacf_confint = sm.tsa.pacf(df[lagged_column].dropna(), nlags=40, alpha=0.05)
# Get value of q (first index where ACF value is between the ACF confidence intervals)
q = 0
length = len(acf_confint)
while q < length:
this_acf_value = acf_values[q]
this_acf_confint = acf_confint[q]
this_acf_confint_min = min(this_acf_confint)
this_acf_confint_max = max(this_acf_confint)
if this_acf_value > this_acf_confint_min and this_acf_value < this_acf_confint_max:
break
q += 1
# Get value of p (first index where PACF value is between the PACF confidence intervals)
p = 0
length = len(acf_confint)
while p < length:
this_pacf_value = pacf_values[p]
this_pacf_confint = pacf_confint[p]
this_pacf_confint_min = min(this_pacf_confint)
this_pacf_confint_max = max(this_pacf_confint)
if this_pacf_value > this_pacf_confint_min and this_pacf_value < this_pacf_confint_max:
break
p += 1
Create and fit SARIMAX model¶
In [11]:
model=sm.tsa.statespace.SARIMAX(df['Sales'],order=(p, 1, q),seasonal_order=(p, 1, q, best_autocorr_lag))
results=model.fit()
Forcast on observed data¶
In [12]:
full_data_set_size = df.shape[0]
last_20_percent_dataset = int(full_data_set_size*(1-0.2))
In [13]:
df['Forecast']=results.predict(start=last_20_percent_dataset,end=full_data_set_size,dynamic=True)
df['Forecast'] = df['Forecast'].round()
df[['Sales','Forecast']].plot(figsize=(12,8))
# Calculate Mean Absolute Error (MAE)
mae = np.mean(np.abs(df['Sales'] - df['Forecast']))
# Calculate Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((df['Sales'] - df['Forecast']) / df['Sales'])) * 100
plt.title(f'Sales and Forecast Comparison (MAE: {mae:.2f}, MAPE: {mape:.2f}%)', fontsize=14)
plt.xlabel("Month")
plt.ylabel("Sales")
Out[13]:
Text(0, 0.5, 'Sales')
Forcast future (unobserved) months¶
In [14]:
number_future_months = 18
# Create future dates data dataframe
from pandas.tseries.offsets import DateOffset
future_dates=[df.index[-1]+ DateOffset(months=x)for x in range(0,number_future_months)]
future_datest_df=pd.DataFrame(index=future_dates[1:],columns=df.columns)
In [15]:
# Apend above dataframe to original data
future_df=pd.concat([df,future_datest_df])
# Make predictions and visualise them
future_df['Forecast'] = results.predict(start = full_data_set_size, end = full_data_set_size+number_future_months, dynamic= True)
future_df[['Sales', 'Forecast']].plot(figsize=(12, 8))
plt.title(f'Future Sales Forecast', fontsize=14)
plt.xlabel("Month")
plt.ylabel("Sales")
Out[15]:
Text(0, 0.5, 'Sales')