This notebook contains a SARIMAX (Seasonal ARIMA with eXogenous variables) analysis that predicts future ice cream sales. This analysis is performed on fabricated data I created.

Imports¶

In [1]:
from matplotlib import pyplot
import statsmodels.api as sm
import pandas as pd
from pandas.plotting import autocorrelation_plot
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings('ignore')

Ingest data¶

In [2]:
df = pd.read_excel('Ice Cream sales.xlsx')
df['Month Year']=pd.to_datetime(df['Month Year'])
df.set_index('Month Year', inplace=True)

df.head()
Out[2]:
Sales
Month Year
2010-01-01 2225
2010-02-01 1573
2010-03-01 1959
2010-04-01 2730
2010-05-01 4924

Plot data¶

In [3]:
df.plot()
Out[3]:
<Axes: xlabel='Month Year'>
No description has been provided for this image

Check for stationary¶

In [4]:
result=adfuller(df['Sales'])

labels = ['ADF test statistic','p value','# lags used','Number of observations used']
for value,label in zip(result,labels):
    print(label+' : '+str(value) )

if result[1] <= 0.05:
    print("Data is stationary.")
else:
    print("Data is non-stationary.")
ADF test statistic : -2.8959521919599487
p value : 0.04582073132373142
# lags used : 11
Number of observations used : 108
Data is stationary.

Calculate highest autocorrelation lag¶

In [19]:
# Calculate autocorrelation
num_observations = len(df['Sales'])
data = np.asarray(df['Sales'])
mean = np.mean(data)
c0 = np.sum((data - mean) ** 2) / float(num_observations)

def r(h):
    return ((data[:num_observations - h] - mean) * (data[h:] - mean)).sum() / float(num_observations) / c0

autocorr = [r(h) for h in range(num_observations)]

# Find the lag with the highest autocorrelation (excluding lag 0)
best_autocorr_lag = np.argmax(np.abs(autocorr[1:]))+1
highest_autocorr_value = autocorr[best_autocorr_lag]

# Print autocorrelations
print(f"Highest autocorrelation is at lag {best_autocorr_lag} with a value of: {round(highest_autocorr_value,3)} \n")
for i, value in enumerate(autocorr):
    if i == 0 or i > best_autocorr_lag:
        continue
    print(f"Lag {i}: {round(value,3)}")
Highest autocorrelation is at lag 12 with a value of: 0.804 

Lag 1: 0.696
Lag 2: 0.406
Lag 3: -0.039
Lag 4: -0.45
Lag 5: -0.68
Lag 6: -0.8
Lag 7: -0.637
Lag 8: -0.408
Lag 9: -0.009
Lag 10: 0.385
Lag 11: 0.654
Lag 12: 0.804
In [6]:
autocorrelation_plot(df['Sales'])
Out[6]:
<Axes: xlabel='Lag', ylabel='Autocorrelation'>
No description has been provided for this image

Create lagged column and access stationary of it¶

In [7]:
lagged_column = str(best_autocorr_lag) + " period lag" 
df[lagged_column]=df['Sales']-df['Sales'].shift(best_autocorr_lag)
df.tail()
Out[7]:
Sales 12 period lag
Month Year
2019-08-01 6885 -1119.0
2019-09-01 7095 2607.0
2019-10-01 5312 -816.0
2019-11-01 2940 2.0
2019-12-01 2198 -190.0
In [8]:
result=adfuller(df[lagged_column].dropna())

labels = ['ADF test statistic','p value','# lags used','Number of observations used']
for value,label in zip(result,labels):
    print(label+' : '+str(value) )

if result[1] <= 0.05:
    print("Data is stationary.")
else:
    print("Data is non-stationary.")
ADF test statistic : -3.5817798917489236
p value : 0.006119064416567114
# lags used : 12
Number of observations used : 95
Data is stationary.

Create autocorrelation charts and determine p and q values¶

In [9]:
# Create plots
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
sm.graphics.tsa.plot_acf(df[lagged_column].dropna(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
sm.graphics.tsa.plot_pacf(df[lagged_column].dropna(), lags=40, ax=ax2)
plt.show()
No description has been provided for this image
In [10]:
# Calculate ACF and PACF values
acf_values, acf_confint = sm.tsa.acf(df[lagged_column].dropna(), nlags=40, alpha=0.05)
pacf_values, pacf_confint = sm.tsa.pacf(df[lagged_column].dropna(), nlags=40, alpha=0.05)

# Get value of q (first index where ACF value is between the ACF confidence intervals)
q = 0
length = len(acf_confint)
while q < length:
    this_acf_value = acf_values[q]
    this_acf_confint = acf_confint[q]
    this_acf_confint_min = min(this_acf_confint)
    this_acf_confint_max = max(this_acf_confint)

    if this_acf_value > this_acf_confint_min and this_acf_value < this_acf_confint_max:
        break

    q += 1

# Get value of p (first index where PACF value is between the PACF confidence intervals)
p = 0
length = len(acf_confint)
while p < length:
    this_pacf_value = pacf_values[p]
    this_pacf_confint = pacf_confint[p]
    this_pacf_confint_min = min(this_pacf_confint)
    this_pacf_confint_max = max(this_pacf_confint)

    if this_pacf_value > this_pacf_confint_min and this_pacf_value < this_pacf_confint_max:
        break

    p += 1

Create and fit SARIMAX model¶

In [11]:
model=sm.tsa.statespace.SARIMAX(df['Sales'],order=(p, 1, q),seasonal_order=(p, 1, q, best_autocorr_lag))
results=model.fit()

Forcast on observed data¶

In [12]:
full_data_set_size = df.shape[0]
last_20_percent_dataset = int(full_data_set_size*(1-0.2))
In [13]:
df['Forecast']=results.predict(start=last_20_percent_dataset,end=full_data_set_size,dynamic=True)
df['Forecast'] = df['Forecast'].round()
df[['Sales','Forecast']].plot(figsize=(12,8))

# Calculate Mean Absolute Error (MAE)
mae = np.mean(np.abs(df['Sales'] - df['Forecast']))

# Calculate Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((df['Sales'] - df['Forecast']) / df['Sales'])) * 100

plt.title(f'Sales and Forecast Comparison (MAE: {mae:.2f}, MAPE: {mape:.2f}%)', fontsize=14)
plt.xlabel("Month")
plt.ylabel("Sales")
Out[13]:
Text(0, 0.5, 'Sales')
No description has been provided for this image

Forcast future (unobserved) months¶

In [14]:
number_future_months = 18

# Create future dates data dataframe
from pandas.tseries.offsets import DateOffset
future_dates=[df.index[-1]+ DateOffset(months=x)for x in range(0,number_future_months)]
future_datest_df=pd.DataFrame(index=future_dates[1:],columns=df.columns)
In [15]:
# Apend above dataframe to original data
future_df=pd.concat([df,future_datest_df])

# Make predictions and visualise them
future_df['Forecast'] = results.predict(start = full_data_set_size, end = full_data_set_size+number_future_months, dynamic= True)  
future_df[['Sales', 'Forecast']].plot(figsize=(12, 8)) 

plt.title(f'Future Sales Forecast', fontsize=14)
plt.xlabel("Month")
plt.ylabel("Sales")
Out[15]:
Text(0, 0.5, 'Sales')
No description has been provided for this image