from matplotlib import pyplot
import statsmodels.api as sm
import pandas as pd
from pandas.plotting import autocorrelation_plot
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings('ignore')

df = pd.read_excel('Ice Cream sales.xlsx')
df['Month Year']=pd.to_datetime(df['Month Year'])
df.set_index('Month Year', inplace=True)

df.head()

df.plot()

<Axes: xlabel='Month Year'>

result=adfuller(df['Sales'])

labels = ['ADF test statistic','p value','# lags used','Number of observations used']
for value,label in zip(result,labels):
    print(label+' : '+str(value) )

if result[1] <= 0.05:
    print("Data is stationary.")
else:
    print("Data is non-stationary.")

ADF test statistic : -2.8959521919599487
p value : 0.04582073132373142
# lags used : 11
Number of observations used : 108
Data is stationary.

# Calculate autocorrelation
num_observations = len(df['Sales'])
data = np.asarray(df['Sales'])
mean = np.mean(data)
c0 = np.sum((data - mean) ** 2) / float(num_observations)

def r(h):
    return ((data[:num_observations - h] - mean) * (data[h:] - mean)).sum() / float(num_observations) / c0

autocorr = [r(h) for h in range(num_observations)]

# Find the lag with the highest autocorrelation (excluding lag 0)
best_autocorr_lag = np.argmax(np.abs(autocorr[1:]))+1
highest_autocorr_value = autocorr[best_autocorr_lag]

# Print autocorrelations
print(f"Highest autocorrelation is at lag {best_autocorr_lag} with a value of: {round(highest_autocorr_value,3)} \n")
for i, value in enumerate(autocorr):
    if i == 0 or i > best_autocorr_lag:
        continue
    print(f"Lag {i}: {round(value,3)}")

Highest autocorrelation is at lag 12 with a value of: 0.804 

Lag 1: 0.696
Lag 2: 0.406
Lag 3: -0.039
Lag 4: -0.45
Lag 5: -0.68
Lag 6: -0.8
Lag 7: -0.637
Lag 8: -0.408
Lag 9: -0.009
Lag 10: 0.385
Lag 11: 0.654
Lag 12: 0.804

autocorrelation_plot(df['Sales'])

<Axes: xlabel='Lag', ylabel='Autocorrelation'>

lagged_column = str(best_autocorr_lag) + " period lag" 
df[lagged_column]=df['Sales']-df['Sales'].shift(best_autocorr_lag)
df.tail()

result=adfuller(df[lagged_column].dropna())

labels = ['ADF test statistic','p value','# lags used','Number of observations used']
for value,label in zip(result,labels):
    print(label+' : '+str(value) )

if result[1] <= 0.05:
    print("Data is stationary.")
else:
    print("Data is non-stationary.")

ADF test statistic : -3.5817798917489236
p value : 0.006119064416567114
# lags used : 12
Number of observations used : 95
Data is stationary.

# Create plots
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
sm.graphics.tsa.plot_acf(df[lagged_column].dropna(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
sm.graphics.tsa.plot_pacf(df[lagged_column].dropna(), lags=40, ax=ax2)
plt.show()

# Calculate ACF and PACF values
acf_values, acf_confint = sm.tsa.acf(df[lagged_column].dropna(), nlags=40, alpha=0.05)
pacf_values, pacf_confint = sm.tsa.pacf(df[lagged_column].dropna(), nlags=40, alpha=0.05)

# Get value of q (first index where ACF value is between the ACF confidence intervals)
q = 0
length = len(acf_confint)
while q < length:
    this_acf_value = acf_values[q]
    this_acf_confint = acf_confint[q]
    this_acf_confint_min = min(this_acf_confint)
    this_acf_confint_max = max(this_acf_confint)

    if this_acf_value > this_acf_confint_min and this_acf_value < this_acf_confint_max:
        break

    q += 1

# Get value of p (first index where PACF value is between the PACF confidence intervals)
p = 0
length = len(acf_confint)
while p < length:
    this_pacf_value = pacf_values[p]
    this_pacf_confint = pacf_confint[p]
    this_pacf_confint_min = min(this_pacf_confint)
    this_pacf_confint_max = max(this_pacf_confint)

    if this_pacf_value > this_pacf_confint_min and this_pacf_value < this_pacf_confint_max:
        break

    p += 1

model=sm.tsa.statespace.SARIMAX(df['Sales'],order=(p, 1, q),seasonal_order=(p, 1, q, best_autocorr_lag))
results=model.fit()

full_data_set_size = df.shape[0]
last_20_percent_dataset = int(full_data_set_size*(1-0.2))

df['Forecast']=results.predict(start=last_20_percent_dataset,end=full_data_set_size,dynamic=True)
df['Forecast'] = df['Forecast'].round()
df[['Sales','Forecast']].plot(figsize=(12,8))

# Calculate Mean Absolute Error (MAE)
mae = np.mean(np.abs(df['Sales'] - df['Forecast']))

# Calculate Mean Absolute Percentage Error (MAPE)
mape = np.mean(np.abs((df['Sales'] - df['Forecast']) / df['Sales'])) * 100

plt.title(f'Sales and Forecast Comparison (MAE: {mae:.2f}, MAPE: {mape:.2f}%)', fontsize=14)
plt.xlabel("Month")
plt.ylabel("Sales")

Text(0, 0.5, 'Sales')

number_future_months = 18

# Create future dates data dataframe
from pandas.tseries.offsets import DateOffset
future_dates=[df.index[-1]+ DateOffset(months=x)for x in range(0,number_future_months)]
future_datest_df=pd.DataFrame(index=future_dates[1:],columns=df.columns)

# Apend above dataframe to original data
future_df=pd.concat([df,future_datest_df])

# Make predictions and visualise them
future_df['Forecast'] = results.predict(start = full_data_set_size, end = full_data_set_size+number_future_months, dynamic= True)  
future_df[['Sales', 'Forecast']].plot(figsize=(12, 8)) 

plt.title(f'Future Sales Forecast', fontsize=14)
plt.xlabel("Month")
plt.ylabel("Sales")

Text(0, 0.5, 'Sales')

	Sales
Month Year
2010-01-01	2225
2010-02-01	1573
2010-03-01	1959
2010-04-01	2730
2010-05-01	4924

	Sales	12 period lag
Month Year
2019-08-01	6885	-1119.0
2019-09-01	7095	2607.0
2019-10-01	5312	-816.0
2019-11-01	2940	2.0
2019-12-01	2198	-190.0

Imports¶

Ingest data¶

Plot data¶

Check for stationary¶

Calculate highest autocorrelation lag¶

Create lagged column and access stationary of it¶

Create autocorrelation charts and determine p and q values¶

Create and fit SARIMAX model¶

Forcast on observed data¶

Forcast future (unobserved) months¶