Below is a simple analysis of eleven Scottish Premier League seasons (2000/2001 to 2010/2011). In this notebook I filter, transform, and visualise data using R. The dataset, which I import from a CSV file, was retrieved from this URL: https://www.kaggle.com/datasets/schochastics/domestic-football-results-from-1888-to-2019
In [17]:
library(tidyverse)
In [2]:
# Load the raw match results from the CSV file (path is relative to the
# current working directory)
df <- read.csv(file = "football_results.csv")

Clean data set

In [3]:
# Parse the date column into Date objects, then keep only Scottish matches
# played from 1 July 2000 (inclusive) up to 1 June 2011 (exclusive), i.e. the
# eleven seasons 2000/2001 through 2010/2011.
df <- df %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
  filter(
    competition == "scotland",
    date >= as.Date("2000-07-01"),
    date < as.Date("2011-06-01")
  )
In [4]:
# Drop the columns that are not needed for the analysis.
# Names listed here that are absent from df are simply ignored, and
# drop = FALSE guarantees the result is still a data frame even if only a
# single column survives (without it, `[` would collapse to a bare vector).
df <- df[, !(names(df) %in% c("level",
                              "continent",
                              "full_time",
                              "home_continent",
                              "away_continent",
                              "away_code",
                              "home_code",
                              "competition",
                              "home_ident",
                              "away_ident",
                              "home_country",
                              "away_country")), drop = FALSE]
In [5]:
# Give the remaining columns descriptive names
# (new name on the left, existing column on the right)
df <- rename(
  df,
  Home_Team = home,
  Away_Team = away,
  Game_Date = date,
  Home_Goals = gh,
  Away_Goals = ga
)

Add columns

In [6]:
# Derive the weekday name of every match from its date
# (weekdays() returns the names in the session's locale)
df[["Day_of_Week"]] <- weekdays(as.Date(df[["Game_Date"]]))
In [7]:
# Total goals per match: home goals plus away goals
df$Total_Goals <- with(df, Home_Goals + Away_Goals)
In [8]:
# Classify each match from the home side's point of view:
# "W" = home win, "L" = home loss, "D" = draw
df$Home_Win_Loss_Draw <- with(
  df,
  ifelse(
    Home_Goals == Away_Goals, "D",
    ifelse(Home_Goals > Away_Goals, "W", "L")
  )
)
In [9]:
# Preview the first six rows of the cleaned data
head(df, n = 6L)
Home_Team Away_Team Game_Date Home_Goals Away_Goals Day_of_Week Total_Goals Home_Win_Loss_Draw
Rangers FC St Johnstone FC 2000-07-29 2 1 Saturday 3 W
Dunfermline Athletic Aberdeen FC 2000-07-29 0 0 Saturday 0 D
Motherwell FC Dundee FC 2000-07-29 0 2 Saturday 2 L
St Mirren FC Kilmarnock FC 2000-07-29 0 1 Saturday 1 L
Heart Of Midlothian Hibernian FC 2000-07-30 0 0 Sunday 0 D
Dundee United Celtic FC 2000-07-30 1 2 Sunday 3 L

Conduct analysis

Home vs away goal differences

In [10]:
# Mean goals scored by the home and away sides, rounded to two decimals
avg_goals_home <- round(mean(df[["Home_Goals"]]), digits = 2)
avg_goals_away <- round(mean(df[["Away_Goals"]]), digits = 2)
print(paste("The average number of home goals was:", avg_goals_home))
print(paste("The average number of away goals was:", avg_goals_away))

# Report which side scored more, judged on the rounded averages
if (avg_goals_home == avg_goals_away) {
  print("The home and away teams scored the same number of goals on average.")
} else if (avg_goals_home > avg_goals_away) {
  print("The home team scored more on average.")
} else {
  print("The away team scored more on average.")
}

# Paired t-test: every match contributes one home score and one away score
ttest_result <- t.test(df$Home_Goals, df$Away_Goals, paired = TRUE)
ttest_result
p_value <- ttest_result[["p.value"]]

# Interpret the p-value at the 5% significance level
if (p_value <= 0.05) {
  print("There is a signficant difference between these averages.")
} else {
  print("There is not a signficant difference between these averages.")
}
[1] "The average number of home goals was: 1.48"
[1] "The average number of away goals was: 1.17"
[1] "The home team scored more on average."
	Paired t-test

data:  df$Home_Goals and df$Away_Goals
t = 8.6563, df = 2507, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.2442534 0.3873255
sample estimates:
mean of the differences 
              0.3157895 
[1] "There is a signficant difference between these averages."

Maximum goal figures (total, home, away)

In [11]:
# Largest goal counts seen in a single match: combined, home side, away side
max_goals <- max(df[["Total_Goals"]])
max_goals_home <- max(df[["Home_Goals"]])
max_goals_away <- max(df[["Away_Goals"]])

# Print one summary sentence per figure
print(paste("The highest scoring game contained", max_goals, "goals."))
print(paste("The most goals scored by the home team was", max_goals_home, "goals."))
print(paste("The most goals scored by the away team was", max_goals_away, "goals."))
[1] "The highest scoring game contained 12 goals."
[1] "The most goals scored by the home team was 9 goals."
[1] "The most goals scored by the away team was 8 goals."

Average number of home team goals (top 10)

In [12]:
# Mean goals scored per home match for each team, highest average first
average_home_goals <- df %>%
  group_by(Home_Team) %>%
  summarise(Average_Home_Goals = mean(Home_Goals)) %>%
  arrange(desc(Average_Home_Goals))

# Show the ten teams with the highest home average
head(average_home_goals, n = 10)
Home_Team Average_Home_Goals
Celtic FC 2.483254
Rangers FC 2.392344
Hibernian FC 1.602871
Heart Of Midlothian 1.574163
Motherwell FC 1.461538
Kilmarnock FC 1.379808
Aberdeen FC 1.317536
Inverness Caledonian Thistle FC 1.263158
Dundee FC 1.252632
Partick Thistle 1.236842

Average number of away team goals (top 10)

In [13]:
# Mean goals scored per away match for each team, highest average first
average_away_goals <- df %>%
  group_by(Away_Team) %>%
  summarise(Average_Away_Goals = mean(Away_Goals)) %>%
  arrange(desc(Average_Away_Goals))

# Show the ten teams with the highest away average
head(average_away_goals, n = 10)
Away_Team Average_Away_Goals
Celtic FC 2.081340
Rangers FC 1.779904
Hibernian FC 1.143541
Inverness Caledonian Thistle FC 1.140351
Dundee FC 1.136842
Dundee United 1.133971
Heart Of Midlothian 1.090909
Falkirk FC 1.085106
Kilmarnock FC 1.033333
Livingston FC 1.020833

Games by day of week

In [14]:
# Count the matches played on each weekday, busiest day first
games_per_day <- df %>%
  group_by(Day_of_Week) %>%
  summarise(Total_Games_Played = n()) %>%
  arrange(desc(Total_Games_Played))

games_per_day
Day_of_Week Total_Games_Played
Saturday 1691
Sunday 443
Wednesday 207
Tuesday 102
Monday 44
Thursday 20
Friday 1
In [15]:
# Upper y-axis limit: the tallest bar plus 19%, which leaves headroom for the
# data labels drawn above the bars
max_value <- max(games_per_day$Total_Games_Played) * 1.19

# One heat colour per bar, chosen from each bar's own rank: the busiest day
# gets the first (reddest) palette entry and tied days share a colour.
# Ranking the negated counts ties the colour to the value itself, so the
# mapping stays correct regardless of row order (reversing the already
# indexed colour vector only worked for rows sorted in descending order
# without ties).
colors <- heat.colors(nrow(games_per_day))[
  rank(-games_per_day$Total_Games_Played, ties.method = "min")
]

# Create the bar plot; barplot() returns the bar midpoints, kept in bp so the
# labels below can be centred on each bar
bp <- barplot(games_per_day$Total_Games_Played,
              names.arg = games_per_day$Day_of_Week,
              ylab = "Number of Games",
              main = "Number of games by day of week",
              col = colors,
              las = 2,
              cex.names = 0.9,
              cex.axis = 0.8,
              ylim = c(0, max_value)
             )

# Add data labels above the bars (pos = 3 places the text above the point)
text(x = bp, y = games_per_day$Total_Games_Played, labels = games_per_day$Total_Games_Played, pos = 3)

Portion of home games won, drawn, and lost

In [16]:
# Count home wins, draws and losses; the vector names come from the names of
# the result codes being looked up
win_loss_draw <- vapply(
  c(Home_Wins = "W", Home_Draws = "D", Home_Losses = "L"),
  function(result_code) sum(df$Home_Win_Loss_Draw == result_code),
  integer(1)
)

# Display labels keyed by the internal vector names
label_names <- c(
  "Home_Wins" = "Home Wins",
  "Home_Draws" = "Home Draws",
  "Home_Losses" = "Home Losses"
)

# Total number of matches, used as the denominator for the percentages
total_count <- sum(win_loss_draw)

# Pie chart whose slice labels show the name, the count and the share in percent
pie(
  win_loss_draw,
  main = "Portion of home games won, drawn, and lost",
  col = c("green", "yellow", "red"),
  labels = sprintf(
    "%s: %d (%.1f%%)",
    label_names[names(win_loss_draw)],
    win_loss_draw,
    (win_loss_draw / total_count) * 100
  ),
  cex = 0.8
)