library(tidyverse)
# Load the raw results, parse the ISO-formatted date column, and keep only the
# rows whose competition is "scotland" and whose date falls in the window
# 2000-07-01 (inclusive) to 2011-06-01 (exclusive). That window spans the
# 2000/2001 through 2010/2011 seasons, i.e. eleven seasons.
df <- read.csv("football_results.csv") %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
  filter(
    competition == "scotland",
    date >= as.Date("2000-07-01"),
    date < as.Date("2011-06-01")
  )
# Drop the columns that the analysis never uses. Selecting the remaining
# columns by name is a no-op for any listed column that is not present.
df <- df[, setdiff(
  names(df),
  c(
    "level",
    "continent",
    "full_time",
    "home_continent",
    "away_continent",
    "away_code",
    "home_code",
    "competition",
    "home_ident",
    "away_ident",
    "home_country",
    "away_country"
  )
)]
# Give the remaining columns descriptive names (new name = old name).
df <- rename(
  df,
  Home_Team = home,
  Away_Team = away,
  Game_Date = date,
  Home_Goals = gh,
  Away_Goals = ga
)
# Derive the per-match helper columns in a single step:
# - Day_of_Week: weekday name of the match. Game_Date is already a Date
#   (parsed right after the file is read), so no second conversion is needed.
#   Note that weekdays() returns names in the session's locale.
# - Total_Goals: goals scored by both sides combined.
# - Home_Win_Loss_Draw: result from the home side's point of view,
#   "W" (win), "L" (loss) or "D" (draw). No fallback branch is given on
#   purpose: a match with a missing score stays NA instead of being
#   mislabelled as a draw.
df <- df %>%
  mutate(
    Day_of_Week = weekdays(Game_Date),
    Total_Goals = Home_Goals + Away_Goals,
    Home_Win_Loss_Draw = case_when(
      Home_Goals > Away_Goals ~ "W",
      Home_Goals < Away_Goals ~ "L",
      Home_Goals == Away_Goals ~ "D"
    )
  )
head(df)
Home_Team | Away_Team | Game_Date | Home_Goals | Away_Goals | Day_of_Week | Total_Goals | Home_Win_Loss_Draw |
---|---|---|---|---|---|---|---|
Rangers FC | St Johnstone FC | 2000-07-29 | 2 | 1 | Saturday | 3 | W |
Dunfermline Athletic | Aberdeen FC | 2000-07-29 | 0 | 0 | Saturday | 0 | D |
Motherwell FC | Dundee FC | 2000-07-29 | 0 | 2 | Saturday | 2 | L |
St Mirren FC | Kilmarnock FC | 2000-07-29 | 0 | 1 | Saturday | 1 | L |
Heart Of Midlothian | Hibernian FC | 2000-07-30 | 0 | 0 | Sunday | 0 | D |
Dundee United | Celtic FC | 2000-07-30 | 1 | 2 | Sunday | 3 | L |
# Home vs away goal figures
# Keep the exact means for the comparison; the 2-decimal versions are for
# display only. Deciding on the rounded values would report "the same" for any
# real difference smaller than the rounding step.
mean_goals_home <- mean(df$Home_Goals)
mean_goals_away <- mean(df$Away_Goals)
avg_goals_home <- round(mean_goals_home, 2)
avg_goals_away <- round(mean_goals_away, 2)
print(paste("The average number of home goals was:", avg_goals_home))
print(paste("The average number of away goals was:", avg_goals_away))
# Test for equality first, with a floating-point tolerance rather than `==`,
# so representation noise cannot masquerade as a genuine difference.
if (isTRUE(all.equal(mean_goals_home, mean_goals_away))) {
  print("The home and away teams scored the same number of goals on average.")
} else if (mean_goals_home > mean_goals_away) {
  print("The home team scored more on average.")
} else {
  print("The away team scored more on average.")
}
# Statistical significance testing
# Paired t-test: each match contributes one home/away pair of goal counts.
ttest_result <- t.test(df$Home_Goals, df$Away_Goals, paired = TRUE)
ttest_result
# Report the verdict against a 0.05 threshold on the p-value.
p_value <- ttest_result$p.value
if (p_value <= 0.05) {
  print("There is a signficant difference between these averages.")
} else {
  print("There is not a signficant difference between these averages.")
}
[1] "The average number of home goals was: 1.48"
[1] "The average number of away goals was: 1.17"
[1] "The home team scored more on average."
    Paired t-test

data:  df$Home_Goals and df$Away_Goals
t = 8.6563, df = 2507, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.2442534 0.3873255
sample estimates:
mean of the differences
              0.3157895
[1] "There is a signficant difference between these averages."
# Record-high goal counts: per match overall, then for the home side and the
# away side separately. Each figure is computed and reported in turn.
max_goals <- max(df$Total_Goals)
print(
  paste("The highest scoring game contained", max_goals, "goals.")
)

max_goals_home <- max(df$Home_Goals)
print(
  paste("The most goals scored by the home team was", max_goals_home, "goals.")
)

max_goals_away <- max(df$Away_Goals)
print(
  paste("The most goals scored by the away team was", max_goals_away, "goals.")
)
[1] "The highest scoring game contained 12 goals."
[1] "The most goals scored by the home team was 9 goals."
[1] "The most goals scored by the away team was 8 goals."
# Mean goals scored at home by each team, highest first; show the top ten.
average_home_goals <- df %>%
  group_by(Home_Team) %>%
  summarise(Average_Home_Goals = mean(Home_Goals)) %>%
  arrange(desc(Average_Home_Goals))
head(average_home_goals, 10)
Home_Team | Average_Home_Goals |
---|---|
Celtic FC | 2.483254 |
Rangers FC | 2.392344 |
Hibernian FC | 1.602871 |
Heart Of Midlothian | 1.574163 |
Motherwell FC | 1.461538 |
Kilmarnock FC | 1.379808 |
Aberdeen FC | 1.317536 |
Inverness Caledonian Thistle FC | 1.263158 |
Dundee FC | 1.252632 |
Partick Thistle | 1.236842 |
# Mean goals scored away from home by each team, highest first; show the top
# ten.
average_away_goals <- df %>%
  group_by(Away_Team) %>%
  summarise(Average_Away_Goals = mean(Away_Goals)) %>%
  arrange(desc(Average_Away_Goals))
head(average_away_goals, 10)
Away_Team | Average_Away_Goals |
---|---|
Celtic FC | 2.081340 |
Rangers FC | 1.779904 |
Hibernian FC | 1.143541 |
Inverness Caledonian Thistle FC | 1.140351 |
Dundee FC | 1.136842 |
Dundee United | 1.133971 |
Heart Of Midlothian | 1.090909 |
Falkirk FC | 1.085106 |
Kilmarnock FC | 1.033333 |
Livingston FC | 1.020833 |
# Number of matches played on each weekday, busiest day first.
games_per_day <- df %>%
  group_by(Day_of_Week) %>%
  summarise(Total_Games_Played = n()) %>%
  arrange(desc(Total_Games_Played))
games_per_day
Day_of_Week | Total_Games_Played |
---|---|
Saturday | 1691 |
Sunday | 443 |
Wednesday | 207 |
Tuesday | 102 |
Monday | 44 |
Thursday | 20 |
Friday | 1 |
# Bar chart of matches per weekday, coloured by how many games each day has.

# Upper y-axis limit: the tallest bar plus 19% headroom, so the data label
# drawn above that bar stays inside the plot area.
max_value <- max(games_per_day$Total_Games_Played) * 1.19

# One colour per bar. The palette is reversed FIRST and then looked up by each
# bar's rank, so the day with the most games gets the first (red) heat colour
# and the day with the fewest gets the last (palest) one. Reversing after the
# rank lookup would flip the bar positions instead of the palette, which is
# only correct for strictly descending, tie-free counts.
colors <- rev(heat.colors(nrow(games_per_day)))[
  rank(games_per_day$Total_Games_Played, ties.method = "min")
]

# Create the bar plot; barplot() returns the x midpoints of the bars, which
# are reused below to position the labels.
bp <- barplot(
  games_per_day$Total_Games_Played,
  names.arg = games_per_day$Day_of_Week,
  ylab = "Number of Games",
  main = "Number of games by day of week",
  col = colors,
  las = 2,
  cex.names = 0.9,
  cex.axis = 0.8,
  ylim = c(0, max_value)
)

# Add data labels above the bars (pos = 3 places the text above the point).
text(
  x = bp,
  y = games_per_day$Total_Games_Played,
  labels = games_per_day$Total_Games_Played,
  pos = 3
)
# Pie chart of home results.

# Count home wins, draws and losses into a named vector.
win_loss_draw <- with(
  df,
  c(
    Home_Wins = sum(Home_Win_Loss_Draw == "W"),
    Home_Draws = sum(Home_Win_Loss_Draw == "D"),
    Home_Losses = sum(Home_Win_Loss_Draw == "L")
  )
)

# Display text for each slice, keyed by the names used in win_loss_draw.
label_names <- c(
  "Home_Wins" = "Home Wins",
  "Home_Draws" = "Home Draws",
  "Home_Losses" = "Home Losses"
)

# Denominator for the percentage shown on each slice.
total_count <- sum(win_loss_draw)

# Each slice is labelled "<name>: <count> (<share>%)".
pie(
  win_loss_draw,
  main = "Portion of home games won, drawn, and lost",
  col = c("green", "yellow", "red"),
  labels = sprintf(
    "%s: %d (%.1f%%)",
    label_names[names(win_loss_draw)],
    win_loss_draw,
    (win_loss_draw / total_count) * 100
  ),
  cex = 0.8
)