import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import ttest_rel
from pandasql import sqldf
import matplotlib.pyplot as plt
iter_csv = pd.read_csv(r'football_results.csv', iterator=True, chunksize=1000)
df = pd.concat([chunk[chunk['competition'] == 'england'] for chunk in iter_csv])
# Drop columns
df = df.drop(['level',
'continent',
'full_time',
'home_continent',
'away_continent',
'away_code',
'home_code',
'competition',
'home_ident',
'away_ident'],
axis=1)
# Rename columns
df = df.rename(columns={
'home': 'Home_Team',
'away': 'Away_Team',
'date': 'Game_Date',
'gh': 'Home_Goals',
'ga': 'Away_Goals',
'home_country': 'Home_Country',
'away_country': 'Away_Country'
})
# Change case
df['Home_Country'] = df['Home_Country'].str.title()
df['Away_Country'] = df['Away_Country'].str.title()
# Filter out non-1900s data
df["Game_Date"] = pd.to_datetime(df["Game_Date"], format="%Y-%m-%d")
df = df[(df['Game_Date'] >= '1900-01-01') & (df['Game_Date'] < '2000-01-01')]
# Add day of week column
df['Game_Day_of_Week'] = df['Game_Date'].dt.day_name()
# Add total goals column
df['Total_Goals'] = df['Home_Goals'] + df['Away_Goals']
# Add goal difference column
df['Home_Goal_Difference'] = df['Home_Goals'] - df['Away_Goals']
# Add column categorising games as win (W), lose (L), draw (D)
df['Home_Win/Loss/Draw'] = np.where(
df['Home_Goal_Difference'] > 0, 'W',
np.where(df['Home_Goal_Difference'] < 0, 'L',
'D')
)
# Add decade column
df['Decade'] = df['Game_Date'].astype(str).str[3:4] + "0s"
df.head()
| Home_Team | Away_Team | Game_Date | Home_Goals | Away_Goals | Home_Country | Away_Country | Game_Day_of_Week | Total_Goals | Home_Goal_Difference | Home_Win/Loss/Draw | Decade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3427 | Blackburn Rovers | Burnley FC | 1900-01-01 | 2 | 0 | England | England | Monday | 2 | 2 | W | 00s |
| 3428 | Bury FC | Aston Villa | 1900-01-01 | 2 | 0 | England | England | Monday | 2 | 2 | W | 00s |
| 3429 | Everton FC | Preston North End | 1900-01-01 | 1 | 0 | England | England | Monday | 1 | 1 | W | 00s |
| 3430 | Sunderland Afc | Wolverhampton Wanderers | 1900-01-01 | 1 | 2 | England | England | Monday | 3 | -1 | L | 00s |
| 3432 | Bury FC | Notts County | 1900-01-06 | 0 | 1 | England | England | Saturday | 1 | -1 | L | 00s |
games_per_day = df.groupby(['Game_Day_of_Week'])['Home_Team'].count().reset_index()
games_per_day = games_per_day.sort_values(['Home_Team'], ascending = [False]).reset_index(drop=True)
games_per_day = games_per_day.rename(columns={'Home_Team': 'Total_Games_Played'})
games_per_day
| Game_Day_of_Week | Total_Games_Played | |
|---|---|---|
| 0 | Saturday | 29391 |
| 1 | Wednesday | 3120 |
| 2 | Monday | 2667 |
| 3 | Tuesday | 1868 |
| 4 | Friday | 852 |
| 5 | Sunday | 472 |
| 6 | Thursday | 403 |
goals_per_decade = df.groupby(['Decade'])['Total_Goals'].mean().reset_index()
goals_per_decade = goals_per_decade.rename(columns={'Total_Goals': 'Avg_Goals_Scored'})
goals_per_decade = goals_per_decade.style.highlight_max(subset=['Avg_Goals_Scored'], color='#85e62c').highlight_min(subset=['Avg_Goals_Scored'], color='#ff6242')
goals_per_decade
| Decade | Avg_Goals_Scored | |
|---|---|---|
| 0 | 00s | 3.018011 |
| 1 | 10s | 2.935458 |
| 2 | 20s | 2.927558 |
| 3 | 30s | 2.910470 |
| 4 | 40s | 3.008658 |
| 5 | 50s | 3.035859 |
| 6 | 60s | 3.113355 |
| 7 | 70s | 3.045364 |
| 8 | 80s | 3.010491 |
| 9 | 90s | 2.968370 |
highest_scoring_game = df.groupby(['Decade'])['Total_Goals'].max().reset_index()
highest_scoring_game = highest_scoring_game.rename(columns={'Total_Goals': 'Highest_Scoring_Game'})
highest_scoring_game = highest_scoring_game.style.highlight_max(subset=['Highest_Scoring_Game'], color='#85e62c').highlight_min(subset=['Highest_Scoring_Game'], color='#ff6242')
highest_scoring_game
| Decade | Highest_Scoring_Game | |
|---|---|---|
| 0 | 00s | 12 |
| 1 | 10s | 12 |
| 2 | 20s | 12 |
| 3 | 30s | 13 |
| 4 | 40s | 12 |
| 5 | 50s | 12 |
| 6 | 60s | 13 |
| 7 | 70s | 13 |
| 8 | 80s | 14 |
| 9 | 90s | 12 |
ten_goals = df[df['Total_Goals']>=10]
ten_goals = ten_goals.groupby(['Decade'])['Total_Goals'].count().reset_index()
ten_goals = ten_goals.rename(columns={'Total_Goals': 'Games_with_10+_Goals'})
ten_goals = ten_goals.style.highlight_max(subset=['Games_with_10+_Goals'], color='#85e62c').highlight_min(subset=['Games_with_10+_Goals'], color='#ff6242')
ten_goals
| Decade | Games_with_10+_Goals | |
|---|---|---|
| 0 | 00s | 16 |
| 1 | 10s | 5 |
| 2 | 20s | 17 |
| 3 | 30s | 9 |
| 4 | 40s | 10 |
| 5 | 50s | 14 |
| 6 | 60s | 12 |
| 7 | 70s | 14 |
| 8 | 80s | 12 |
| 9 | 90s | 12 |
home_team_goals = df.groupby(['Home_Team'])['Home_Goals'].mean().reset_index()
home_team_goals = home_team_goals.sort_values(['Home_Goals'], ascending = [False]).reset_index(drop=True)
home_team_goals = home_team_goals.rename(columns={'Home_Goals': 'Avg_Home_Goals'})
home_team_goals.head(10)
| Home_Team | Avg_Home_Goals | |
|---|---|---|
| 0 | Wolverhampton Wanderers | 2.097536 |
| 1 | Liverpool FC | 2.048585 |
| 2 | Blackburn Rovers | 2.013118 |
| 3 | Aston Villa | 2.006418 |
| 4 | Sunderland Afc | 1.958665 |
| 5 | Tottenham Hotspur | 1.946456 |
| 6 | Newcastle United | 1.945375 |
| 7 | Derby County | 1.944223 |
| 8 | Burnley FC | 1.939502 |
| 9 | Portsmouth FC | 1.939450 |
home_team_goals= home_team_goals = home_team_goals.sort_values(['Avg_Home_Goals'], ascending = [True]).reset_index(drop=True)
home_team_goals.head(10)
| Home_Team | Avg_Home_Goals | |
|---|---|---|
| 0 | Glossop North End | 1.000000 |
| 1 | Leyton Orient | 1.047619 |
| 2 | Carlisle United | 1.047619 |
| 3 | Crystal Palace | 1.170732 |
| 4 | Swindon Town | 1.190476 |
| 5 | Brighton Hove Albion | 1.309524 |
| 6 | Barnsley FC | 1.315789 |
| 7 | Millwall FC | 1.315789 |
| 8 | Norwich City | 1.382567 |
| 9 | Afc Wimbledon | 1.404040 |
away_team_goals = df.groupby(['Away_Team'])['Away_Goals'].mean().reset_index()
away_team_goals = away_team_goals.sort_values(['Away_Goals'], ascending = [False]).reset_index(drop=True)
away_team_goals = away_team_goals.rename(columns={'Away_Goals': 'Avg_Away_Goals'})
away_team_goals.head(10)
| Away_Team | Avg_Away_Goals | |
|---|---|---|
| 0 | Manchester United | 1.446909 |
| 1 | Arsenal FC | 1.298349 |
| 2 | Wolverhampton Wanderers | 1.282922 |
| 3 | Huddersfield Town | 1.273016 |
| 4 | Blackpool FC | 1.257496 |
| 5 | Liverpool FC | 1.249384 |
| 6 | Brentford FC | 1.247619 |
| 7 | Aston Villa | 1.233333 |
| 8 | Portsmouth FC | 1.233028 |
| 9 | Sunderland Afc | 1.231629 |
away_team_goals = away_team_goals.sort_values(['Avg_Away_Goals'], ascending = [True]).reset_index(drop=True)
away_team_goals.head(10)
| Away_Team | Avg_Away_Goals | |
|---|---|---|
| 0 | Glossop North End | 0.428571 |
| 1 | Barnsley FC | 0.631579 |
| 2 | Leyton Orient | 0.714286 |
| 3 | Brighton Hove Albion | 0.857143 |
| 4 | Crystal Palace | 0.882114 |
| 5 | Notts County | 0.886836 |
| 6 | Oldham Athletic | 0.900826 |
| 7 | Coventry City | 0.909910 |
| 8 | Stoke City | 0.922986 |
| 9 | Bristol City | 0.944134 |
average_home_goals = df['Home_Goals'].mean()
average_away_goals = df['Away_Goals'].mean()
print("The average number of home goals was: " + str(round(average_home_goals,2)) + "\nThe average number of away goals was: " + str(round(average_away_goals,2)))
print('Away teams scored more on average.' if average_home_goals < average_away_goals else 'Home teams score more on average.\n')
t_stat, p_val = ttest_rel(df['Home_Goals'], df['Away_Goals'])
print("T value: " + str(round(t_stat,2)) + ", P-value: " + str(p_val))
print('The difference is statistically significant.' if p_val < 0.05 else 'The difference is not statistically significant')
The average number of home goals was: 1.83 The average number of away goals was: 1.16 Home teams score more on average. T value: 71.73, P-value: 0.0 The difference is statistically significant.
home_wins = len(df[df['Home_Win/Loss/Draw']=='W'])
home_draws = len(df[df['Home_Win/Loss/Draw']=='D'])
home_losses = len(df[df['Home_Win/Loss/Draw']=='L'])
home_all_games = df.shape[0]
home_game_wins_percent = round(home_wins/home_all_games,3)*100
home_game_draw_percent = round(home_draws/home_all_games,3)*100
home_game_loss_percent = round(home_losses/home_all_games,3)*100
print(str(home_wins) + " home games resulted in a win.")
print(str(home_draws) + " home games resulted in a draw.")
print(str(home_losses) + " home games resulted in a loss.\n")
print(str(home_game_wins_percent) + "% of home games resulted in a win.")
print(str(home_game_draw_percent) + "% of home games resulted in a draw.")
print(str(home_game_loss_percent) + "% of home games resulted in a loss.")
19929 home games resulted in a win. 9644 home games resulted in a draw. 9200 home games resulted in a loss. 51.4% of home games resulted in a win. 24.9% of home games resulted in a draw. 23.7% of home games resulted in a loss.
games_in_wales = df[df["Home_Country"]=="Wales"].count()["Home_Country"]
games_in_england = df[df["Home_Country"]=="England"].count()["Home_Country"]
total_games = games_in_wales + games_in_england
print(str(games_in_england) + " of the " + str(total_games) + " games were played in England (" + str(round(games_in_england/total_games*100,1)) + "% of all games).\n" + str(games_in_wales) + " of the " + str(total_games) + " where played in Wales (" + str(round(games_in_wales/total_games*100,1)) + "% of all games).")
38416 of the 38773 games were played in England (99.1% of all games). 357 of the 38773 where played in Wales (0.9% of all games).
pysqldf = lambda q: sqldf(q, globals())
# Find last home appearance date and removes top 20 rows (which were the 20 Premier League teams in the most recent year in the dataset)
last_home_appeareance = pysqldf("""
SELECT
Home_Team,
substr(MAX(Game_Date), 1, 10) as Last_Home_Appeareance
FROM df
GROUP BY Home_Team
ORDER BY Last_Home_Appeareance DESC
LIMIT -1 OFFSET 20
""")
last_home_appeareance
| Home_Team | Last_Home_Appeareance | |
|---|---|---|
| 0 | Nottingham Forest | 1999-05-16 |
| 1 | Charlton Athletic | 1999-05-16 |
| 2 | Blackburn Rovers | 1999-05-12 |
| 3 | Crystal Palace | 1998-05-10 |
| 4 | Barnsley FC | 1998-05-10 |
| 5 | Bolton Wanderers | 1998-05-02 |
| 6 | Manchester City | 1996-05-04 |
| 7 | Queens Park Rangers | 1996-04-27 |
| 8 | Norwich City | 1995-05-14 |
| 9 | Ipswich Town | 1995-05-09 |
| 10 | Swindon Town | 1994-05-07 |
| 11 | Oldham Athletic | 1994-05-05 |
| 12 | Sheffield United | 1994-04-30 |
| 13 | Notts County | 1992-05-02 |
| 14 | Luton Town | 1992-04-25 |
| 15 | Millwall FC | 1990-05-05 |
| 16 | Wimbledon | 1989-12-30 |
| 17 | Portsmouth FC | 1988-05-02 |
| 18 | Oxford United | 1988-05-02 |
| 19 | West Bromwich Albion | 1986-05-03 |
| 20 | Birmingham City | 1986-05-03 |
| 21 | Stoke City | 1985-05-17 |
| 22 | Wolverhampton Wanderers | 1984-05-07 |
| 23 | Swansea City | 1983-05-14 |
| 24 | Brighton Hove Albion | 1983-05-07 |
| 25 | Bristol City | 1980-04-26 |
| 26 | Burnley FC | 1976-04-24 |
| 27 | Carlisle United | 1975-04-19 |
| 28 | Huddersfield Town | 1972-04-22 |
| 29 | Blackpool FC | 1971-05-01 |
| 30 | Fulham FC | 1968-05-04 |
| 31 | Northampton Town | 1966-04-25 |
| 32 | Leyton Orient | 1963-05-07 |
| 33 | Cardiff City | 1962-04-23 |
| 34 | Preston North End | 1961-04-22 |
| 35 | Grimsby Town | 1948-04-24 |
| 36 | Brentford FC | 1947-05-26 |
| 37 | Bury FC | 1929-05-04 |
| 38 | Bradford Park Avenue | 1921-05-07 |
| 39 | Glossop North End | 1900-04-28 |
import matplotlib.pyplot as plt
# Create one-column DataFrame for all home/away teams
home_games = df[['Home_Team']].copy()
away_games = df[['Away_Team']].copy()
home_games = home_games.rename(columns={'Home_Team': 'Team'})
away_games = away_games.rename(columns={'Away_Team': 'Team'})
all_games = pd.concat([home_games, away_games], ignore_index=True)
# Group by count of games
all_games_count = all_games.groupby('Team').size().reset_index()
all_games_count = all_games_count.rename(columns={0: 'Number_of_Games'})
# Sort DataFrame
all_games_count = all_games_count.sort_values(by=['Number_of_Games'], ascending=False)
# Normalize the data to be between 0 and 1 for colouring
norm = plt.Normalize(min(all_games_count['Number_of_Games']), max(all_games_count['Number_of_Games']))
colors = plt.cm.Purples(norm(all_games_count['Number_of_Games']))
# Plot the bar chart
ax = all_games_count.plot.bar(x='Team',
y='Number_of_Games',
rot=90,
fontsize=9,
figsize=(15,8),
color=colors, # Colours defined at top applied here
edgecolor='black',
linewidth=1,
legend= None,
title=("Number of top-flight English football league games played in the 20th century"))
# Set the y-axis limit (so data labels not cut off)
plt.ylim(top=max(all_games_count['Number_of_Games'])+max(all_games_count['Number_of_Games']/12))
# Add data labels on top of the bars
for i, v in enumerate(all_games_count['Number_of_Games']):
plt.text(i-0.3, v+max(all_games_count['Number_of_Games']/50), str(v), rotation=90)
# Sum number of home goals for home team
total_home_goals = df.groupby(['Home_Team'])['Home_Goals'].sum().reset_index()
# Sum number of away goals for away team
total_away_goals = df.groupby(['Away_Team'])['Away_Goals'].sum().reset_index()
# Merge (join) above together
total_goals = pd.merge(total_home_goals, total_away_goals, how="outer" , left_on = ["Home_Team"], right_on = ["Away_Team"])
total_goals = total_goals.rename(columns={'Home_Team': 'Team'})
# Add home and away goals (to get total goals)
total_goals['Goals'] = total_goals['Home_Goals']+total_goals['Away_Goals']
total_goals= total_goals.drop(['Away_Goals', 'Home_Goals', 'Away_Team'], axis=1)
total_goals = total_goals.sort_values(by=['Goals'], ascending=False)
# Normalize the data to be between 0 and 1 for colouring
norm = plt.Normalize(min(total_goals['Goals']), max(total_goals['Goals']))
colors = plt.cm.Purples(norm(total_goals['Goals']))
# Plot the bar chart
ax = total_goals.plot.bar(x='Team',
y='Goals',
rot=90,
fontsize=9,
figsize=(15,8),
color=colors, # Colours defined at top applied here
edgecolor='black',
linewidth=1,
legend= None,
title="Number of top-flight English football league goals scored in the 20th century")
# Set the y-axis limit (so data labels not cut off)
plt.ylim(top=max(total_goals['Goals'])+max(total_goals['Goals']/12))
# Add data labels on top of the bars
for i, v in enumerate(total_goals['Goals']):
plt.text(i-0.3, v+max(total_goals['Goals']/50), str(v), rotation=90)
mean_goals = df['Total_Goals'].mean() # Get mean goals for average line
max_goals = df['Total_Goals'].max() # Get max goals to set max bin size
plt.hist(df['Total_Goals'],
color='#8002b5',
edgecolor='black',
bins=np.arange(max_goals+1)-0.5)
plt.axvline(mean_goals, color='r', linestyle='dashed')
plt.xlabel('Number of goals')
plt.ylabel('Number of games')
plt.title('Goals per game')
Text(0.5, 1.0, 'Goals per game')
games_per_day = sns.countplot(x=df["Game_Day_of_Week"],
palette=['#8002b5'],
edgecolor='black',
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
games_per_day.set(ylabel='Number of games',
xlabel='Day of week played',
title='Number of games by day of week')
games_per_day.tick_params(axis='x', rotation=90)
sizes = [home_wins, home_draws, home_losses]
labels = ['Home win', 'Home draw', 'Home loss']
plt.pie(sizes, labels=labels)
colors = ['green','yellow','red']
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
plt.title("Portion of home games won, drawn, and lost")
Text(0.5, 1.0, 'Portion of home games won, drawn, and lost')