Below is a simple analysis of ten Italian Serie A football seasons (1990/1991 to 1999/2000). The analysis is conducted with SQL. The dataset, which I import from a CSV, was retrieved from this URL: https://www.kaggle.com/datasets/schochastics/domestic-football-results-from-1888-to-2019¶

Import Italian data¶

In [1]:
import pandas as pd
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

iter_csv = pd.read_csv(r'football_results.csv', iterator=True, chunksize=1000)
italian_football = pd.concat([chunk[chunk['competition'] == 'italy'] for chunk in iter_csv])
In [2]:
italian_football.head()
Out[2]:
home away date gh ga full_time competition home_ident away_ident home_country away_country home_code away_code home_continent away_continent continent level
36293 Juventus Ssc Napoli 1929-10-06 3 2 F italy Juventus (Italy) Ssc Napoli (Italy) italy italy IT IT Europe Europe Europe national
36294 Lazio Roma Bologna FC 1929-10-06 3 0 F italy Lazio Roma (Italy) Bologna FC (Italy) italy italy IT IT Europe Europe Europe national
36295 As Livorno Inter 1929-10-06 1 2 F italy As Livorno (Italy) Inter (Italy) italy italy IT IT Europe Europe Europe national
36296 AC Milan Brescia Calcio 1929-10-06 4 1 F italy AC Milan (Italy) Brescia Calcio (Italy) italy italy IT IT Europe Europe Europe national
36297 Calcio Padova Modena FC 1929-10-06 1 3 F italy Calcio Padova (Italy) Modena FC (Italy) italy italy IT IT Europe Europe Europe national

Sort teams by average goals per game [100+ games played]¶

In [3]:
goals_scored_per_game = pysqldf("""

WITH all_teams AS (

SELECT
    home AS Team,
    gh AS Goals,
    date
FROM
    italian_football
    
UNION ALL

SELECT
    away AS Team,
    ga AS Goals,
    date
FROM
    italian_football
    
)

SELECT
    Team,
    SUM(Goals) AS Total_Goals,
    COUNT(Goals) AS Total_Games_Played,
    ROUND(AVG(Goals), 2) AS Goals_per_Games

FROM
    all_teams
    
WHERE
    DATE(date) >= DATE('1990-08-15') AND DATE(date) <= DATE('2000-06-15')

GROUP BY
    Team

HAVING
    Total_Games_Played >=100
    
ORDER BY
    Goals_per_Games DESC

""")

goals_scored_per_game
Out[3]:
Team Total_Goals Total_Games_Played Goals_per_Games
0 Lazio Roma 566 340 1.66
1 AC Milan 538 340 1.58
2 Juventus 530 340 1.56
3 Sampdoria 469 306 1.53
4 Acf Fiorentina 465 306 1.52
5 Inter 509 340 1.50
6 As Roma 493 340 1.45
7 Udinese Calcio 340 238 1.43
8 Parma FC 462 340 1.36
9 US Foggia 175 136 1.29
10 Ac Perugia Calcio 127 102 1.25
11 Bologna FC 210 170 1.24
12 Ssc Napoli 314 272 1.15
13 Genoa CFc 193 170 1.14
14 Torino FC 266 238 1.12
15 Cagliari Calcio 340 306 1.11
16 As Bari 259 238 1.09
17 Atalanta 253 238 1.06
18 Vicenza Calcio 142 136 1.04
19 US Cremonese 139 136 1.02
20 Hellas Verona 102 102 1.00
21 Brescia Calcio 99 102 0.97
22 Piacenza Calcio 188 204 0.92
23 US Lecce 113 136 0.83
24 AC Reggiana 81 102 0.79

Sort teams my most wins [top 10]¶

In [4]:
total_home_games = pysqldf("""

SELECT
    CASE
        WHEN gh > ga THEN home
        WHEN gh = ga THEN ''
        ELSE away
    END AS 'Winner',
    COUNT(*) AS Number_of_Wins
    
FROM
    italian_football
    
WHERE
    DATE(date) >= DATE('1990-08-15') AND DATE(date) <= DATE('2000-06-15') AND Winner <> ''
    
GROUP BY
    Winner
    
ORDER BY
    Number_of_Wins DESC
    
LIMIT 10

""")

total_home_games
Out[4]:
Winner Number_of_Wins
0 Juventus 179
1 AC Milan 173
2 Lazio Roma 157
3 Parma FC 155
4 Inter 151
5 As Roma 129
6 Sampdoria 124
7 Acf Fiorentina 109
8 Udinese Calcio 91
9 Cagliari Calcio 84

Sort days of week by number of games played¶

In [5]:
days_of_week = pysqldf("""

SELECT
    CASE strftime('%w', date)
        WHEN '0' THEN 'Sunday'
        WHEN '1' THEN 'Monday'
        WHEN '2' THEN 'Tuesday'
        WHEN '3' THEN 'Wednesday'
        WHEN '4' THEN 'Thursday'
        WHEN '5' THEN 'Friday'
        WHEN '6' THEN 'Saturday'
    END AS Day_of_Week,
    COUNT(*) AS Games_Played,
    ROUND((COUNT(*) * 100.0 / (SELECT COUNT(*) FROM italian_football WHERE DATE(date) >= DATE('1990-08-15') AND DATE(date) <= DATE('2000-06-15'))), 2) AS Percent_of_Total
    
FROM
    italian_football
    
WHERE    
    DATE(date) >= DATE('1990-08-15') AND DATE(date) <= DATE('2000-06-15')    
    
GROUP BY
    Day_of_Week
    
ORDER BY
    Games_Played DESC

""")

days_of_week
Out[5]:
Day_of_Week Games_Played Percent_of_Total
0 Sunday 2751 89.90
1 Saturday 226 7.39
2 Wednesday 43 1.41
3 Thursday 17 0.56
4 Monday 12 0.39
5 Friday 11 0.36

Highest scoring game(s)¶

In [6]:
highest_scoring = pysqldf("""

SELECT
    home AS Home_Team,
    away AS Away_Team,
    date AS Date,
    gh AS Home_Goals,
    ga AS Away_Goals,
    ga + gh AS Total_Goals

FROM
    italian_football

WHERE
    DATE(date) >= DATE('1990-08-15') AND DATE(date) <= DATE('2000-06-15')
    AND Total_Goals = (SELECT MAX(ga + gh) FROM italian_football WHERE DATE(date) >= DATE('1990-08-15') AND DATE(date) <= DATE('2000-06-15'))

ORDER BY
    Total_Goals DESC

""")

highest_scoring.head(10)
Out[6]:
Home_Team Away_Team Date Home_Goals Away_Goals Total_Goals
0 US Foggia AC Milan 1992-05-24 2 8 10
1 Acf Fiorentina AC Milan 1992-10-04 3 7 10
2 Lazio Roma Acf Fiorentina 1995-03-05 8 2 10
3 Acf Fiorentina Calcio Padova 1996-04-06 6 4 10
4 Inter Calcio Padova 1996-04-14 8 2 10

Most common scorelines [top 5]¶

In [7]:
most_common_scoreline = pysqldf("""

WITH scorelines AS (

SELECT
    ga || " - " || gh  AS Scoreline
    
FROM
    italian_football
)

SELECT
    Scoreline,
    COUNT(scoreline) AS Games_With_This_Scoreline
    
FROM
    scorelines
    
GROUP BY
    Scoreline
    
ORDER BY
    Games_With_This_Scoreline DESC
    
LIMIT 5

""")

most_common_scoreline
Out[7]:
Scoreline Games_With_This_Scoreline
0 1 - 1 3473
1 0 - 1 3267
2 0 - 0 2951
3 1 - 2 2389
4 0 - 2 2127

AC Milan average home goals season-over-season¶

In [8]:
total_home_games = pysqldf("""
WITH ac_milan_gh_by_year AS (
    SELECT
        home AS Home,
        CASE
            WHEN CAST(strftime('%m', date) AS INTEGER) >= 8 THEN CAST(strftime('%Y', date) AS TEXT) || '/' || CAST(CAST(strftime('%Y', date) AS INTEGER) + 1 AS TEXT)
            ELSE CAST(CAST(strftime('%Y', date) AS INTEGER) - 1 AS TEXT) || '/' || CAST(strftime('%Y', date) AS TEXT)
        END AS Season,
        ROUND(AVG(gh), 2) AS Averge_Home_Goals
    FROM
        italian_football
    WHERE
        date >= '1989-08-15' AND date <= '2000-06-15' AND home = 'AC Milan'
    GROUP BY
        Season
)
SELECT
    *,
    Averge_Home_Goals - LAG(Averge_Home_Goals, 1) OVER (ORDER BY Season) AS Difference_From_Previous_Season
FROM
    ac_milan_gh_by_year
LIMIT -1 OFFSET 1;
""")

total_home_games
Out[8]:
Home Season Averge_Home_Goals Difference_From_Previous_Season
0 AC Milan 1990/1991 1.65 -0.17
1 AC Milan 1991/1992 2.59 0.94
2 AC Milan 1992/1993 1.65 -0.94
3 AC Milan 1993/1994 1.29 -0.36
4 AC Milan 1994/1995 1.47 0.18
5 AC Milan 1995/1996 2.41 0.94
6 AC Milan 1996/1997 1.65 -0.76
7 AC Milan 1997/1998 0.94 -0.71
8 AC Milan 1998/1999 2.06 1.12
9 AC Milan 1999/2000 2.35 0.29

AC Milan vs Inter Milan derby outcomes¶

In [9]:
ac_vs_inter = pysqldf("""

SELECT 
    CASE
        WHEN gh > ga THEN home || ' win'
        WHEN gh = ga THEN 'Draw'
        ELSE away || ' win'
    END AS Outcome,
    COUNT(*) AS Number_of_Games
    
FROM
    italian_football
    
WHERE
    DATE(date) >= DATE('1990-08-15') AND DATE(date) <= DATE('2000-06-15')
    AND ((home = 'Inter' AND away = 'AC Milan') OR (home = 'AC Milan' AND away = 'Inter'))
    
GROUP BY
    Outcome
    
ORDER BY
    Number_of_Games DESC    

""")

ac_vs_inter
Out[9]:
Outcome Number_of_Games
0 Draw 9
1 Inter win 6
2 AC Milan win 5
In [ ]: