Data analysis and visualization using Python - European football league data

We'll limit our analysis to the 5 major European football leagues - The English Premier League (EPL), Serie A (Italy), La Liga (Spain), Bundesliga (Germany) and Ligue One (France).

The data has been collected from the following sources:

We have 5 zip files - one for each league
Each zip file unpacks into 9 CSV files (one for each season)
Each row in the CSV files corresponds to one league match

First, the standard boilerplate stuff:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import spearmanr
from IPython.display import HTML, display
from pandas import Series, DataFrame

import sys

# Always print NumPy arrays in full. NumPy rejects a NaN threshold
# ("threshold must be non-NAN"); sys.maxsize is the documented way to
# disable summarisation.
np.set_printoptions(threshold=sys.maxsize)
# Let pandas display up to 500 columns before eliding any.
pd.set_option('display.max_columns', 500)

import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import poisson,skellam
import scipy, scipy.stats

sns.set_style("white")

Let's try loading one of the files first and examine the structure
# Directory that holds the EPL season CSV files: "EPL" + FILE_SEP + "data".
EPL_DIR = FILE_SEP.join(("EPL", "data"))
# Read the 2018-19 season on its own and show its column labels.
epl18_19 = pd.read_csv(FILE_SEP.join((EPL_DIR, "season-1819_csv.csv")))
epl18_19.columns

Output:
Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG',
'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC',
'HY', 'AY', 'HR', 'AR'],
dtype='object')

We can rename the column headers to be more meaningful, like this :
# Maps the abbreviated CSV headers to descriptive column names.
# Goal and result columns are spelled out explicitly ...
new_cols = {
    "FTHG": "FullTime_HomeTeam_Goals",
    "FTAG": "FullTime_AwayTeam_Goals",
    "FTR": "FullTime_Result",
    "HTHG": "HalfTime_HomeTeam_Goals",
    "HTAG": "HalfTime_AwayTeam_Goals",
    "HTR": "HalfTime_Result",
}
# ... while the per-team statistics follow a regular "<H|A><stat code>" scheme,
# so they are generated (home entry first, then away, for each statistic).
for _stat_code, _stat_name in (
    ("S", "Shots"),
    ("ST", "ShotsOnTarget"),
    ("F", "FoulsCommitted"),
    ("C", "Corners"),
    ("Y", "YellowCards"),
    ("R", "RedCards"),
):
    for _side_code, _side_name in (("H", "HomeTeam"), ("A", "AwayTeam")):
        new_cols[_side_code + _stat_code] = _side_name + "_" + _stat_name
# Apply the descriptive names; headers without an entry in new_cols are kept.
epl18_19 = epl18_19.rename(columns=new_cols)

We can change the default index and use date as index instead
Lets build a function to load data for all seasons and all leagues in a generic way
def load_data(basedir, filename, season):
    """Load one season's match CSV and return it as a prepared DataFrame.

    The file ``basedir + FILE_SEP + filename`` is read, every row is tagged
    with ``season`` in a ``Season`` column, only the match/statistics columns
    listed below are kept, the abbreviated headers are renamed through
    ``new_cols`` and ``Date`` becomes the index. The resulting shape and the
    first rows are printed/displayed as a progress report.

    Parameters:
        basedir: directory containing the CSV file.
        filename: name of the CSV file inside ``basedir``.
        season: label stored in the ``Season`` column (e.g. "2018-2019").

    Returns:
        The prepared DataFrame, indexed by ``Date``.
    """
    print("Loading", season, "data from", filename)
    wanted = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
              'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF',
              'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'Season']
    raw = pd.read_csv(basedir + FILE_SEP + filename)
    raw['Season'] = season
    df = raw[wanted].rename(columns=new_cols).set_index('Date')
    print("Data Shape :", df.shape)
    display(HTML(df.head().to_html()))
    return df

We'll start with EPL data first
# Load the nine EPL seasons, newest first (2018-19 down to 2010-11).
# File names follow "season-<yy><yy+1>_csv.csv"; labels are "20yy-20yy+1".
(df18_19, df17_18, df16_17, df15_16, df14_15,
 df13_14, df12_13, df11_12, df10_11) = [
    load_data(EPL_DIR,
              "season-%02d%02d_csv.csv" % (yy, yy + 1),
              "20%02d-20%02d" % (yy, yy + 1))
    for yy in range(18, 9, -1)
]

Next, lets concatenate all EPL data together. Except 2018-19 season, which is ongoing, all other datasets have 380 rows. So final dataset will have 8*380 + 120 i.e. 3160 rows
# Stack the seasons oldest-first. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent and works on older versions too.
epl = pd.concat([df10_11, df11_12, df12_13, df13_14, df14_15,
                 df15_16, df16_17, df17_18, df18_19])
# sort_index() returns a new, sorted frame (shown as the cell output);
# `epl` itself keeps the concatenation order.
epl.sort_index()

Let's repeat the process for other leagues as well

La Liga:
LALIGA_DIR = "La-Liga" + FILE_SEP + "data"
# Load the nine La Liga seasons, newest first (2018-19 down to 2010-11).
(df18_19, df17_18, df16_17, df15_16, df14_15,
 df13_14, df12_13, df11_12, df10_11) = [
    load_data(LALIGA_DIR,
              "season-%02d%02d_csv.csv" % (yy, yy + 1),
              "20%02d-20%02d" % (yy, yy + 1))
    for yy in range(18, 9, -1)
]
# Stack oldest-first. DataFrame.append was removed in pandas 2.0, so use
# pd.concat, which behaves the same on older versions.
laliga = pd.concat([df10_11, df11_12, df12_13, df13_14, df14_15,
                    df15_16, df16_17, df17_18, df18_19])
# Displays a sorted copy; `laliga` itself is not reordered.
laliga.sort_index()

Serie A:
SERIE_A_DIR = "Serie-A" + FILE_SEP + "data"
# Load the nine Serie A seasons, newest first (2018-19 down to 2010-11).
(df18_19, df17_18, df16_17, df15_16, df14_15,
 df13_14, df12_13, df11_12, df10_11) = [
    load_data(SERIE_A_DIR,
              "season-%02d%02d_csv.csv" % (yy, yy + 1),
              "20%02d-20%02d" % (yy, yy + 1))
    for yy in range(18, 9, -1)
]
# Stack oldest-first. DataFrame.append was removed in pandas 2.0, so use
# pd.concat, which behaves the same on older versions.
serieA = pd.concat([df10_11, df11_12, df12_13, df13_14, df14_15,
                    df15_16, df16_17, df17_18, df18_19])
# Displays a sorted copy; `serieA` itself is not reordered.
serieA.sort_index()

Bundesliga:
BUNDESLIGA_DIR = "Bundesliga" + FILE_SEP + "data"
# Load the nine Bundesliga seasons, newest first (2018-19 down to 2010-11).
(df18_19, df17_18, df16_17, df15_16, df14_15,
 df13_14, df12_13, df11_12, df10_11) = [
    load_data(BUNDESLIGA_DIR,
              "season-%02d%02d_csv.csv" % (yy, yy + 1),
              "20%02d-20%02d" % (yy, yy + 1))
    for yy in range(18, 9, -1)
]
# Stack oldest-first. DataFrame.append was removed in pandas 2.0, so use
# pd.concat, which behaves the same on older versions.
bundesliga = pd.concat([df10_11, df11_12, df12_13, df13_14, df14_15,
                        df15_16, df16_17, df17_18, df18_19])
# Displays a sorted copy; `bundesliga` itself is not reordered.
bundesliga.sort_index()

Ligue One:
LIGUE1_DIR = "Ligue-1" + FILE_SEP + "data"
# Load the nine Ligue 1 seasons, newest first (2018-19 down to 2010-11).
(df18_19, df17_18, df16_17, df15_16, df14_15,
 df13_14, df12_13, df11_12, df10_11) = [
    load_data(LIGUE1_DIR,
              "season-%02d%02d_csv.csv" % (yy, yy + 1),
              "20%02d-20%02d" % (yy, yy + 1))
    for yy in range(18, 9, -1)
]
# Stack oldest-first. DataFrame.append was removed in pandas 2.0, so use
# pd.concat, which behaves the same on older versions.
ligue1 = pd.concat([df10_11, df11_12, df12_13, df13_14, df14_15,
                    df15_16, df16_17, df17_18, df18_19])
# Displays a sorted copy; `ligue1` itself is not reordered.
ligue1.sort_index()

Let's append all the leagues into a single dataframe:
# Tag every match with its league before the frames are merged.
epl['League'] = 'EPL'
laliga['League'] = 'LaLiga'
serieA['League'] = 'SerieA'
bundesliga['League'] = 'Bundesliga'
ligue1['League'] = 'Ligue1'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the five
# league frames in the same order on every pandas version.
data = pd.concat([epl, laliga, serieA, bundesliga, ligue1])
# Displays a sorted copy; `data` itself keeps the concatenation order.
data.sort_index()

Let's check for rows with data missing for one or more columns:
plt.cla()
plt.clf()
# Number of null cells in each column of the combined frame.
missing_data = data.isnull().sum()
ax = missing_data.plot(kind = 'bar', colormap='summer')
ax.set_ylim(0, 6)
plt.title("No data counts")
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_mid = bar.get_x() + bar.get_width() / 2.
    ax.annotate("%.0f" % bar_top, (bar_mid, bar_top), ha='center',
                va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

Lets try populating the null values now
We can assume that half-time results/goals were in line with full-time results/goals whenever the half-time data is empty. So a missing half-time result is taken from the full-time result, and missing half-time goals are estimated as full-time goals / 2 (rounded down).
def _or_else(row, column, fallback):
    """Return row[column], or fallback(row) when that cell is null."""
    value = row[column]
    return fallback(row) if pd.isnull(value) else value

# Missing half-time result: reuse the full-time result.
data['HalfTime_Result'] = data.apply(
    _or_else, axis=1,
    args=('HalfTime_Result', lambda r: r['FullTime_Result']),
)
# Missing half-time goals: half of the full-time goals, truncated to an int.
data['HalfTime_HomeTeam_Goals'] = data.apply(
    _or_else, axis=1,
    args=('HalfTime_HomeTeam_Goals', lambda r: int(r['FullTime_HomeTeam_Goals']/2)),
)
data['HalfTime_AwayTeam_Goals'] = data.apply(
    _or_else, axis=1,
    args=('HalfTime_AwayTeam_Goals', lambda r: int(r['FullTime_AwayTeam_Goals']/2)),
)

When data is missing we can assume that total no of shots/shots on target is same as the number of goals
# Each shots column falls back to the same side's full-time goals when null.
for shots_col, goals_col in (
    ('HomeTeam_Shots', 'FullTime_HomeTeam_Goals'),
    ('HomeTeam_ShotsOnTarget', 'FullTime_HomeTeam_Goals'),
    ('AwayTeam_Shots', 'FullTime_AwayTeam_Goals'),
    ('AwayTeam_ShotsOnTarget', 'FullTime_AwayTeam_Goals'),
):
    # Bind the column names as defaults so the row function uses this pair.
    data[shots_col] = data.apply(
        lambda row, s=shots_col, g=goals_col: row[g] if pd.isnull(row[s]) else row[s],
        axis=1
    )

Fouls committed, corners and red/yellow cards missing data can be populated based on average number of fouls committed/red or yellow cards in other matches involving these teams
# Matches where the foul count of either side is null.
fouls_missing = data[
    data[['HomeTeam_FoulsCommitted', 'AwayTeam_FoulsCommitted']].isnull().any(axis=1)
]

Foul information is missing for the following matches :
  • Lyon vs Marseille
  • Caen vs Nancy
  • Bastia vs Lyon
  • Cagliari vs Roma
We can use aggregate data from other matches involving these teams to populate the missing data
# Mean fouls per (home, away) pairing; mean() skips the null entries.
# The columns are selected with a list: tuple-style selection on a groupby
# (['a','b'] without the inner brackets) is rejected by current pandas.
avg_foul_data = data.groupby(['HomeTeam', 'AwayTeam'])[
    ['HomeTeam_FoulsCommitted', 'AwayTeam_FoulsCommitted']
].mean()

# Fill the missing foul counts of each affected fixture with the average
# from the other meetings of the same home/away pairing.
for home_team, away_team in (
    ('Cagliari', 'Roma'),
    ('Lyon', 'Marseille'),
    ('Caen', 'Nancy'),
    ('Bastia', 'Lyon'),
):
    is_fixture = (data['HomeTeam'] == home_team) & (data['AwayTeam'] == away_team)
    for foul_col in ('HomeTeam_FoulsCommitted', 'AwayTeam_FoulsCommitted'):
        data.loc[data[foul_col].isnull() & is_fixture, foul_col] = \
            avg_foul_data.loc[(home_team, away_team), foul_col]

We can use the same strategy to populate the missing card data as well:
# Matches where any yellow/red card count is null.
# (The original statement lacked the closing "]" and did not parse.)
cards_missing = data[
    (data['HomeTeam_YellowCards'].isnull())
    | (data['AwayTeam_YellowCards'].isnull())
    | (data['HomeTeam_RedCards'].isnull())
    | (data['AwayTeam_RedCards'].isnull())
]
Card information is missing for the following matches :
  1. Cagliari vs Roma
  2. Roma vs Juventus
  3. Bastia vs Lyon
We can use other matches involving these teams to populate the missing data
# Mean card counts per (home, away) pairing; mean() skips the null entries.
# Columns are selected with a list because tuple-style selection on a
# groupby is rejected by current pandas.
avg_card_data = data.groupby(['HomeTeam', 'AwayTeam'])[
    ['HomeTeam_YellowCards', 'HomeTeam_RedCards',
     'AwayTeam_YellowCards', 'AwayTeam_RedCards']
].mean()

# Fill the missing card counts of each affected fixture with the average
# from the other meetings of the same home/away pairing.
for home_team, away_team in (
    ('Cagliari', 'Roma'),
    ('Roma', 'Juventus'),
    ('Bastia', 'Lyon'),
):
    is_fixture = (data['HomeTeam'] == home_team) & (data['AwayTeam'] == away_team)
    for card_col in ('HomeTeam_YellowCards', 'HomeTeam_RedCards',
                     'AwayTeam_YellowCards', 'AwayTeam_RedCards'):
        data.loc[data[card_col].isnull() & is_fixture, card_col] = \
            avg_card_data.loc[(home_team, away_team), card_col]

The same strategy can be used to populate missing corner information as well:
# Matches where the corner count of either side is null.
corners_missing = data[
    data[['HomeTeam_Corners', 'AwayTeam_Corners']].isnull().any(axis=1)
]

Corner information is missing for the following matches :
  1. Cagliari vs Roma
  2. Bastia vs Lyon
We can use other matches involving these teams to populate the missing data
# Mean corners per (home, away) pairing; mean() skips the null entries.
# Columns are selected with a list because tuple-style selection on a
# groupby is rejected by current pandas.
avg_corner_data = data.groupby(['HomeTeam', 'AwayTeam'])[
    ['HomeTeam_Corners', 'AwayTeam_Corners']
].mean()

# Fill each affected fixture from the average of ITS OWN pairing. The
# original filled Bastia vs Lyon with the Cagliari vs Roma average, a
# copy-paste slip that this loop removes.
for home_team, away_team in (('Cagliari', 'Roma'), ('Bastia', 'Lyon')):
    is_fixture = (data['HomeTeam'] == home_team) & (data['AwayTeam'] == away_team)
    for corner_col in ('HomeTeam_Corners', 'AwayTeam_Corners'):
        data.loc[data[corner_col].isnull() & is_fixture, corner_col] = \
            avg_corner_data.loc[(home_team, away_team), corner_col]

Now lets add the following extra columns :
# Winner / Loser: team name by full-time result, NaN for a draw.
# np.nan replaces np.NaN (the alias was removed in NumPy 2.0), and the
# placeholder assignments are dropped because apply() builds the whole column.
data['Winner'] = data.apply(
    lambda row: row['HomeTeam'] if row['FullTime_Result'] == 'H'
    else row['AwayTeam'] if row['FullTime_Result'] == 'A'
    else np.nan,
    axis=1
)
data['Loser'] = data.apply(
    lambda row: row['HomeTeam'] if row['FullTime_Result'] == 'A'
    else row['AwayTeam'] if row['FullTime_Result'] == 'H'
    else np.nan,
    axis=1
)
# Share of shots that were on target, in percent.
data['HomeTeam_ShotsOnTarget_Percent'] = data['HomeTeam_ShotsOnTarget']*100/data['HomeTeam_Shots']
data['AwayTeam_ShotsOnTarget_Percent'] = data['AwayTeam_ShotsOnTarget']*100/data['AwayTeam_Shots']
# Saves by a team = opponent's shots on target minus opponent's goals.
data['HomeTeam_GoalSaves'] = data['AwayTeam_ShotsOnTarget'] - data['FullTime_AwayTeam_Goals']
data['AwayTeam_GoalSaves'] = data['HomeTeam_ShotsOnTarget'] - data['FullTime_HomeTeam_Goals']
data['TotalGoals'] = data['FullTime_HomeTeam_Goals'] + data['FullTime_AwayTeam_Goals']
# Card totals per side and per match (yellow + red).
data['HomeTeam_TotalCards'] = data['HomeTeam_YellowCards'] + data['HomeTeam_RedCards']
data['AwayTeam_TotalCards'] = data['AwayTeam_YellowCards'] + data['AwayTeam_RedCards']
data['TotalCards'] = data['HomeTeam_TotalCards'] + data['AwayTeam_TotalCards']

Let's have a look at the number of matches per season per league. If it's not uniform, then we might need to adjust the aggregate data accordingly before comparing (we'll ignore the current season, as not every league starts at the same time)
plt.cla()
plt.clf()
plt.figure(figsize=(4,4))

# Completed seasons only: the 2018-2019 season is excluded.
past = data[data['Season']!='2018-2019']

# Matches per league, most frequent first; drives both bar order and labels.
league_counts = past["League"].value_counts()
ax = sns.countplot(y = past["League"], order=league_counts.index, linewidth = 1, edgecolor = "k")
for row_no, n_matches in enumerate(league_counts.values):
    ax.text(.5, row_no, n_matches, weight = "bold", color="white")
plt.title("Total # of matches grouped by league")
plt.show()


Clearly the Bundesliga has fewer matches than the other leagues. Let's see whether the number of matches per league varied across seasons
plt.cla()
plt.clf()
plt.figure(figsize=(10,10))
# One horizontal bar per (season, league); countplot draws on the current
# axes and returns it, so title and x-limit are set on that object.
season_axes = sns.countplot(hue=past["League"], y=past["Season"])
season_axes.set_title("# of matches per season per league")
season_axes.set_xlim(0, 500)
plt.show()
So for a particular league, the number of matches per season has remained consistent over the years. So we can safely compare aggregate data for a league without adjusting (e.g total goals in EPL per season). However when we compare other league aggregate data with Bundesliga, the data might need some adjustment.

Bundesliga has 18 teams and 306 games (34 games for each team) per season whereas other leagues have 20 teams and 380 games a season (38 games for each team)
Hence when we compare data across leagues, aggregate data (such as total/count etc) for Bundesliga has to be scaled up accordingly in order to do a fair comparison
# League sizes: 18 teams in the Bundesliga, 20 in the other leagues.
_TEAMS_BUNDESLIGA, _TEAMS_OTHER = 18, 20
# Games per team per season are 2 * (teams - 1): 38 vs 34.
BUNDESLIGA_TEAM_SCALEUP = (2 * (_TEAMS_OTHER - 1)) / (2 * (_TEAMS_BUNDESLIGA - 1))
# Matches per season are teams * (teams - 1): 380 vs 306.
BUNDESLIGA_SEASON_SCALEUP = (
    (_TEAMS_OTHER * (_TEAMS_OTHER - 1)) / (_TEAMS_BUNDESLIGA * (_TEAMS_BUNDESLIGA - 1))
)

In every season, each team plays every other team in its league twice - once as the home team and once as the away team. So the number of unique values in the HomeTeam and AwayTeam columns should be identical
# Report how many distinct team names appear on each side of the fixture.
for side in ("Home", "Away"):
    print(side + " Team unique values #", len(data[side + 'Team'].unique()))

Output:
Home Team unique values # 163
Away Team unique values # 163

Let's look at the total number of teams per league as well as the average number of teams per season
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2 , figsize=(16,5))

# Left panel: distinct home teams per league over the whole data set.
teams_overall = data.groupby('League')['HomeTeam'].nunique()
ax0 = teams_overall.plot(kind = 'bar', ax=axes[0])
ax0.set_ylim(0, 40)
ax0.set_title('Total number of teams that played since 2010')

# Right panel: distinct home teams per league within one season.
data2017_18 = data[data['Season']=='2017-2018'] # any year should have been fine
teams_one_season = data2017_18.groupby('League')['HomeTeam'].nunique()
ax1 = teams_one_season.plot(kind = 'bar', ax=axes[1])
ax1.set_ylim(0, 25)
ax1.set_title('Number of teams playing in each season')

# Label every bar of both panels with its height.
for panel in (ax0, ax1):
    for bar in panel.patches:
        bar_top = bar.get_height()
        panel.annotate("%.2f" % bar_top, (bar.get_x() + bar.get_width() / 2., bar_top),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()


The difference between the two numbers indicates that there were occasions where teams were relegated to lower tier league or top teams from lower tier leagues were promoted
Let's look at the number of seasons each team featured in the league :

A. Bundesliga
# Number of distinct seasons in which each team appears as a Bundesliga
# home team, largest first. Note: `bundesliga` is rebound to this Series.
bundesliga = (
    data.loc[data['League']=='Bundesliga', ['HomeTeam','Season']]
    .groupby('HomeTeam').Season.nunique()
    .sort_values(ascending=False)
)

plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1 , figsize=(16,5))
ax = bundesliga.plot(kind='bar')
ax.set_ylim(0, 10)
ax.set_title('Number of times each team featured in Bundesliga')
# Label each bar with its season count.
for bar in ax.patches:
    seasons = bar.get_height()
    ax.annotate("%.0f" % seasons, (bar.get_x() + bar.get_width() / 2., seasons),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
The teams that have never been relegated from Bundesliga are :
1. Wolfsburg
2. Mainz
3. Bayern Munich
4. Dortmund
5. Werder Bremen
6. Leverkusen
7. M'gladbach
8. Hoffenheim
9. Schalke04
Rest of the teams have been relegated at least once
9 out of 18 or 50 % of teams have remained consistent across last 9 seasons of Bundesliga

B. EPL
# Number of distinct seasons in which each team appears as an EPL home
# team, largest first. Note: `epl` is rebound to this Series.
epl = (
    data.loc[data['League']=='EPL', ['HomeTeam','Season']]
    .groupby('HomeTeam').Season.nunique()
    .sort_values(ascending=False)
)

plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1 , figsize=(16,5))
ax = epl.plot(kind='bar')
ax.set_ylim(0, 10)
ax.set_title('Number of times each team featured in English Premier League')
# Label each bar with its season count.
for bar in ax.patches:
    seasons = bar.get_height()
    ax.annotate("%.0f" % seasons, (bar.get_x() + bar.get_width() / 2., seasons),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
Clearly the teams that have never been relegated from EPL are :
1. Liverpool
2. Man United
3. Chelsea
4. Everton
5. Man City
6. Arsenal
7. Tottenham
Rest of the teams have been relegated at least once
7 out of 20 or 35 % of teams have remained consistent across last 9 seasons of EPL

C. La Liga
# Number of distinct seasons in which each team appears as a La Liga home
# team, largest first. Note: `laliga` is rebound to this Series.
laliga = (
    data.loc[data['League']=='LaLiga', ['HomeTeam','Season']]
    .groupby('HomeTeam').Season.nunique()
    .sort_values(ascending=False)
)
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1 , figsize=(16,5))
ax = laliga.plot(kind='bar')
ax.set_ylim(0, 10)
ax.set_title('Number of times each team featured in La Liga')
# Label each bar with its season count.
for bar in ax.patches:
    seasons = bar.get_height()
    ax.annotate("%.0f" % seasons, (bar.get_x() + bar.get_width() / 2., seasons),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
Clearly the teams that have never been relegated from LaLiga are :
  1. Espanol
  2. Valencia
  3. Sociedad
  4. Sevilla
  5. Ath Bilbao
  6. Real Madrid
  7. Ath Madrid
  8. Barcelona
Rest of the teams have been relegated at least once
8 out of 20 or 40 % of teams have remained consistent across last 9 seasons of La Liga

D. Serie A
# Number of distinct seasons in which each team appears as a Serie A home
# team, largest first. Note: `serieA` is rebound to this Series.
serieA = (
    data.loc[data['League']=='SerieA', ['HomeTeam','Season']]
    .groupby('HomeTeam').Season.nunique()
    .sort_values(ascending=False)
)
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1 , figsize=(16,5))
ax = serieA.plot(kind='bar')
ax.set_ylim(0, 10)
ax.set_title('Number of times each team featured in Serie A')
# Label each bar with its season count.
for bar in ax.patches:
    seasons = bar.get_height()
    ax.annotate("%.0f" % seasons, (bar.get_x() + bar.get_width() / 2., seasons),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
The teams that have never been relegated from Serie A are :
  1. Juventus
  2. Napoli
  3. Chievo
  4. Fiorentina
  5. Genoa
  6. Roma
  7. Inter Milan
  8. Udinese
  9. Lazio
  10. AC Milan
Rest of the teams have been relegated at least once
10 out of 20 or 50 % of teams have remained consistent across last 9 seasons of Serie A

E. Ligue One
# Number of distinct seasons in which each team appears as a Ligue 1 home
# team, largest first. Note: `ligue1` is rebound to this Series.
ligue1 = (
    data.loc[data['League']=='Ligue1', ['HomeTeam','Season']]
    .groupby('HomeTeam').Season.nunique()
    .sort_values(ascending=False)
)
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1 , figsize=(16,5))
ax = ligue1.plot(kind='bar')
ax.set_ylim(0, 10)
ax.set_title('Number of times each team featured in Ligue 1')
# Label each bar with its season count.
for bar in ax.patches:
    seasons = bar.get_height()
    ax.annotate("%.0f" % seasons, (bar.get_x() + bar.get_width() / 2., seasons),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
Clearly the teams that have never been relegated from Ligue 1 are :
  1. Lyon
  2. Rennes
  3. Marseille
  4. Montpellier
  5. Bordeaux
  6. Paris SG
  7. Nice
  8. St Etienne
  9. Toulouse
  10. Lille
Rest of the teams have been relegated at least once
10 out of 20 or 50 % of teams have remained consistent across last 9 seasons of Ligue One

In terms of relegation, EPL has been the most volatile league over past 9 years. Only 35 % teams have been consistent. For other leagues, the percentage varies from 40 to 50 %

Lets look at overall distribution of home and away goals
plt.cla()
plt.clf()
plt.figure(figsize=(12,6))
home_goals = data["FullTime_HomeTeam_Goals"]
away_goals = data["FullTime_AwayTeam_Goals"]
# Density curves first, then a dashed vertical line at each mean; the draw
# order also fixes the legend order.
sns.kdeplot(home_goals, shade=True, color="b", label="home goals")
sns.kdeplot(away_goals, shade=True, color="r", label="away goals")
plt.axvline(home_goals.mean(), linestyle = "dashed", color="b", label="home goals mean")
plt.axvline(away_goals.mean(), linestyle = "dashed", color="r", label="away goals mean")
plt.legend(loc="best", prop = {"size" : 12})
plt.title("Distribution of Home and Away Goals")
plt.xlabel("goals")
plt.show()
The mean value for home goals is higher than that of away goals. So there might be some home advantage.
Also lets take a look at how the distribution of total goals in a match varies across leagues
plt.cla()
plt.clf()
plt.figure(figsize=(13,5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while the keyword form works on older releases as well.
sns.violinplot(x=data["League"], y=data["TotalGoals"], palette="rainbow")
plt.title("Distribution of Goals in different leagues")
plt.show()
For most matches, the total number of goals varies between 1 and 4. It's quite rare to see more than 6 goals in a match in any league
# Per-league and per-season means of the numeric columns.
# numeric_only=True skips the text columns (team names, results, ...);
# without it pandas 2.x raises a TypeError instead of dropping them silently.
grouped_by_leagues = data.groupby('League').mean(numeric_only=True)
grouped_by_seasons = data.groupby('Season').mean(numeric_only=True)

Lets look at average number of goals by home/away team in every league
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2 , figsize=(16,5))

# Left: mean home goals per league. Right: mean away goals per league.
ax0 = grouped_by_leagues['FullTime_HomeTeam_Goals'].plot(kind='bar', ax = axes[0])
ax0.set_ylim(0, 1.8)
ax0.set_title('Average number of Goals by Home Teams')
ax1 = grouped_by_leagues['FullTime_AwayTeam_Goals'].plot(kind='bar', ax = axes[1])
ax1.set_ylim(0, 1.8)
ax1.set_title('Average number of Goals by Away Teams')

# Label every bar of both panels with its value.
for panel in (ax0, ax1):
    for bar in panel.patches:
        bar_top = bar.get_height()
        panel.annotate("%.2f" % bar_top, (bar.get_x() + bar.get_width() / 2., bar_top),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
Clearly in every league there's a significant home advantage
Lets look at the top teams in terms of number of home and away goals scored
# Total goals each team scored at home / away, highest scorers first.
home = (
    data.groupby("HomeTeam")["FullTime_HomeTeam_Goals"].sum().reset_index()
    .sort_values(by="FullTime_HomeTeam_Goals", ascending=False)
)
away = (
    data.groupby("AwayTeam")["FullTime_AwayTeam_Goals"].sum().reset_index()
    .sort_values(by="FullTime_AwayTeam_Goals", ascending=False)
)
def annotate(ax, data, count, color='k'):
    """Write the first ``count`` values of ``data`` onto ``ax`` as bold text.

    The k-th value (starting at 0) is placed at x = 0.5, y = k via
    ``ax.text``.

    Parameters:
        ax: object providing a matplotlib-style ``text`` method.
        data: sliceable sequence of values to print.
        count: number of leading values to print.
        color: text colour passed through to ``ax.text``.
    """
    row = 0
    for value in data[:count]:
        ax.text(.5, row, value, weight = "bold", color=color)
        row += 1
plt.cla()
plt.clf()
plt.figure(figsize=(13,10))
# One panel per side: (subplot position, table, team column, goal column,
# palette, title). Each shows the 15 highest-scoring teams.
for position, table, team_col, goal_col, palette_name, heading in (
    (121, home, "HomeTeam", "FullTime_HomeTeam_Goals", "summer", "Top Teams in terms of Home Goals"),
    (122, away, "AwayTeam", "FullTime_AwayTeam_Goals", "winter", "Top Teams in terms of Away Goals"),
):
    plt.subplot(position)
    ax = sns.barplot(y=team_col, x=goal_col, data=table[:15], palette=palette_name, linewidth = 1, edgecolor = "k")
    plt.ylabel('')
    plt.title(heading)
    # Print each team's goal total on its bar.
    annotate(ax, table[goal_col], 15, "black")
plt.show()
These two lists have a lot of teams in common. This indicates that good teams usually do well at home as well as away (However they score better at home than at away).

Let's look at variation of goal data across last 9 seasons
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2 , figsize=(16,5))

# Left: mean home goals per season. Right: mean away goals per season.
ax0 = grouped_by_seasons['FullTime_HomeTeam_Goals'].plot(kind='bar', ax = axes[0])
ax0.set_ylim(0, 1.8)
ax0.set_title('Average number of Goals by Home Teams')
ax1 = grouped_by_seasons['FullTime_AwayTeam_Goals'].plot(kind='bar', ax = axes[1])
ax1.set_ylim(0, 1.8)
ax1.set_title('Average number of Goals by Away Teams')

# Label every bar of both panels with its value.
for panel in (ax0, ax1):
    for bar in panel.patches:
        bar_top = bar.get_height()
        panel.annotate("%.2f" % bar_top, (bar.get_x() + bar.get_width() / 2., bar_top),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
Over the last 9 seasons, overall, away teams have improved (but only marginally) - from 1.12 goals per match to 1.22 goals per match
Lets look at the total number of goals as well. Let’s find out which league had the most goals. To make it a fair comparison we will adjust Bundesliga numbers (Bundesliga has 18 teams as opposed to 20 for other leagues)
# Total home, away and overall goals per league.
goal_columns = ["FullTime_HomeTeam_Goals", "FullTime_AwayTeam_Goals", "TotalGoals"]
goals_by_league = data.groupby("League").agg({col: "sum" for col in goal_columns})
# Scale the Bundesliga row up to a 380-match season so totals are comparable.
goals_by_league.loc["Bundesliga"] = BUNDESLIGA_SEASON_SCALEUP * goals_by_league.loc["Bundesliga"]
plt.cla()
plt.clf()
goals_by_league.plot(kind="barh", figsize = (10,10), edgecolor = "k", linewidth =1)
plt.title("Goals by league")
plt.legend(loc = "best", prop = {"size" : 14})
plt.xlabel("Total goals")
plt.show()
Clearly Ligue One produces fewer goals than the other leagues. After adjustment, the Bundesliga produces more goals than any other league.
Lets look at the season-wise data as well.
plt.cla()
plt.clf()
# Total home, away and overall goals per season (all leagues combined).
season_goal_totals = data.groupby("Season").agg({
    "FullTime_HomeTeam_Goals": "sum",
    "FullTime_AwayTeam_Goals": "sum",
    "TotalGoals": "sum",
})
season_goal_totals.plot(kind="barh", figsize = (10,10), edgecolor = "k", linewidth =1)
plt.title("Goals by Season")
plt.legend(loc = "best", prop = {"size" : 14})
plt.xlabel("Total goals")
plt.show()
From 2010-11 to 2013-14, number of goals increased each season but stopped growing after 2013-14 season (except for a spike in 2016-17)

Let's look at distribution of match results:
plt.cla()
plt.clf()
ax = plt.gca()
explode = (0, 0.1, 0)
result_counts = data.groupby('FullTime_Result')['FullTime_Result'].count()
# groupby sorts its keys, so the slices come out as 'A', 'D', 'H'. The
# original hard-coded the labels in H/A/D order, which mislabelled every
# slice; deriving them from the index keeps label and slice together.
result_names = {'A': 'Away Team Won', 'D': 'Draw', 'H': 'Home Team Won'}
labels = [result_names.get(code, code) for code in result_counts.index]
result_counts.plot(kind='pie', autopct='%1.2f%%', figsize=(4,4), explode=explode, labels=labels)
plt.title("Result distribution")
plt.show()

Overall, home team wins nearly 46 % of matches
# Readable names for the full-time result codes (A = away win, H = home win, D = draw).
result_dict = dict(
    A='Away Team Won',
    H='Home Team Won',
    D='Draw',
)

Lets look at distribution of match results in various league
plt.cla()
plt.clf()
# Row-normalised share of each full-time outcome per league.
result_share = pd.crosstab(
    data['League'],
    data['FullTime_Result'].map(result_dict),
    normalize='index',
)
result_share.plot(kind='bar', stacked=True, title="Home/Away win vs Draw (%) across Leagues", rot=60)
plt.show()
Home Team wins more than 40 % of matches in every league.
Home Advantage seems to be a bigger factor in La Liga as compared to other leagues
Ligue1 produces more draws than other leagues

Lets also look at score line at half time
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2 , figsize=(16,5))

# Left: mean half-time home goals per league. Right: the same for away teams.
ax0 = grouped_by_leagues['HalfTime_HomeTeam_Goals'].plot(kind='bar', ax = axes[0])
ax0.set_ylim(0, 0.8)
ax0.set_title('Average number of Goals by Home Teams at Half Time')

ax1 = grouped_by_leagues['HalfTime_AwayTeam_Goals'].plot(kind='bar', ax = axes[1])
ax1.set_ylim(0, 0.8)
ax1.set_title('Average number of Goals by Away Teams at Half Time')

# Label every bar of both panels with its value.
for panel in (ax0, ax1):
    for bar in panel.patches:
        bar_top = bar.get_height()
        panel.annotate("%.2f" % bar_top, (bar.get_x() + bar.get_width() / 2., bar_top),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
So, home team dominates right from the first half
plt.cla()
plt.clf()

# "Ideal" half-time goals: what half time would show if goals were spread
# evenly over both halves, i.e. full-time goals / 2.
grouped_by_leagues['HalfTime_HomeTeam_Goals(Ideal)'] = grouped_by_leagues['FullTime_HomeTeam_Goals']/2
grouped_by_leagues['HalfTime_AwayTeam_Goals(Ideal)'] = grouped_by_leagues['FullTime_AwayTeam_Goals']/2
fig, ax = plt.subplots(figsize=(14, 8))
tmp = grouped_by_leagues[['HalfTime_HomeTeam_Goals','FullTime_HomeTeam_Goals','HalfTime_AwayTeam_Goals','FullTime_AwayTeam_Goals','HalfTime_HomeTeam_Goals(Ideal)','HalfTime_AwayTeam_Goals(Ideal)']]
# One x position per league, derived from the table instead of assuming 5 rows.
x = np.arange(1, len(tmp) + 1)
labels = list(tmp.index)
# (column, colour, line style); the "ideal" series are dotted. Every line
# gets a label so that plt.legend() has entries to show - the original
# plotted unlabelled lines and produced an empty legend.
for column, color, linestyle in (
    ('HalfTime_HomeTeam_Goals', 'r', '-'),
    ('FullTime_HomeTeam_Goals', 'g', '-'),
    ('HalfTime_AwayTeam_Goals', 'k', '-'),
    ('FullTime_AwayTeam_Goals', 'b', '-'),
    ('HalfTime_HomeTeam_Goals(Ideal)', 'm', ':'),
    ('HalfTime_AwayTeam_Goals(Ideal)', 'y', ':'),
):
    plt.plot(x, tmp[column], color=color, marker='o', linestyle=linestyle, label=column)
plt.legend()
# The x axis is per league (tmp comes from grouped_by_leagues), not per season.
plt.title("Full Time vs Half Time Goals across Leagues")
plt.xticks(x, labels, rotation='vertical')
plt.show()

For both home and away teams, the average number of half-time goals is noticeably lower than half of the full-time goals
So both home and away teams typically score more (and presumably attack more) during the second half of the match

Lets look at the distribution of results at half time
plt.cla()
plt.clf()
ax = plt.gca()
# Slice labels in the order the grouped counts come out; this assumes the
# half-time result codes are 'A', 'D', 'H' (groupby sorts its keys).
labels = ['Away Team Ahead', 'Even', 'Home Team Ahead']
# Pull the middle ("Even") slice slightly out of the pie.
explode = (0, 0.1, 0)
half_time_counts = data.groupby('HalfTime_Result')['HalfTime_Result'].count()
half_time_counts.plot(
    kind='pie',
    autopct='%1.2f%%',
    figsize=(4, 4),
    explode=explode,
    labels=labels,
)
plt.title("Half time Result distribution")
plt.show()
Lets look at the half time and full time data together
# Per-league share of each result code: half time (df1) and full time (df2).
# normalize='index' makes every league's row sum to 1.
leagues = data['League']
df1 = pd.crosstab(leagues, data['HalfTime_Result'], normalize='index')
df2 = pd.crosstab(leagues, data['FullTime_Result'], normalize='index')
# Render both tables as HTML, half time first.
for table in (df1, df2):
    display(HTML(table.to_html()))
Lets look at the number of shots by home and away team
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2, figsize=(16, 5))
# Left panel: home teams, right panel: away teams.
shot_panels = [
    ('HomeTeam_Shots', 'Average number of Shots by Home Teams'),
    ('AwayTeam_Shots', 'Average number of Shots by Away Teams'),
]
for panel, (column, title) in zip(axes, shot_panels):
    chart = grouped_by_leagues[column].plot(kind='bar', ax=panel)
    # Same y range on both panels so the bars are directly comparable.
    chart.set_ylim(0, 16)
    chart.set_title(title)
    for bar in chart.patches:
        height = bar.get_height()
        chart.annotate("%.2f" % height, (bar.get_x() + bar.get_width() / 2., height),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()
Home Teams are ahead here as well
Lets look at the total number of shots as well. Total Number of shots = home team shots + away team shots
# Shots taken by both sides in each match.
data['TotalShots'] = data['HomeTeam_Shots'] + data['AwayTeam_Shots']
shot_columns = ["HomeTeam_Shots", "AwayTeam_Shots", "TotalShots"]
shots_by_league = data.groupby("League").agg({column: "sum" for column in shot_columns})
# Scale the Bundesliga totals by BUNDESLIGA_SEASON_SCALEUP (defined elsewhere in
# the file; presumably compensates for its shorter season - confirm there).
bundesliga_scaled = shots_by_league.loc["Bundesliga"] * BUNDESLIGA_SEASON_SCALEUP
shots_by_league.loc["Bundesliga"] = bundesliga_scaled
plt.cla()
plt.clf()
shots_by_league.plot(kind="barh", figsize=(10, 10), edgecolor="k", linewidth=1)
plt.title("Shots by league")
plt.legend(loc="best", prop={"size": 14})
plt.xlabel("Total shots")
plt.show()
Serie A, Bundesliga (adjusted) and EPL have the highest number of shots at goal per season. Interestingly, La Liga leads in the number of goals per season but not in the number of shots, whereas it is the other way round for Serie A
So, we can conclude that La Liga forwards are, in general, more accurate than Serie A forwards

Now that we have both goals and shots data, we can take a look at accuracy. A good measure of accuracy is the number of attempts required to score a goal

# Per-league totals of shots and goals ('TotalGoals' is created elsewhere in the file).
summed_columns = ["HomeTeam_Shots", "AwayTeam_Shots", "TotalShots", "TotalGoals"]
tmp = data.groupby("League").agg(dict.fromkeys(summed_columns, "sum"))
# Attempts needed per goal: lower means more efficient finishing.
tmp['ShotsPerGoal'] = tmp['TotalShots'].div(tmp['TotalGoals'])
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1, figsize=(16, 5))
ax = tmp['ShotsPerGoal'].plot(kind='bar')
ax.set_ylim(0, 12)
ax.set_title('Attempts Required Per Goal')
# Label every bar with its value, 10 points above the bar top.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate("%.2f" % height, (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

Forwards in Bundesliga and LaLiga are more accurate as compared to others. Forwards in SerieA are the least effective

Lets check the number of shots on target (scored goals or hit the bar or saved by goal keeper)

plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2, figsize=(16, 5))
# (column of grouped_by_leagues, panel title) for the left and right panel.
on_target_panels = [
    ('HomeTeam_ShotsOnTarget', 'Average number of Shots on Target by Home Teams per match'),
    ('AwayTeam_ShotsOnTarget', 'Average number of Shots on Target by Away Teams per match'),
]
for panel, (column, title) in zip(axes, on_target_panels):
    chart = grouped_by_leagues[column].plot(kind='bar', ax=panel)
    chart.set_ylim(0, 7)
    chart.set_title(title)
    for bar in chart.patches:
        height = bar.get_height()
        chart.annotate("%.2f" % height, (bar.get_x() + bar.get_width() / 2., height),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

EPL has highest number of shots on target for both home and away team

Lets look at the number of fouls committed

plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2, figsize=(16, 5))
# Fouls committed per league: home sides on the left, away sides on the right.
foul_panels = [
    ('HomeTeam_FoulsCommitted', 'Average number of Fouls by Home Teams'),
    ('AwayTeam_FoulsCommitted', 'Average number of Fouls by Away Teams'),
]
for panel, (column, title) in zip(axes, foul_panels):
    chart = grouped_by_leagues[column].plot(kind='bar', ax=panel)
    chart.set_ylim(0, 17)
    chart.set_title(title)
    for bar in chart.patches:
        height = bar.get_height()
        chart.annotate("%.2f" % height, (bar.get_x() + bar.get_width() / 2., height),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

EPL clearly is the most disciplined league with least number of fouls by both Home and Away teams. Away Teams in Bundesliga commit the most number of fouls per match

Let's look at the seasonal trend in the number of fouls

plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2, figsize=(16, 5))

# Same two panels as the per-league fouls chart, but read from grouped_by_seasons.
seasonal_foul_panels = [
    ('HomeTeam_FoulsCommitted', 'Average number of Fouls by Home Teams'),
    ('AwayTeam_FoulsCommitted', 'Average number of Fouls by Away Teams'),
]
for panel, (column, title) in zip(axes, seasonal_foul_panels):
    chart = grouped_by_seasons[column].plot(kind='bar', ax=panel)
    chart.set_ylim(0, 17)
    chart.set_title(title)
    for bar in chart.patches:
        height = bar.get_height()
        chart.annotate("%.2f" % height, (bar.get_x() + bar.get_width() / 2., height),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

The number of fouls is on a steady decline over last 9 years

Lets have a look at card data as well:

# Per-match card totals: by colour (both teams) and by team (both colours).
data['TotalRedCards'] = data['HomeTeam_RedCards'] + data['AwayTeam_RedCards']
data['TotalYellowCards'] = data['HomeTeam_YellowCards'] + data['AwayTeam_YellowCards']
data['TotalHomeTeamCards'] = data['HomeTeam_RedCards'] + data['HomeTeam_YellowCards']
data['TotalAwayTeamCards'] = data['AwayTeam_RedCards'] + data['AwayTeam_YellowCards']
# Recompute the per-league and per-season averages so they include the new
# card columns. numeric_only=True restricts the mean to numeric columns:
# pandas 2.x raises a TypeError when asked to average text columns such as
# the team names, while older versions dropped them silently.
grouped_by_leagues = data.groupby('League').mean(numeric_only=True)
grouped_by_seasons = data.groupby('Season').mean(numeric_only=True)
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2, figsize=(16, 5))

# Left panel: cards shown to home teams; right panel: cards shown to away teams.
card_panels = [
    ('TotalHomeTeamCards', 'Average number of Cards by Home Teams'),
    ('TotalAwayTeamCards', 'Average number of Cards by Away Teams'),
]
for panel, (column, title) in zip(axes, card_panels):
    chart = grouped_by_leagues[column].plot(kind='bar', ax=panel)
    chart.set_ylim(0, 4)
    chart.set_title(title)
    # Write every bar's value 10 points above its top edge.
    for bar in chart.patches:
        height = bar.get_height()
        chart.annotate("%.2f" % height, (bar.get_x() + bar.get_width() / 2., height),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

Clearly, La Liga witnesses the maximum number of cards while EPL sees the minimum number of cards. Away Teams face cards more often than Home Teams

Is there a correlation between number of cards and number of fouls ?

plt.cla()
plt.clf()
fig, ax = plt.subplots(figsize=(14, 8))
# (column of grouped_by_leagues, line colour) for each plotted average.
card_series = [
    ('TotalRedCards', 'r'),
    ('TotalYellowCards', 'y'),
    ('TotalFouls', 'k'),
]
tmp = grouped_by_leagues[[column for column, _ in card_series]]
# Derive the x positions from the data instead of assuming exactly five leagues.
x = np.arange(1, len(tmp) + 1)
labels = list(tmp.index)
for column, colour in card_series:
    # An explicit label is needed so that plt.legend() has entries to show.
    plt.plot(x, tmp[column], color=colour, marker='o', label=column)
plt.legend()
plt.title("Cards vs Fouls across leagues")
plt.xticks(x, labels, rotation='vertical')
plt.show()

Except in EPL, there seems to be a linear correlation between number of cards and number of fouls

Lets look at how number of cards varied over the years:

plt.cla()
plt.clf()
f, axes = plt.subplots(1, 2, figsize=(16, 5))

# (column, panel title, upper y limit); note the two panels use different limits.
seasonal_card_panels = [
    ('TotalHomeTeamCards', 'Average number of Cards by Home Teams', 2.5),
    ('TotalAwayTeamCards', 'Average number of Cards by Away Teams', 3),
]
for panel, (column, title, y_max) in zip(axes, seasonal_card_panels):
    chart = grouped_by_seasons[column].plot(kind='bar', ax=panel)
    chart.set_ylim(0, y_max)
    chart.set_title(title)
    for bar in chart.patches:
        height = bar.get_height()
        chart.annotate("%.2f" % height, (bar.get_x() + bar.get_width() / 2., height),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()


plt.cla()
plt.clf()
fig, ax = plt.subplots(figsize=(14, 8))
# (column of grouped_by_seasons, line colour) for each plotted average.
card_series = [
    ('TotalRedCards', 'r'),
    ('TotalYellowCards', 'y'),
    ('TotalFouls', 'k'),
]
tmp = grouped_by_seasons[[column for column, _ in card_series]]
# Derive the x positions from the data instead of assuming exactly nine seasons.
x = np.arange(1, len(tmp) + 1)
labels = list(tmp.index)
for column, colour in card_series:
    # An explicit label is needed so that plt.legend() has entries to show.
    plt.plot(x, tmp[column], color=colour, marker='o', label=column)
plt.legend()
plt.title("Cards vs Fouls across seasons")
plt.xticks(x, labels, rotation='vertical')
plt.show()

Clearly, while the average number of fouls per game has come down significantly, the average number of cards hasn't changed much. This indicates that referees have become stricter over the years

Let's look at the most successful teams (the teams with the most wins in every league)

A. EPL
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1, figsize=(16, 5))
# How often each team appears in the 'Winner' column of the EPL frame.
epl_wins = epl['Winner'].value_counts()
ax = epl_wins.plot(kind='bar')
ax.set_ylim(0, 230)
ax.set_title('Number of wins in EPL')
# Print the win count (no decimals) just above each bar.
for bar in ax.patches:
    wins = bar.get_height()
    ax.annotate("%.0f" % wins, (bar.get_x() + bar.get_width() / 2., wins),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

Interestingly, Manchester City (and not Manchester United) has been the most successful team in the EPL since 2010. The top 5 EPL teams (in terms of number of wins) are:
  1. Manchester City
  2. Manchester United
  3. Chelsea
  4. Arsenal
  5. Tottenham Hotspur

B. La Liga

plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1, figsize=(16, 5))
# How often each team appears in the 'Winner' column of the La Liga frame.
laliga_wins = laliga['Winner'].value_counts()
ax = laliga_wins.plot(kind='bar')
ax.set_ylim(0, 260)
ax.set_title('Number of wins in La Liga')
# Print the win count (no decimals) just above each bar.
for bar in ax.patches:
    wins = bar.get_height()
    ax.annotate("%.0f" % wins, (bar.get_x() + bar.get_width() / 2., wins),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

FC Barcelona has been the most successful team in La Liga over last 10 years. The top5 teams in La Liga have been
  1. FC Barcelona
  2. Real Madrid
  3. Atletico Madrid
  4. Sevilla
  5. Valencia

C. Serie A
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1, figsize=(16, 5))
# How often each team appears in the 'Winner' column of the Serie A frame.
serie_a_wins = serieA['Winner'].value_counts()
ax = serie_a_wins.plot(kind='bar')
ax.set_ylim(0, 240)
ax.set_title('Number of wins in Serie A')
# Print the win count (no decimals) just above each bar.
for bar in ax.patches:
    wins = bar.get_height()
    ax.annotate("%.0f" % wins, (bar.get_x() + bar.get_width() / 2., wins),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

Juventus has been the most successful team in Serie A over the last 10 years. It is interesting to see AC Milan and Inter Milan at no. 5 and 6, reflecting their current decline. The top 5 teams in Serie A have been:
  1. Juventus
  2. Napoli
  3. Roma
  4. Lazio
  5. AC Milan

D. Bundesliga
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1, figsize=(16, 5))
# How often each team appears in the 'Winner' column of the Bundesliga frame.
bundesliga_wins = bundesliga['Winner'].value_counts()
ax = bundesliga_wins.plot(kind='bar')
ax.set_ylim(0, 240)
ax.set_title('Number of wins in Bundesliga')
# Print the win count (no decimals) just above each bar.
for bar in ax.patches:
    wins = bar.get_height()
    ax.annotate("%.0f" % wins, (bar.get_x() + bar.get_width() / 2., wins),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

As expected, Bayern Munich emerges as the top team in the Bundesliga over the last 10 years. The top 5 teams are:
  1. Bayern Munich
  2. Borussia Dortmund
  3. Bayer Leverkusen
  4. Schalke04
  5. Borussia Mönchengladbach

E. Ligue One
plt.cla()
plt.clf()
f, axes = plt.subplots(1, 1, figsize=(16, 5))
# How often each team appears in the 'Winner' column of the Ligue 1 frame.
ligue1_wins = ligue1['Winner'].value_counts()
ax = ligue1_wins.plot(kind='bar')
ax.set_ylim(0, 240)
ax.set_title('Number of wins in Ligue 1')
# Print the win count (no decimals) just above each bar.
for bar in ax.patches:
    wins = bar.get_height()
    ax.annotate("%.0f" % wins, (bar.get_x() + bar.get_width() / 2., wins),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
plt.show()

As expected, Paris Saint Germain emerges as the top team of Ligue 1. The top 5 teams in Ligue 1 have been:
  1. Paris Saint Germain
  2. Lyon
  3. Marseille
  4. Lille
  5. St Etienne
Overall, these are the 15 teams with the most wins and the 15 teams with the most losses across all leagues

# Two-column frames: "index" holds the team name, "Winner"/"Loser" holds how
# often that team won/lost, most frequent first. rename_axis + reset_index(name=...)
# pins these column names explicitly; a bare value_counts().reset_index() names
# them differently in pandas 2.x, which would break the "index" lookups below.
winner = data["Winner"].value_counts().rename_axis("index").reset_index(name="Winner")
loser = data["Loser"].value_counts().rename_axis("index").reset_index(name="Loser")
plt.cla()
plt.clf()
plt.figure(figsize=(13, 10))
plt.subplot(121)
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
ax = sns.barplot(x=winner["Winner"][:15], y=winner["index"][:15],
                 palette="Set1", linewidth=1, edgecolor="k")
plt.ylabel('')
plt.title("Top Teams in terms of # of Wins")
# annotate() is a helper defined elsewhere in this file.
annotate(ax, winner["Winner"], 15, "black")
plt.subplot(122)
ax = sns.barplot(x=loser["Loser"][:15], y=loser["index"][:15],
                 palette="Set2", linewidth=1, edgecolor="k")
plt.ylabel('')
plt.title("Worst Teams in terms of # of losses")
annotate(ax, loser["Loser"], 15, "black")
plt.show()

So the two La Liga giants - Barcelona and Real Madrid - have won the most matches in the last 9 seasons
But winning isn't everything - a draw (especially an away draw) is much more valuable than a loss
Let's implement a points system :
3 points for a win
1 point for a draw
0 point for a loss

# League points per match: 3 for a win, 0 for a loss, 1 for anything else
# (i.e. a draw). Computed column-wise with np.select instead of a row-by-row
# apply(); the mapping is the same, including the fall-through to 1 for any
# result code other than 'H' or 'A'.
full_time_result = data['FullTime_Result']
home_won = full_time_result == 'H'
away_won = full_time_result == 'A'

data['HomeTeam_Points'] = np.select([home_won, away_won], [3, 0], default=1)
data['AwayTeam_Points'] = np.select([away_won, home_won], [3, 0], default=1)

Now lets have a look at top teams on the basis of points

# Put home and away points under common column names so they can be combined.
home = data[['HomeTeam', 'HomeTeam_Points']].rename(
    columns={'HomeTeam': 'Team', 'HomeTeam_Points': 'Point'})
away = data[['AwayTeam', 'AwayTeam_Points']].rename(
    columns={'AwayTeam': 'Team', 'AwayTeam_Points': 'Point'})
# .sum() rather than .agg(sum): passing the builtin to agg is deprecated in
# recent pandas.
home = home.groupby('Team').sum()
away = away.groupby('Team').sum()
# fill_value=0 keeps a team that appears on only one side from turning into NaN
# (a plain home + away would do that).
overall = home.add(away, fill_value=0)
# Same totals, ordered best-first and worst-first for the two bar charts.
overall_desc = overall.sort_values(by='Point', ascending=False)
overall_asc = overall.sort_values(by='Point', ascending=True)






plt.cla()
plt.clf()

plt.figure(figsize=(13, 10))
plt.subplot(121)
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
ax = sns.barplot(x=overall_desc["Point"][:15], y=overall_desc.index[:15],
                 palette="Set1", linewidth=1, edgecolor="k")
plt.ylabel('')
plt.title("Top Teams in terms of Points")

# annotate() is a helper defined elsewhere in this file.
annotate(ax, overall_desc["Point"], 15, "black")

plt.subplot(122)
ax = sns.barplot(x=overall_asc["Point"][:15], y=overall_asc.index[:15],
                 palette="Set2", linewidth=1, edgecolor="k")
plt.ylabel('')
plt.title("Bottom Teams in terms of Points")

annotate(ax, overall_asc["Point"], 15, "black")
plt.show()

We still have FC Barcelona and Real Madrid at the top
Lets also look at the top performers in every league and season based on points. We'll exclude current ongoing season as it has incomplete data

# Leave out the 2018-2019 season.
past = data[data['Season'] != '2018-2019']
group_keys = ['Team', 'Season', 'League']
# Put home and away points under common column names so they can be combined.
home = past[['HomeTeam', 'HomeTeam_Points', 'Season', 'League']].rename(
    columns={'HomeTeam': 'Team', 'HomeTeam_Points': 'Point'})
away = past[['AwayTeam', 'AwayTeam_Points', 'Season', 'League']].rename(
    columns={'AwayTeam': 'Team', 'AwayTeam_Points': 'Point'})
# .sum() rather than .agg(sum): passing the builtin to agg is deprecated in
# recent pandas.
home = home.groupby(group_keys).sum()
away = away.groupby(group_keys).sum()
# fill_value=0 keeps a (team, season, league) that appears on only one side
# from turning into NaN.
overall = home.add(away, fill_value=0)
overall.reset_index(inplace=True)
# Sort so the highest-scoring team comes first within each league and season,
# then keep only that first row per (season, league).
overall = overall.sort_values(by=['League', 'Season', 'Point'], ascending=False)
overall = overall.groupby(['Season', 'League']).head(1)


def _league_leaders(leaders, league_name):
    """Return the rows of *leaders* for one league, sorted by season then points, descending."""
    in_league = leaders[leaders['League'] == league_name]
    return in_league.sort_values(by=['Season', 'Point'], ascending=False)


epl = _league_leaders(overall, 'EPL')
laliga = _league_leaders(overall, 'LaLiga')
serieA = _league_leaders(overall, 'SerieA')
bundesliga = _league_leaders(overall, 'Bundesliga')
ligue1 = _league_leaders(overall, 'Ligue1')


def plot_league(league, title, position, n_seasons=8):
    """Draw one league's per-season leader points as a bar chart in a subplot.

    Args:
        league: DataFrame with "Point", "Season" and "Team" columns; only the
            first ``n_seasons`` rows are plotted.
        title: Title shown above the subplot.
        position: Subplot position passed straight to ``plt.subplot``
            (e.g. ``621``).
        n_seasons: Number of leading rows to plot and annotate. Defaults to 8,
            the value that used to be hard-coded.
    """
    plt.subplot(position)
    # x/y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
    ax = sns.barplot(x=league["Point"][:n_seasons], y=league["Season"][:n_seasons],
                     palette="summer", edgecolor="k")
    # annotate() is a helper defined elsewhere in this file; it receives the
    # team names and the same row count that was plotted.
    annotate(ax, league["Team"], n_seasons, "black")
    plt.title(title)
    # Suppress the axis labels seaborn derives from the Series names.
    plt.xlabel("")
    plt.ylabel("")


plt.cla()
plt.clf()
plt.figure(figsize=(13, 28))
# (frame, subplot title, position in the 6x2 subplot grid), drawn in this order.
league_panels = [
    (epl, "English Premier League", 621),
    (laliga, "La Liga", 622),
    (serieA, "Serie A", 623),
    (bundesliga, "Bundesliga", 624),
    (ligue1, "Ligue One", 625),
]
for league_frame, league_title, grid_position in league_panels:
    plot_league(league_frame, league_title, grid_position)
plt.show()

In most leagues it's generally one or two clubs dominating the league. The only exception is the EPL, where we had 4 winners in the last 8 years
So from feature analysis we can conclude the following :
  1. EPL is the most volatile league in terms of league qualification. Only 35 % of teams have been consistently present in last 10 years
  2. There's a significant home advantage in every league. Away performance is improving in general but not at a very rapid pace
  3. EPL and La Liga produce more goals compared to other leagues
  4. Both Home and Away teams produce more goals during second half
  5. Forwards in La Liga and Bundesliga are the most effective. Forwards in Serie A are the least effective
  6. EPL clearly is the most disciplined and least violent league with least number of fouls and cards. La Liga leads in number of red and yellow cards
  7. The average number of fouls has fallen across all leagues over the last 10 years. However, the average number of cards hasn't
  8. EPL forwards land maximum number of shots on target. However, EPL goalkeepers save maximum number of goals as well











comments powered by Disqus