DWM Mini Project: Soccer Data Analysis
DWM Mini Project: Soccer Data Analysis
In [ ]: #Import libraries
import numpy as np
import pandas as pd
import sqlite3
from datetime import timedelta
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import mpl_toolkits
from mpl_toolkits.basemap import Basemap
import folium
import folium.plugins
from matplotlib import animation,rc
import io
import base64
import itertools
from subprocess import check_output
In [ ]: countries.head()
leagues.head()
matches.head()
teams.head()
player.head()
player_attributes.head()
sequence.head()
team_attributes.head()
In [29]: matches.head()
Out[29]:
id country_id league_id season stage date match_api_id home_team_api_id away_te
2008-
0 1 1 1 2008/2009 1 08-17 492473 9987
00:00:00
2008-
1 2 1 1 2008/2009 1 08-16 492474 10000
00:00:00
2008-
2 3 1 1 2008/2009 1 08-16 492475 9984
00:00:00
2008-
3 4 1 1 2008/2009 1 08-17 492476 9991
00:00:00
2008-
4 5 1 1 2008/2009 1 08-16 492477 7947
00:00:00
matches_new = matches_new.drop("id",axis=1)
Out[31]: country_id 0
league_id 0
season 0
stage 0
date 0
match_api_id 0
home_team_api_id 0
away_team_api_id 0
home_team_goal 0
away_team_goal 0
country 0
league 0
dtype: int64
Out[32]: country_id 11
league_id 11
season 8
stage 38
date 1694
match_api_id 25979
home_team_api_id 299
away_team_api_id 299
home_team_goal 11
away_team_goal 10
country 11
league 11
dtype: int64
plt.figure(figsize=(12,12))
country = list(country_info["country_x"].unique())
c = sns.color_palette("Set1",11)
label = country
def function(country,c,label):
lat = list(country_info[country_info["country_x"] == country].latitude)
lon = list(country_info[country_info["country_x"] == country].longitude)
x,y = m3(lon,lat)
m3.plot(x,y,"go",markersize=15,color=j,alpha=.8,label=i)
m3.bluemarble(scale=0.5)
plt.legend(loc="center right",frameon=True,prop={"size":15}).get_frame().set_face
plt.title("MATCHES PLAYED IN COUNTRIES")
plt.show()
Clipping input data to the valid range for imshow with RGB data ([0..1] for flo
ats or [0..255] for integers).
plt.figure(figsize=(15,10))
m.drawmapboundary(fill_color="skyblue",color="k",linewidth=2)
m.drawcoastlines(linewidth=2)
m.drawcountries(linewidth=2,color="grey")
m.fillcontinents(color="gold",alpha=1,lake_color="b")
leag = list(country_info["league"].unique())
c = sns.color_palette("Set1",11)
lat = list(country_info[country_info["league"] == leag].latitude)
lon = list(country_info[country_info["league"] == leag].longitude)
x,y = m(lon,lat)
def function(leag,c):
lat = list(country_info[country_info["league"] == leag].latitude)
lon = list(country_info[country_info["league"] == leag].longitude)
x,y = m(lon,lat)
m.plot(x,y,"go",markersize=40,color=j,alpha=.8,linewidth=20)
In [39]: plt.figure(figsize=(8,8))
ax = sns.countplot(y = data["league"],
order=data["league"].value_counts().index,
linewidth = 1,
edgecolor = "k"*data["league"].nunique()
)
for i,j in enumerate(data["league"].value_counts().values):
ax.text(.7,i,j,weight = "bold")
plt.title("Matches by league")
plt.show()
In [40]: data.groupby("league").agg({"home_team_goal":"sum","away_team_goal":"sum"}).plot
In [42]: plt.figure(figsize=(10,10))
sns.countplot(y = data["season"],hue=data["league"],
palette=["r","g","b","c","lime","m","y","k","gold","orange"])
plt.title("MATCHES PLAYED IN EACH LEAGUE BY SEASON")
plt.show()
In [46]: x = h_t
x = x.rename(columns={'home_team_lname':"team", 'home_team_goal':"goals"})
y = a_t
y = y.rename(columns={'away_team_lname':"team", 'away_team_goal':"goals"})
goals = pd.concat([x,y])
goals = goals.groupby("team")["goals"].sum().reset_index().sort_values(by = "goal
plt.figure(figsize=(9,14))
ax = sns.barplot(x="goals",y="team",
data=goals[:30],palette="rainbow",
linewidth = 1,edgecolor = "k"*30)
for i,j in enumerate(goals["goals"][:30]):
ax.text(.3,i,j,weight="bold",color = "k",fontsize =12)
plt.title("Teams with highest total goals ")
plt.show()
In [47]: x = data.groupby("home_team_lname")["match_api_id"].count().reset_index()
x = x.rename(columns={"home_team_lname":"team"})
y = data.groupby("away_team_lname")["match_api_id"].count().reset_index()
y = y.rename(columns={"away_team_lname":"team"})
xy = pd.concat([x,y],axis=0)
match_teams = xy.groupby("team")["match_api_id"].sum().reset_index().sort_values
match_teams = match_teams.rename(columns={"match_api_id":"matches_played"})
match_teams[:20]
Out[47]:
team matches_played
51 Chelsea 304
83 FC Lorient 304
75 FC Barcelona 304
49 Celtic 304
70 Everton 304
In [49]: plt.figure(figsize=(12,6))
sns.kdeplot(data["home_team_goal"],shade=True,
color="b",label="home goals")
sns.kdeplot(data["away_team_goal"],shade=True,
color="r",label="away goals")
plt.axvline(data["home_team_goal"].mean(),linestyle = "dashed",
color="b",label="home goals mean")
plt.axvline(data["away_team_goal"].mean(),linestyle = "dashed",
color="r",label="away goals mean")
plt.legend(loc="best",prop = {"size" : 12})
plt.title("DISTRIBUTION OF HOME AND AWAY GOALS")
plt.xlabel("goals")
plt.show()
In [50]: x = data.groupby(["home_team_lname","league"]).agg({"match_api_id":"count","home_
y = data.groupby(["away_team_lname","league"]).agg({"match_api_id":"count","away_
x = x.rename(columns={'home_team_lname':"team", 'match_api_id':"matches", 'home_t
y = y.rename(columns={'away_team_lname':"team", 'match_api_id':"matches", 'away_t
xy = pd.concat([x,y])
xy = xy.groupby(["team","league"])[["matches","goals"]].sum().reset_index()
xy = xy.sort_values(by="goals",ascending=False)
plt.figure(figsize=(13,6))
c = ["r","g","b","m","y","yellow","c","orange","grey","lime","white"]
lg = xy["league"].unique()
for i,j,k in itertools.zip_longest(lg,range(len(lg)),c):
plt.scatter("matches","goals",data=xy[xy["league"] == i],label=[i],s=100,alph
plt.legend(loc="best")
plt.xlabel("MATCHES")
plt.ylabel("GOALS SCORED")
In [61]: plt.figure(figsize=(8,8))
data["outcome_side"].value_counts().plot.pie(autopct = "%1.0f%%",
colors =sns.color_palette("rainbow",
wedgeprops = {"linewidth":2,"edgecol
my_circ = plt.Circle((0,0),.7,color = "white")
plt.gca().add_artist(my_circ)
plt.title("PROPORTION OF GAME OUTCOMES")
plt.show()
plt.subplot(122)
ax = sns.barplot(lost["lost"][:30],lost["index"][:30],
palette="Set2",
linewidth = 1,edgecolor = "k"*30)
plt.title(" TOP TEAMS that Lost")
plt.subplots_adjust(wspace = .3)
plt.ylabel("")
for i,j in enumerate(lost["lost"][:30]):
ax.text(.7,i,j,color = "black",weight = "bold")
In [64]: f = f.sort_values(by="goals",ascending=False)
f_new = f.copy()
f_new.index = f_new.team
f_new[["win","lost","draw"]][:20].plot(kind = "bar",figsize=(13,5),
stacked =True,linewidth = 1,
edgecolor = "k"*20
)
plt.legend(loc="best",prop = {"size" : 13})
plt.title("PERFORMANCE BY TOP TEAMS")
plt.ylabel("matches played")
plt.show()
In [65]: x = pd.DataFrame(data.groupby(["league","win"])["win"].count())
x = x.rename(columns={"win":"team"}).reset_index()
x = x.rename(columns={"win":"team","team":"win"})
x = x.sort_values(by="win",ascending=False)
x = x[x["team"] != "DRAW"]
x = x.drop_duplicates(subset=["league"],keep="first")
plt.figure(figsize=(8,7))
ax =sns.barplot(x["win"],x["league"],palette="cool",
linewidth = 1 ,edgecolor = "k"*10)
for i,j in enumerate(x["team"]):
ax.text(.7,i,j,weight = "bold",fontsize = 12)
plt.title("TOP TEAMS BY LEAGUES")
plt.show()
In [66]: data.groupby(["league"]).agg({"match_api_id":"count","total_goal":"sum"}).plot(ki
st
fi
li
ed
)
plt.title("# MATCHES PLAYED IN EACH LEAGUE VS TOTAL GOALS SCORED")
plt.show()
In [67]: plt.figure(figsize=(7,15))
plt.subplot(211)
agg = data.groupby(["league"]).agg({"match_api_id":"count","total_goal":"sum"})
agg["match_api_id"].plot.pie(colors=sns.color_palette("seismic",10),
autopct="%1.0f%%",
wedgeprops={"linewidth":2,"edgecolor":"white"})
plt.ylabel("")
my_circ = plt.Circle((0,0),.7,color ="white")
plt.gca().add_artist(my_circ)
plt.title("PROPORTION OF MATCHES PLAYED IN LEAGUES")
plt.subplot(212)
agg["total_goal"].plot.pie(colors=sns.color_palette("seismic",10),
autopct="%1.0f%%",
wedgeprops={"linewidth":2,"edgecolor":"white"})
plt.ylabel("")
my_circ = plt.Circle((0,0),.7,color ="white")
plt.gca().add_artist(my_circ)
plt.title("PROPORTION OF GOALS SCORED IN LEAGUES")
plt.show()
In [69]: i = data["win"].value_counts()[1:25].index
t= pd.pivot_table(index="home_team_lname",columns="season",values="home_team_goal
data=data,aggfunc="sum")
t=t[t.index.isin(i)]
t.plot(kind="barh",stacked=True,figsize=(10,10),colors=sns.color_palette("prism",
plt.title("HOME GOALS SCORED BY TOP TEAMS BY SEASON")
plt.show()
In [70]: i = data["win"].value_counts()[1:25].index
t= pd.pivot_table(index="away_team_lname",columns="season",
values="away_team_goal",data=data,aggfunc="sum")
t=t[t.index.isin(i)]
t.plot(kind="barh",stacked=True,figsize=(10,10),colors=sns.color_palette("prism",
plt.title("HOME GOALS SCORED BY TOP TEAMS BY SEASON")
plt.show()
In [71]: nw = data[["season","league","win"]]
nw["team"] = nw["win"]
nw = nw.groupby(["season","league","team"])["win"].count().reset_index().sort_val
nw = nw[nw["team"] != "DRAW"]
nw = nw.drop_duplicates(subset=["season","league"],keep="first").sort_values(by=
plt.figure(figsize=(13,28))
plt.subplot(621)
lg = nw[nw["league"] == "Belgium Jupiler League"]
ax = sns.barplot(lg["win"],lg["season"],palette="cool",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title("Belgium Jupiler League")
plt.xlabel("")
plt.ylabel("")
plt.subplot(622)
lg = nw[nw["league"] == "England Premier League"]
ax = sns.barplot(lg["win"],lg["season"],palette="magma",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold",color="white")
plt.title("England Premier League")
plt.xlabel("")
plt.ylabel("")
plt.subplot(623)
lg = nw[nw["league"] == 'Spain LIGA BBVA']
ax = sns.barplot(lg["win"],lg["season"],palette="rainbow",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title('Spain LIGA BBVA')
plt.xlabel("")
plt.ylabel("")
plt.subplot(624)
lg = nw[nw["league"] == 'France Ligue 1']
ax = sns.barplot(lg["win"],lg["season"],palette="summer",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold",color = "white")
plt.title('France Ligue 1')
plt.xlabel("")
plt.ylabel("")
plt.subplot(625)
lg = nw[nw["league"] == 'Germany 1. Bundesliga']
ax = sns.barplot(lg["win"],lg["season"],palette="winter",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title('Germany 1. Bundesliga')
plt.xlabel("")
plt.ylabel("")
plt.subplot(626)
lg = nw[nw["league"] == 'Italy Serie A']
ax = sns.barplot(lg["win"],lg["season"],palette="husl",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title('Italy Serie A')
plt.xlabel("")
plt.ylabel("")
plt.show()
In [72]: plt.figure(figsize=(13,28))
plt.subplot(621)
lg = nw[nw["league"] == 'Netherlands Eredivisie']
ax = sns.barplot(lg["win"],lg["season"],palette="Blues",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title('Netherlands Eredivisie')
plt.xlabel("")
plt.ylabel("")
plt.subplot(622)
lg = nw[nw["league"] == 'Poland Ekstraklasa']
ax = sns.barplot(lg["win"],lg["season"],palette="winter",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title('Poland Ekstraklasa')
plt.xlabel("")
plt.ylabel("")
plt.subplot(623)
lg = nw[nw["league"] == 'Portugal Liga ZON Sagres']
ax = sns.barplot(lg["win"],lg["season"],palette="rainbow",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title('Portugal Liga ZON Sagres')
plt.xlabel("")
plt.ylabel("")
plt.subplot(624)
lg = nw[nw["league"] == 'Scotland Premier League']
ax = sns.barplot(lg["win"],lg["season"],palette="Greens",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title('Scotland Premier League')
plt.xlabel("")
plt.ylabel("")
plt.subplot(625)
lg = nw[nw["league"] == 'Switzerland Super League']
ax = sns.barplot(lg["win"],lg["season"],palette="cool",
linewidth = 1 ,edgecolor = "k"*lg["season"].nunique())
for i,j in enumerate(lg["team"]):
ax.text(.7,i,j,weight = "bold")
plt.title('Switzerland Super League')
plt.xlabel("")
plt.ylabel("")
plt.show()
In [73]: plt.figure(figsize=(8,14))
ax = sns.countplot(y=nw["team"],order=nw["team"].value_counts().index,
palette="plasma",linewidth = 1 ,edgecolor = "k"*nw["team"].nun
plt.title("TOP LEAGUE WINNERS")
for i,j in enumerate(nw["team"].value_counts().values):
ax.text(.2,i,j,color = "white",weight="bold")
In [74]: player
player["weight_kg"] = player["weight"] * 0.453592
player["height_m"] = player["height"] / 100
player["bmi"] = player["weight_kg"]/(player["height_m"] * player["height_m"
player["year"] = pd.DatetimeIndex(player["birthday"]).year
player["age"] = 2018 - player["year"]
cols = ["bmi","weight_kg","height_m","age"]
length = len(cols)
c = ["b","r","g","c"]
plt.figure(figsize=(13,17))
for i,j,k in itertools.zip_longest(cols,range(length),c):
plt.subplot(4,1,j+1)
player[i].plot(color = k ,linewidth =.2,label = i)
plt.axhline(player[i].mean(),color = "k",linestyle = "dashed",label="mean")
plt.legend(loc="best")
plt.subplots_adjust(hspace = .3)
plt.xlabel("player index")
plt.ylabel(i)
plt.title(i)
PLAYER ATTRIBUTES
===============================================================================
====================
Oldest Player is Alberto Fontana of age 51 years
Youngest Players are ['Alban Lafont', 'Gianluigi Donnarumma', 'Jonathan Leko']
of age 19 years
Tallest Player is Kristof van Hout of height 2.0828 meters
Shortest Player is Juan Quero of height 1.5748 meters
Player with highest weight are ['Kristof van Hout', 'Tim Wiese'] of height 110.
222856 kilograms
Player with lowest weight is Juan Quero of height 53.070264 kilograms
Player with Highest Body Mass Index is Sylvan Ebanks-Blake of 30.86570247534958
6 kg/m2
Player with lowest Body Mass Index is John Stewart of 17.58956425144406 kg/m2
In [ ]: