# Setting up
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the old names, so pick whichever name this installation
# provides and fall back to the default style if neither is available.
plt.style.use(next(
    (style_name
     for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style_name in plt.style.available),
    'default',
))
import itertools
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from
# pandas/matplotlib/statsmodels are hidden as well.
import warnings
warnings.filterwarnings("ignore")
# Emit a small HTML/JavaScript snippet that hides the notebook's input cells
# when the page loads and adds a button to toggle them back on and off.
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Sports is an integral part of nearly all cultures. One of the sports which has lots of fans in the world is soccer, which we will call football throughout this paper. For years individuals have tried to use statistics to figure out what makes teams win, or to try to find out if their favorite teams are the best. Razali suggests that "the research for predicting the results of football matches outcome started as early as 1977 by Stafani R". The English Premier League (EPL), is one of the most popular and largest leagues in the world. In 2017, there were 5 clubs from EPL in top 10 teams by revenue.
In order to gain an understanding of what makes teams win at football, we will be exploring a few research questions:
In answering these questions, we will be working with data from all England Premier League seasons from the 2014/2015 season to the current 2018/2019 season provided by Football Data. These datasets contain data pertaining to the game itself as well as a lot of bets associated with the game.
To answer these research questions, we selected features that would affect a result of game. There are no null/NAN value on our dataset. Based on our knowledge, we choose following features:
- HomeTeam - Name of the home team
- AwayTeam - Name of the away team
- FTR - Full time result (Home win, Away win, Draw)
- FTHG - Full time home team goals
- FTAG - Full time away team goals
- HS - Home team shots
- AS - Away team shots
- HST - Home team shots on target
- AST - Away team shots on target
- HSGR - Home team shots goal ratio (calculated)
- ASGR - Away team shots goal ratio (calculated)

We also include the average betting odds from six companies: Bet365, Bet&Win, Interwetten, Pinnacle, VC Bet, and William Hill. We selected these companies because they have valid data in every season. The outcome with the lowest betting odds is the one considered most likely; for example, if the odds for a draw are the lowest, people expect the game to end in a draw. The odds also determine the payout: if the betting odds for the home team are 1.13 and the home team wins, each bettor receives 1.13 times the amount they bet. These are the resulting variables:
- odd_home - Average betting odds for the home team
- odd_draw - Average betting odds for a draw
- odd_away - Average betting odds for the away team

# Load the data frames for the previous four seasons and the current season.
# One raw data frame per season, read straight from the CSV exports.
df_1415 = pd.read_csv('./data/1415.csv')
df_1516 = pd.read_csv('./data/1516.csv')
df_1617 = pd.read_csv('./data/1617.csv')
df_1718 = pd.read_csv('./data/1718.csv')
df_1819 = pd.read_csv('./data/1819.csv')
# Stack all seasons into one frame. pd.concat replaces the chained
# DataFrame.append calls (append was removed in pandas 2.0); as before, each
# season keeps its own row labels, so the combined index is not unique.
df_total = pd.concat([df_1415, df_1516, df_1617, df_1718, df_1819])
df_total.head()
# Clean data function
def clean_data(df):
    """Select the match columns used in the analysis and add derived features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw season data. Must contain the match columns (HomeTeam, AwayTeam,
        FTHG, FTAG, FTR, HS, AS, HST, AST) and the home/draw/away odds columns
        of the six bookmakers (B365*, BW*, IW*, PS*, VC*, WH*).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the match columns plus:
        HSGR / ASGR (goals divided by shots for the home / away team) and
        odd_home / odd_draw / odd_away (row-wise mean of the six bookmakers'
        odds).

    Notes
    -----
    Rows containing NaN are kept. The previous implementation called
    ``data.dropna()`` without using the result, so it never removed anything;
    that row-preserving behaviour is retained on purpose because callers trim
    rows by position (``tail(1)``) and ``engineer_features`` is run on the
    returned frame's index.
    """
    # Explicit copy so the new columns are written to our own frame rather
    # than to a slice of the caller's data.
    data = df[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
               'HS', 'AS', 'HST', 'AST']].copy()

    data['HSGR'] = data['FTHG'] / data['HS']
    data['ASGR'] = data['FTAG'] / data['AS']
    # Goals with zero recorded shots divide to +/-inf; treat that ratio as 0.
    # (0 goals / 0 shots yields NaN, which is left untouched.)
    data = data.replace([np.inf, -np.inf], 0)

    data['odd_home'] = df[['B365H', 'BWH', 'IWH', 'PSH', 'VCH', 'WHH']].mean(axis=1)
    data['odd_draw'] = df[['B365D', 'BWD', 'IWD', 'PSD', 'VCD', 'WHD']].mean(axis=1)
    data['odd_away'] = df[['B365A', 'BWA', 'IWA', 'PSA', 'VCA', 'WHA']].mean(axis=1)
    return data
# Replace the combined raw frame with its cleaned version: clean_data keeps
# the selected match columns and adds the shot/goal ratios and the average
# home/draw/away odds. The last line previews the first rows.
df_total = clean_data(df_total)
df_total.head()
For RQ2, we realized it was better to create running metrics that looks at teams' previous games, up to the game being observed, rather than using the statistics of individual games. This is because by the time a game's statistics have been finalized, the betters would have already locked in their odds, meaning we would be making predictions based on values that would not have mattered anymore.
To solve this concern, we turned toward feature engineering, creating variables that represented the statistics of past games in the season to predict the odds of a current game. This approach would allow us to make more relevant predictions. Below is the our prepared dataset for models to answer RQ2. All variables below are updated every game played by the home and away team (e.g. they represent their respective statistics for all games before the current).
- H_W - Home wins
- H_WR - Home win rate
- H_avg_diff - Home team's average score margin over its opponents
- A_W - Away wins
- A_WR - Away win rate
- A_avg_diff - Away team's average score margin over its opponents

We chose to show rows that are not at the top of the dataframe because, when a team plays its first game, it has no previous statistics and every engineered value is zero. The rows shown here are a better representation of the majority of the data.
def _form_so_far(tally):
    """Turn a team's running tally into (games, win points, win rate, mean goal diff)."""
    games, points, diff_sum, known = tally
    if games == 0:
        # No earlier game: every statistic defaults to zero.
        return 0, 0.0, 0.0, 0.0
    mean_diff = diff_sum / known if known else float('nan')
    return games, points, points / games, mean_diff


def _record_game(tallies, team, diff):
    """Add one finished game (goal difference from `team`'s point of view) to `tallies`."""
    if pd.isna(team):
        # Rows without a team name never contribute to anyone's history.
        return
    games, points, diff_sum, known = tallies.get(team, (0, 0.0, 0.0, 0))
    if diff > 0:
        points += 1.0
    elif diff == 0:
        points += 0.5
    if not pd.isna(diff):
        diff_sum += diff
        known += 1
    tallies[team] = (games + 1, points, diff_sum, known)


def engineer_features(df):
    """Add running "form" statistics for the home and away team of every match.

    For each row, the statistics describe only the games that appear EARLIER
    in `df` (row order is taken as chronological order), so the current
    game's own result never leaks into its features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain HomeTeam, AwayTeam, FTHG, FTAG, HS and AS.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` (the input is not modified) with these added columns:
        goals_h_a (FTHG - FTAG), total_h_a (HS - AS), and for the home (H_)
        and away (A_) team respectively:
        *_GT        games played so far (home and away games combined),
        *_W         wins so far, where a draw counts as half a win,
        *_WR        *_W divided by *_GT (0 before the first game),
        *_avg_diff  mean goal difference from that team's point of view
                    (0 before the first game).

    Notes
    -----
    The histories are accumulated in a single pass over the rows, instead of
    re-filtering the frame for every match and joining the pieces with
    ``Series.append`` (removed in pandas 2.0). Rows are walked by position,
    so the frame no longer needs a 0..n-1 integer index.
    """
    df = df.copy()
    df['goals_h_a'] = df['FTHG'] - df['FTAG']
    df['total_h_a'] = df['HS'] - df['AS']

    stat_suffixes = ('GT', 'W', 'WR', 'avg_diff')
    features = {side + '_' + suffix: []
                for side in ('H', 'A') for suffix in stat_suffixes}
    # team -> (games played, win points, sum of goal diffs, games with a known diff)
    tallies = {}

    for home, away, diff in zip(df['HomeTeam'], df['AwayTeam'], df['goals_h_a']):
        # Read both teams' form BEFORE this game is added to their histories.
        for side, team in (('H', home), ('A', away)):
            form = _form_so_far(tallies.get(team, (0, 0.0, 0.0, 0)))
            for suffix, value in zip(stat_suffixes, form):
                features[side + '_' + suffix].append(value)
        # goals_h_a is home minus away, so flip the sign for the away team.
        _record_game(tallies, home, diff)
        _record_game(tallies, away, -diff)

    # Same column order as before: all home columns, then all away columns.
    for column, values in features.items():
        df[column] = values
    return df
# Clean each season separately so the running statistics restart every season.
data_1415 = clean_data(pd.read_csv('./data/1415.csv'))
# Discard the final row of the 14/15 file before engineering features
# (presumably a blank trailing row in that CSV -- TODO confirm).
data_1415.drop(data_1415.tail(1).index, inplace=True)
data_1516 = clean_data(pd.read_csv('./data/1516.csv'))
data_1617 = clean_data(pd.read_csv('./data/1617.csv'))
data_1718 = clean_data(pd.read_csv('./data/1718.csv'))
data_1819 = clean_data(pd.read_csv('./data/1819.csv'))

# Running per-team statistics for every season.
df_1415_feat_engr = engineer_features(data_1415)
df_1516_feat_engr = engineer_features(data_1516)
df_1617_feat_engr = engineer_features(data_1617)
df_1718_feat_engr = engineer_features(data_1718)
df_1819_feat_engr = engineer_features(data_1819)

# The four completed seasons form one frame; 18/19 is deliberately left out
# of it. pd.concat(ignore_index=True) replaces the chained DataFrame.append
# calls (removed in pandas 2.0) followed by reset_index(drop=True).
df_past_seasons = pd.concat(
    [df_1415_feat_engr, df_1516_feat_engr, df_1617_feat_engr, df_1718_feat_engr],
    ignore_index=True,
)
df_past_seasons = df_past_seasons[['HomeTeam', 'AwayTeam',
                                   'H_W', 'H_WR', 'H_avg_diff',
                                   'A_W', 'A_WR', 'A_avg_diff',
                                   'odd_home', 'odd_draw', 'odd_away']]
# Preview rows from the middle of the data, where the running statistics are
# no longer all zero.
df_past_seasons[150:155]
# Share of matches whose actual result was the outcome with the lowest odds.
def average_betting(df):
    """Return how often the full-time result matched the strict betting favourite.

    A match counts as a hit (1) when its FTR is 'H', 'D' or 'A' and the
    corresponding average odds (odd_home, odd_draw, odd_away) are strictly
    lower than both other odds; every other match, including ties for the
    lowest odds, counts as a miss (0). The return value is the mean of those
    0/1 flags as computed by ``np.mean``.
    """
    def strict_favourite(home_odds, draw_odds, away_odds):
        # The outcome whose odds are strictly the lowest, or None on a tie.
        if home_odds < away_odds and home_odds < draw_odds:
            return 'H'
        if draw_odds < away_odds and draw_odds < home_odds:
            return 'D'
        if away_odds < home_odds and away_odds < draw_odds:
            return 'A'
        return None

    hit_flags = []
    match_rows = zip(df['FTR'], df['odd_home'], df['odd_draw'], df['odd_away'])
    for full_time_result, home_odds, draw_odds, away_odds in match_rows:
        expected = strict_favourite(home_odds, draw_odds, away_odds)
        matched = expected is not None and full_time_result == expected
        hit_flags.append(1 if matched else 0)
    return np.mean(hit_flags)
# Betting accuracy per season, in chronological order (14/15 through 18/19);
# this is the series drawn on the time-trend plot.
time = [
    average_betting(season_data)
    for season_data in (data_1415, data_1516, data_1617, data_1718, data_1819)
]
We expected the accuracy to increase over time, but there is no clear pattern in betting accuracy across the seasons.
# Line chart of the per-season betting accuracy stored in `time`.
plt.plot(
    ["14/15", "15/16", "16/17", "17/18", "18/19"],
    time,
    label='betting accuracy',
)
plt.title('betting accuracy')
plt.xlabel('time')
plt.ylabel('accuracy')
plt.legend()
plt.show()
We compare the average betting odds with the actual results. When the home team wins the game, the probability that the betting odds matched the actual result is around 84%. When the game ends in a draw, the probability that the betting odds matched the actual result is around 0%, since people expect a draw less often than a win or a loss. When the away team wins the game, the probability that the betting odds matched the actual result is around 57%.
# Proportion of matches in which each outcome had the lowest betting odds.
def accuracy(df):
    """Return the share of matches favoured as home win, draw and away win.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns odd_home, odd_draw and odd_away.

    Returns
    -------
    list of float
        ``[home, draw, away]`` proportions. A match is counted as "home" when
        odd_home is strictly the lowest of the three odds, as "draw" when
        odd_draw is strictly the lowest, and as "away" in every other case
        (this includes ties for the lowest odds). An empty frame yields
        ``[0.0, 0.0, 0.0]`` instead of raising ZeroDivisionError.
    """
    total = len(df)
    if total == 0:
        return [0.0, 0.0, 0.0]

    home_favoured = (df['odd_home'] < df['odd_away']) & (df['odd_home'] < df['odd_draw'])
    draw_favoured = (
        ~home_favoured
        & (df['odd_draw'] < df['odd_away'])
        & (df['odd_draw'] < df['odd_home'])
    )
    home = int(home_favoured.sum())
    draw = int(draw_favoured.sum())
    # Everything that is neither a strict home nor a strict draw favourite.
    away = total - home - draw
    return [home / total, draw / total, away / total]
# One pie chart per actual result, showing which outcome the betting odds
# favoured in the matches that ended that way.
for outcome_code, chart_title in (
    ('H', 'Betting odds when Home won the game'),
    ('D', 'Betting odds when draw'),
    ('A', 'Betting odds when Away won the game'),
):
    matches_with_outcome = df_total.loc[df_total['FTR'] == outcome_code]
    plt.pie(accuracy(matches_with_outcome), labels=['Home', 'Draw', 'Away'])
    plt.title(chart_title)
    plt.show()
The charts below show the number of home wins, draws, and away wins in each season. The home team tends to have the advantage.
## get distribution of the result
def getDistResult(data):
    """Count the full-time results in `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an FTR column holding 'H', 'D' or 'A'.

    Returns
    -------
    list of int
        ``[home wins, draws, away wins]``.

    Notes
    -----
    Only rows whose FTR is exactly 'A' are counted as away wins. Previously
    every row that was neither 'H' nor 'D' fell into the away bucket, so rows
    with a missing result (NaN) inflated the away-win count.
    """
    counts = data['FTR'].value_counts()
    return [int(counts.get(code, 0)) for code in ('H', 'D', 'A')]
# Horizontal bar chart of home wins / draws / away wins, one chart per season.
result = ['Home Wins', 'Draw', 'Away Wins']
for season_label, season_frame in (
    ('14/15', df_1415),
    ('15/16', df_1516),
    ('16/17', df_1617),
    ('17/18', df_1718),
    ('18/19', df_1819),
):
    plt.barh(result, getDistResult(season_frame))
    plt.xlabel('Frequency')
    plt.title(season_label + ' Season Result stats')
    plt.show()
# Clean data function from Kangwoo's notebook, extended with running
# per-team goal and shot statistics for the Poisson models.
def _history_summary(goals, shots):
    """Summarise one team's earlier games.

    `goals` and `shots` are the per-game values recorded so far. Returns
    (average goals, most goals, fewest goals, total goals, total shots,
    goals per shot); all of them are zero when no game has been recorded.
    """
    if goals:
        average = np.mean(goals)
        high = float(np.max(goals))
        low = float(np.min(goals))
    else:
        average = high = low = 0
    total_goals = np.sum(goals)
    total_shots = np.sum(shots)
    # Guard the division explicitly instead of dividing by zero and cleaning
    # up with nan_to_num afterwards.
    accuracy = total_goals / total_shots if total_shots > 0 else 0.0
    return average, high, low, total_goals, total_shots, accuracy


def clean_data_poisson(df):
    """Select the match columns and add each team's statistics "so far".

    Rows are processed in order; the statistics attached to a row are built
    only from the rows above it, and the row's own goals and shots are added
    to the teams' histories afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain HomeTeam, AwayTeam, FTHG, FTAG, FTR, HS, AS, HST, AST.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the match columns,
        HSGR / ASGR (goals divided by shots in this game, +/-inf replaced by
        0) and, for the home team (prefix ``Home``) and the away team (prefix
        ``Away``), computed over all of that team's earlier games in `df`:
        *AvgAllTimeSoFar, *HighAllTimeSoFar, *LowAllTimeSoFar (mean, maximum
        and minimum goals per game), *TotalGoals, *TotalShots and
        *TotalAccuracy (total goals divided by total shots, 0 when the team
        has no recorded shots).

    Notes
    -----
    Rows containing NaN are kept: the earlier ``data.dropna()`` calls
    discarded their result and never removed anything, and callers trim rows
    themselves.
    """
    # Explicit copy so the new columns are written to our own frame.
    data = df[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
               'HS', 'AS', 'HST', 'AST']].copy()
    data['HSGR'] = data['FTHG'] / data['HS']
    data['ASGR'] = data['FTAG'] / data['AS']
    data = data.replace([np.inf, -np.inf], 0)

    stat_names = ('AvgAllTimeSoFar', 'HighAllTimeSoFar', 'LowAllTimeSoFar',
                  'TotalGoals', 'TotalShots', 'TotalAccuracy')
    goals_by_team = {}
    shots_by_team = {}
    running = {side: {stat: [] for stat in stat_names}
               for side in ('Home', 'Away')}

    games = zip(data['HomeTeam'], data['AwayTeam'],
                data['FTHG'], data['FTAG'], data['HS'], data['AS'])
    for home_team, away_team, home_goals, away_goals, home_shots, away_shots in games:
        # Summarise both teams BEFORE this game is added to their histories.
        for side, team in (('Home', home_team), ('Away', away_team)):
            summary = _history_summary(goals_by_team.setdefault(team, []),
                                       shots_by_team.setdefault(team, []))
            for stat, value in zip(stat_names, summary):
                running[side][stat].append(value)

        goals_by_team[home_team].append(home_goals)
        goals_by_team[away_team].append(away_goals)
        shots_by_team[home_team].append(home_shots)
        shots_by_team[away_team].append(away_shots)

    # Keep the established column order: the avg/high/low columns for home
    # then away, followed by the total columns for home then away.
    for stat_group in (stat_names[:3], stat_names[3:]):
        for side in ('Home', 'Away'):
            for stat in stat_group:
                data[side + stat] = running[side][stat]
    return data
# Build the Poisson-model data set for each season separately, so the
# "so far" statistics restart at the beginning of every season.
data_1415_poi = clean_data_poisson(pd.read_csv('./data/1415.csv'))
# Discard the final row of the 14/15 data, as is done for the other 14/15
# frame (presumably a blank trailing row in that CSV -- TODO confirm). The
# row is located through this frame's own index instead of the unrelated
# global df_1415.
data_1415_poi.drop(data_1415_poi.tail(1).index, inplace=True)
data_1516_poi = clean_data_poisson(pd.read_csv('./data/1516.csv'))
data_1617_poi = clean_data_poisson(pd.read_csv('./data/1617.csv'))
data_1718_poi = clean_data_poisson(pd.read_csv('./data/1718.csv'))
data_1819_poi = clean_data_poisson(pd.read_csv('./data/1819.csv'))
We attempted to run a Poisson model on the data to predict whether the home team would win, the away team would win, or the game would end in a draw. We encoded the result using sklearn's `LabelEncoder` function. Then we split the data into training and test data. We then plugged it into a generalized linear model with a Poisson family using the following formulas: `FTR ~ HomeTeam + AwayTeam`, `FTR ~ HomeTeam + AwayTeam + HomeAvgAllTimeSoFar + HomeHighAllTimeSoFar + HomeLowAllTimeSoFar + AwayAvgAllTimeSoFar + AwayHighAllTimeSoFar + AwayLowAllTimeSoFar + HomeTotalGoals + HomeTotalShots + HomeTotalAccuracy + AwayTotalGoals + AwayTotalShots + AwayTotalAccuracy`, and `FTR ~ HomeAvgAllTimeSoFar + HomeHighAllTimeSoFar + HomeLowAllTimeSoFar + AwayAvgAllTimeSoFar + AwayHighAllTimeSoFar + AwayLowAllTimeSoFar + HomeTotalGoals + HomeTotalShots + HomeTotalAccuracy + AwayTotalGoals + AwayTotalShots + AwayTotalAccuracy`.
Here are a list of the values we used in these models:
- HomeTeam - Name of the home team
- AwayTeam - Name of the away team
- HomeAvgAllTimeSoFar - Average of all scores from each game the home team has played by that point in time.
- HomeHighAllTimeSoFar - Highest of all scores from each game the home team has played by that point in time.
- HomeLowAllTimeSoFar - Lowest of all scores from each game the home team has played by that point in time.
- AwayAvgAllTimeSoFar - Average of all scores from each game the away team has played by that point in time.
- AwayHighAllTimeSoFar - Highest of all scores from each game the away team has played by that point in time.
- AwayLowAllTimeSoFar - Lowest of all scores from each game the away team has played by that point in time.
- HomeTotalGoals - Total home goals to date
- HomeTotalShots - Total home shots to date
- HomeTotalAccuracy - Home accuracy to date
- AwayTotalGoals - Total away goals to date
- AwayTotalShots - Total away shots to date
- AwayTotalAccuracy - Away accuracy to date

First we ran this model on the separate datasets. At first, we tried out just the home team and away team names as factors in the model, similar (but not identical!) to David Sheehan's study. We received these accuracy scores.
This very simple model seems to be able to guess it more than a third of the time mostly. However, that isn't as good as we would like. It is just slightly better than guessing at random. In 2015-2016, it would have been worse than guessing at random.
We then tried with only the factors we generated and found a dip in performance for most of the years.
Finally, we tried both the home team and away team names as factors combined with our factors, and we saw a slight improvement in most of the scores.
We theorize that 18-19 is so low accuracy because there haven't been as many games played thus far.
We also ran the model on all the combined datasets. Note that while combined, the "All Time" factors are localized to the season the data point came from, meaning there are no "lifetime" data points besides the name of the team (doing so doesn't provide any increase in accuracy). For the model that just had team names as factors, we received an accuracy of 0.387. For the model with just our factors, we received an accuracy of 0.362. For the model with those combined, we received an accuracy of 0.398.
If we take a look at the summary of the model with the team names as factors alongside our factors, we see that some of the factors are significant. In contrast to David Sheehan's study, we didn't receive as many significant p-values because we did not combine `HomeTeam` and `AwayTeam` into a single `team` variable for this specific analysis.
# Encode the full-time result (H, A or D) as integer labels so it can serve
# as the response variable of the model.
encoder_1415 = LabelEncoder().fit(data_1415_poi["FTR"])
data_1415_poi["FTR"] = encoder_1415.transform(data_1415_poi["FTR"])

# Hold out 20% of the 14/15 matches for validation (fixed seed).
train_1415, valid_1415, train_labels_1415, valid_labels_1415 = train_test_split(
    data_1415_poi.drop(columns="FTR"),
    data_1415_poi["FTR"],
    test_size=0.2,
    random_state=123,
)

# The formula interface needs the response in the same frame as the
# predictors, so the labels are attached to the training frame.
train_1415_all = train_1415
train_1415_all["FTR"] = train_labels_1415

# Right-hand side made of the engineered "so far" statistics.
engineered_terms = (
    "HomeAvgAllTimeSoFar + "
    "HomeHighAllTimeSoFar + HomeLowAllTimeSoFar + AwayAvgAllTimeSoFar + "
    "AwayHighAllTimeSoFar + AwayLowAllTimeSoFar + HomeTotalGoals + "
    "HomeTotalShots + HomeTotalAccuracy + AwayTotalGoals + AwayTotalShots + "
    "AwayTotalAccuracy"
)

# Fit three Poisson GLMs in turn (team names + engineered terms, engineered
# terms only, team names only) and score the rounded predictions.
scores = []
for formula in (
    "FTR ~ HomeTeam + AwayTeam + " + engineered_terms,
    "FTR ~ " + engineered_terms,
    "FTR ~ HomeTeam + AwayTeam ",
):
    m = smf.glm(formula=formula,
                data=train_1415_all, family=sm.families.Poisson()).fit()
    pred = np.round(m.predict(valid_1415))
    scores.append(accuracy_score(valid_labels_1415, pred))
a1, a2, a3 = scores
a1, a2, a3
# Encode the full-time result (H, A or D) as integer labels so it can serve
# as the response variable of the model.
encoder_1516 = LabelEncoder().fit(data_1516_poi["FTR"])
data_1516_poi["FTR"] = encoder_1516.transform(data_1516_poi["FTR"])

# Hold out 20% of the 15/16 matches for validation (fixed seed).
train_1516, valid_1516, train_labels_1516, valid_labels_1516 = train_test_split(
    data_1516_poi.drop(columns="FTR"),
    data_1516_poi["FTR"],
    test_size=0.2,
    random_state=123,
)

# The formula interface needs the response in the same frame as the
# predictors, so the labels are attached to the training frame.
train_1516_all = train_1516
train_1516_all["FTR"] = train_labels_1516

# Right-hand side made of the engineered "so far" statistics.
engineered_terms = (
    "HomeAvgAllTimeSoFar + "
    "HomeHighAllTimeSoFar + HomeLowAllTimeSoFar + AwayAvgAllTimeSoFar + "
    "AwayHighAllTimeSoFar + AwayLowAllTimeSoFar + HomeTotalGoals + "
    "HomeTotalShots + HomeTotalAccuracy + AwayTotalGoals + AwayTotalShots + "
    "AwayTotalAccuracy"
)

# Fit three Poisson GLMs in turn (team names + engineered terms, engineered
# terms only, team names only) and score the rounded predictions.
scores = []
for formula in (
    "FTR ~ HomeTeam + AwayTeam + " + engineered_terms,
    "FTR ~ " + engineered_terms,
    "FTR ~ HomeTeam + AwayTeam",
):
    m = smf.glm(formula=formula,
                data=train_1516_all, family=sm.families.Poisson()).fit()
    pred = np.round(m.predict(valid_1516))
    scores.append(accuracy_score(valid_labels_1516, pred))
a1, a2, a3 = scores
a1, a2, a3
# Encode the full-time result (H, A or D) as integer labels so it can serve
# as the response variable of the model.
encoder_1617 = LabelEncoder().fit(data_1617_poi["FTR"])
data_1617_poi["FTR"] = encoder_1617.transform(data_1617_poi["FTR"])

# Hold out 20% of the 16/17 matches for validation (fixed seed).
train_1617, valid_1617, train_labels_1617, valid_labels_1617 = train_test_split(
    data_1617_poi.drop(columns="FTR"),
    data_1617_poi["FTR"],
    test_size=0.2,
    random_state=123,
)

# The formula interface needs the response in the same frame as the
# predictors, so the labels are attached to the training frame.
train_1617_all = train_1617
train_1617_all["FTR"] = train_labels_1617

# Right-hand side made of the engineered "so far" statistics.
engineered_terms = (
    "HomeAvgAllTimeSoFar + "
    "HomeHighAllTimeSoFar + HomeLowAllTimeSoFar + AwayAvgAllTimeSoFar + "
    "AwayHighAllTimeSoFar + AwayLowAllTimeSoFar + HomeTotalGoals + "
    "HomeTotalShots + HomeTotalAccuracy + AwayTotalGoals + AwayTotalShots + "
    "AwayTotalAccuracy"
)

# Fit three Poisson GLMs in turn (team names + engineered terms, engineered
# terms only, team names only) and score the rounded predictions.
scores = []
for formula in (
    "FTR ~ HomeTeam + AwayTeam + " + engineered_terms,
    "FTR ~ " + engineered_terms,
    "FTR ~ HomeTeam + AwayTeam",
):
    m = smf.glm(formula=formula,
                data=train_1617_all, family=sm.families.Poisson()).fit()
    pred = np.round(m.predict(valid_1617))
    scores.append(accuracy_score(valid_labels_1617, pred))
a1, a2, a3 = scores
a1, a2, a3
# Encode the full-time result (H, A or D) as integer labels so it can serve
# as the response variable of the model.
encoder_1718 = LabelEncoder().fit(data_1718_poi["FTR"])
data_1718_poi["FTR"] = encoder_1718.transform(data_1718_poi["FTR"])

# Hold out 20% of the 17/18 matches for validation (fixed seed).
train_1718, valid_1718, train_labels_1718, valid_labels_1718 = train_test_split(
    data_1718_poi.drop(columns="FTR"),
    data_1718_poi["FTR"],
    test_size=0.2,
    random_state=123,
)

# The formula interface needs the response in the same frame as the
# predictors, so the labels are attached to the training frame.
train_1718_all = train_1718
train_1718_all["FTR"] = train_labels_1718

# Right-hand side made of the engineered "so far" statistics.
engineered_terms = (
    "HomeAvgAllTimeSoFar + "
    "HomeHighAllTimeSoFar + HomeLowAllTimeSoFar + AwayAvgAllTimeSoFar + "
    "AwayHighAllTimeSoFar + AwayLowAllTimeSoFar + HomeTotalGoals + "
    "HomeTotalShots + HomeTotalAccuracy + AwayTotalGoals + AwayTotalShots + "
    "AwayTotalAccuracy"
)

# Fit three Poisson GLMs in turn (team names + engineered terms, engineered
# terms only, team names only) and score the rounded predictions.
scores = []
for formula in (
    "FTR ~ HomeTeam + AwayTeam + " + engineered_terms,
    "FTR ~ " + engineered_terms,
    "FTR ~ HomeTeam + AwayTeam",
):
    m = smf.glm(formula=formula,
                data=train_1718_all, family=sm.families.Poisson()).fit()
    pred = np.round(m.predict(valid_1718))
    scores.append(accuracy_score(valid_labels_1718, pred))
a1, a2, a3 = scores
a1, a2, a3
# Encode the full-time result (H, A or D) as integer labels so it can serve
# as the response variable of the model.
encoder_1819 = LabelEncoder().fit(data_1819_poi["FTR"])
data_1819_poi["FTR"] = encoder_1819.transform(data_1819_poi["FTR"])

# Hold out 20% of the 18/19 matches for validation (fixed seed).
train_1819, valid_1819, train_labels_1819, valid_labels_1819 = train_test_split(
    data_1819_poi.drop(columns="FTR"),
    data_1819_poi["FTR"],
    test_size=0.2,
    random_state=123,
)

# The formula interface needs the response in the same frame as the
# predictors, so the labels are attached to the training frame.
train_1819_all = train_1819
train_1819_all["FTR"] = train_labels_1819

# Right-hand side made of the engineered "so far" statistics.
engineered_terms = (
    "HomeAvgAllTimeSoFar + "
    "HomeHighAllTimeSoFar + HomeLowAllTimeSoFar + AwayAvgAllTimeSoFar + "
    "AwayHighAllTimeSoFar + AwayLowAllTimeSoFar + HomeTotalGoals + "
    "HomeTotalShots + HomeTotalAccuracy + AwayTotalGoals + AwayTotalShots + "
    "AwayTotalAccuracy"
)

# Fit three Poisson GLMs in turn (team names + engineered terms, engineered
# terms only, team names only) and score the rounded predictions.
scores = []
for formula in (
    "FTR ~ HomeTeam + AwayTeam + " + engineered_terms,
    "FTR ~ " + engineered_terms,
    "FTR ~ HomeTeam + AwayTeam",
):
    m = smf.glm(formula=formula,
                data=train_1819_all, family=sm.families.Poisson()).fit()
    pred = np.round(m.predict(valid_1819))
    scores.append(accuracy_score(valid_labels_1819, pred))
a1, a2, a3 = scores
a1, a2, a3
# Stack the five per-season Poisson data sets into one frame with a fresh
# 0..n-1 index.
all_data_poi = pd.concat(
    [data_1415_poi, data_1516_poi, data_1617_poi, data_1718_poi, data_1819_poi],
    axis=0,
    ignore_index=True,
)

# Encode the response for the combined frame.
encoder_all = LabelEncoder().fit(all_data_poi["FTR"])
all_data_poi["FTR"] = encoder_all.transform(all_data_poi["FTR"])

# Hold out 20% of all matches for validation (fixed seed).
train_all, valid_all, train_labels_all, valid_labels_all = train_test_split(
    all_data_poi.drop(columns="FTR"),
    all_data_poi["FTR"],
    test_size=0.2,
    random_state=123,
)

# The formula interface needs the response in the same frame as the
# predictors, so the labels are attached to the training frame.
train_all_allcols = train_all
train_all_allcols["FTR"] = train_labels_all

# Right-hand side made of the engineered "so far" statistics.
engineered_terms = (
    "HomeAvgAllTimeSoFar + "
    "HomeHighAllTimeSoFar + HomeLowAllTimeSoFar + AwayAvgAllTimeSoFar + "
    "AwayHighAllTimeSoFar + AwayLowAllTimeSoFar + HomeTotalGoals + "
    "HomeTotalShots + HomeTotalAccuracy + AwayTotalGoals + AwayTotalShots + "
    "AwayTotalAccuracy"
)

# Fit the three Poisson GLMs in turn (team names + engineered terms,
# engineered terms only, team names only). NaN predictions are replaced by
# label 1 before scoring.
fitted_models = []
scores = []
for formula in (
    "FTR ~ HomeTeam + AwayTeam + " + engineered_terms,
    "FTR ~ " + engineered_terms,
    "FTR ~ HomeTeam + AwayTeam",
):
    fitted = smf.glm(formula=formula,
                     data=train_all_allcols, family=sm.families.Poisson()).fit()
    pred = np.round(fitted.predict(valid_all))
    pred[np.isnan(pred)] = 1
    fitted_models.append(fitted)
    scores.append(accuracy_score(valid_labels_all, pred))
m1, m2, m3 = fitted_models
a1, a2, a3 = scores
a1, a2, a3
# Coefficient tables of the first two models.
m1.summary()
m2.summary()
def forward_selected(data, response):
    """Fit an OLS model whose predictors are picked by greedy forward selection.

    Starting with no predictors, each round fits one OLS model per remaining
    column of `data` (added to the predictors already chosen) and keeps the
    column with the highest adjusted R-squared, provided it beats the best
    score so far. Selection stops when no column is left or a round brings no
    improvement.

    Parameters
    ----------
    data : pandas.DataFrame
        Holds the response and all candidate predictor columns.
    response : str
        Name of the response column in `data`.

    Returns
    -------
    The fitted statsmodels OLS results for ``response ~ chosen predictors + 1``.
    """
    candidates = set(data.columns)
    candidates.remove(response)
    chosen = []
    best_so_far = 0.0
    round_best = 0.0

    # A round that fails to improve leaves round_best != best_so_far, which
    # ends the loop.
    while candidates and best_so_far == round_best:
        trials = sorted(
            (
                smf.ols(
                    "{} ~ {} + 1".format(response, ' + '.join(chosen + [name])),
                    data,
                ).fit().rsquared_adj,
                name,
            )
            for name in candidates
        )
        round_best, winner = trials[-1]
        if best_so_far < round_best:
            candidates.remove(winner)
            chosen.append(winner)
            best_so_far = round_best

    final_formula = "{} ~ {} + 1".format(response, ' + '.join(chosen))
    return smf.ols(final_formula, data).fit()
def getLastTeamStats(df):
    """Return the running statistics from each team's last listed game.

    Rows of ``df`` are scanned from the last row to the first. For every
    team, the first row encountered in which it appears, as either the home
    or the away side, supplies its statistics. Teams that only ever appear
    in ``AwayTeam`` are included as well.

    Parameters
    ----------
    df : DataFrame with columns ``HomeTeam``, ``AwayTeam``, ``H_W``,
        ``H_WR``, ``H_avg_diff``, ``A_W``, ``A_WR`` and ``A_avg_diff``.
        Presumably ordered chronologically (the last row is treated as the
        most recent game) - verify against the caller.

    Returns
    -------
    dict mapping team name -> ``{'W': ..., 'WR': ..., 'avg_diff': ...}``.
    """
    # (team column, W column, WR column, avg_diff column) for each side.
    sides = (
        ('HomeTeam', 'H_W', 'H_WR', 'H_avg_diff'),
        ('AwayTeam', 'A_W', 'A_WR', 'A_avg_diff'),
    )
    team_stats = {}
    # One pass from the last row backwards: the first sighting of a team is
    # its latest game, so later (older) rows must not overwrite it.
    for _, game in df.iloc[::-1].iterrows():
        for team_col, w_col, wr_col, diff_col in sides:
            team = game[team_col]
            if pd.isna(team) or team in team_stats:
                continue
            team_stats[team] = {
                'W': game[w_col],
                'WR': game[wr_col],
                'avg_diff': game[diff_col],
            }
    return team_stats
As explained in the data preparation section, predicting betting odds reasonably required engineering new variables to run models on. The validity of this reasoning is supported later by the steps explained in this section. We chose to use a regressor instead of a classifier because betting odds are continuous variables, which do not make sense to classify. We also chose not to use a percentile feature selector because we believe each of the six engineered variables represents its own crucial information.
The first model we created was a grid search with a pipeline that utilizes k-neighbors regression and 10-fold cross validation, with scaled training values. The variables it took into account were the number of goals scored by each team (FTHG, FTAG) as well as their ratios of goals to number of tries (HSGR, ASGR). The results were acceptable and not far off the models we choose to showcase below, but it was here we realized it didn't make sense to predict odds with game statistics.
Our second model was the same grid search and pipeline, but using all six, and only the six, newly engineered features (H_W, H_WR, H_avg_diff, A_W, A_WR, A_avg_diff). As expected, we got better results! There are two main takeaways from the visualizations of the results. The first is that this model does not seem to be overfitted, since we are predicting values for the 18-19 season, which is not included in the training data, and the model under-predicts outliers. The second is that our engineered features seem to be doing their work; we can see on the plots that the accuracy of the model increases with time, because the engineered features are running statistics that are updated after every game. Visually, this is represented by the straight line at the start of all 3 plots (predictions for odd_home, odd_draw, odd_away). The straight line represents all teams' first games played in the season, when the model makes no assumptions about team/game statistics. However, the model's predicted values almost perfectly align with the actual values towards the end of the plots, except for the obvious outliers.
The third model we created was a simple multivariate linear regression using forward selection on the six engineered features. This model was made to provide a further comparison for our second model, in case an approach without a grid search might perform just as well. The results were about 20% to 40% worse in terms of negative mean absolute error. Also, the forward selection picked pretty much all six variables! It eliminated just one of them for draw betting odds, but that could be an outlier outcome. All this validates our reasoning to use k-neighbors with cross-validation and to skip feature selection!
Our second model performed best, so we will use that to predict the betting odds for home, draw, and away in the future games of the 18-19 season.
# Shared pieces for the k-neighbors experiments: a min-max scaler, the
# regressor, the future-fixture list and the six engineered feature columns.
scaler = MinMaxScaler()
knn_reg = KNeighborsRegressor()
df_future_games = pd.read_csv('./data/prediction.csv')
columns_to_use = ['H_W', 'H_WR', 'H_avg_diff', 'A_W', 'A_WR', 'A_avg_diff']

# GRID SEARCH (K-NEIGHBORS REGRESSOR, SCALER)
pipe = make_pipeline(scaler, knn_reg)
param_grid = dict(
    kneighborsregressor__n_neighbors=range(1, 20),
    kneighborsregressor__weights=['uniform', 'distance'],
)
grid = GridSearchCV(pipe, param_grid, cv=10, scoring="neg_mean_absolute_error")

# For each odds column: tune on the past seasons, then predict and score on
# the 2018-2019 games and plot predictions against the actual odds.
for odd_type in ('odd_home', 'odd_draw', 'odd_away'):
    features_1819 = df_1819_feat_engr[columns_to_use]
    actual_1819 = df_1819_feat_engr[odd_type]

    grid.fit(df_past_seasons[columns_to_use], df_past_seasons[odd_type])
    predictions = grid.predict(features_1819)
    score = grid.score(features_1819, actual_1819)

    game_numbers = np.arange(len(predictions))
    plt.figure(figsize=(16, 4))
    plt.plot(game_numbers, predictions, alpha=0.8, label='predictions')
    plt.plot(game_numbers, actual_1819.values, alpha=0.8, label='actual')
    plt.title(
        'Predicted {} for 2018-2019 Season (so far)'.format(odd_type),
        fontsize=15,
    )
    plt.xlabel('Game of Season', fontsize=15)
    plt.ylabel(odd_type, fontsize=15)
    plt.legend(fontsize=15)
    plt.show()

    print(odd_type, ', neg MAE: ', score)
    # Parameter combination selected by the grid search.
    print(grid.cv_results_['params'][grid.best_index_])
# FORWARD SELECTION
# For each odds column: build a forward-selected linear model on the past
# seasons, then predict, score and plot it on the 2018-2019 games.
for odd_type in ('odd_home', 'odd_draw', 'odd_away'):
    # Feature columns plus the response column for this odds type.
    model_columns = np.append(columns_to_use, odd_type)
    lin_model = forward_selected(df_past_seasons[model_columns], odd_type)
    predictions = lin_model.predict(df_1819_feat_engr[model_columns])

    actual_odds = df_1819_feat_engr[odd_type].values
    # Negated so the number is comparable with the grid search's
    # "neg_mean_absolute_error" score.
    score = 0 - mean_absolute_error(actual_odds, predictions.values)

    game_numbers = np.arange(len(predictions))
    plt.figure(figsize=(16, 4))
    plt.plot(game_numbers, predictions, alpha=0.8, label='predictions')
    plt.plot(game_numbers, actual_odds, alpha=0.8, label='actual')
    plt.title(
        'Predicted {} for 2018-2019 Season (so far)'.format(odd_type),
        fontsize=15,
    )
    plt.xlabel('Game of Season', fontsize=15)
    plt.ylabel(odd_type, fontsize=15)
    plt.legend(fontsize=15)
    plt.show()

    print(odd_type, ', neg MAE: ', score)
    print(lin_model.params)
# Preparation for predicting future games
# Attach every team's latest running statistics to its upcoming fixtures.
team_stats = getLastTeamStats(df_1819_feat_engr)

home_teams = df_future_games['HomeTeam'].tolist()
away_teams = df_future_games['AwayTeam'].tolist()

H_W = [team_stats[team]['W'] for team in home_teams]
H_WR = [team_stats[team]['WR'] for team in home_teams]
H_avg_diff = [team_stats[team]['avg_diff'] for team in home_teams]
A_W = [team_stats[team]['W'] for team in away_teams]
A_WR = [team_stats[team]['WR'] for team in away_teams]
A_avg_diff = [team_stats[team]['avg_diff'] for team in away_teams]

for column_name, column_values in (
    ('H_W', H_W),
    ('H_WR', H_WR),
    ('H_avg_diff', H_avg_diff),
    ('A_W', A_W),
    ('A_WR', A_WR),
    ('A_avg_diff', A_avg_diff),
):
    df_future_games[column_name] = column_values


def _side_with_higher_wr(home_wr, away_wr):
    """Return 'H' or 'A' for the side with the larger win rate, else 'D'."""
    if home_wr > away_wr:
        return 'H'
    if home_wr < away_wr:
        return 'A'
    return 'D'


higher_wr = [
    _side_with_higher_wr(home_wr, away_wr)
    for home_wr, away_wr in zip(df_future_games['H_WR'], df_future_games['A_WR'])
]
df_future_games['higher_wr'] = higher_wr
# PREDICT ODDS OF FUTURE GAMES USING GRIDSEARCH MODEL
pipe = make_pipeline(scaler, knn_reg)
param_grid = dict(
    kneighborsregressor__n_neighbors=range(1, 20),
    kneighborsregressor__weights=['uniform', 'distance'],
)
grid = GridSearchCV(pipe, param_grid, cv=10, scoring="neg_mean_absolute_error")

# Refit the search once per odds column on the past seasons and store the
# predictions for the future fixtures as predicted_<odd_type>.
for odd_type in ('odd_home', 'odd_draw', 'odd_away'):
    grid.fit(df_past_seasons[columns_to_use], df_past_seasons[odd_type])
    predictions = grid.predict(df_future_games[columns_to_use])
    df_future_games['predicted_' + odd_type] = predictions


def _side_with_lower_odds(home_odd, away_odd):
    """Return 'H' or 'A' for the side with the smaller odds, else 'D'."""
    if home_odd > away_odd:
        return 'A'
    if home_odd < away_odd:
        return 'H'
    return 'D'


lower_odds = [
    _side_with_lower_odds(home_odd, away_odd)
    for home_odd, away_odd in zip(
        df_future_games['predicted_odd_home'],
        df_future_games['predicted_odd_away'],
    )
]
df_future_games['lower_odds'] = lower_odds
Below is a subset of the dataset representing future games as well as their 3 betting odds. Using our second model, we used the last-calculated statistics for each team to predict the outcomes below.
To draw further observations, we added higher_wr and lower_odds, which indicate which team currently has the higher win rate and which has the lower predicted betting odds. In the real world, the team with the higher win rate usually has the lower betting odds. This is intuitive, as bettors are much more likely to back the team that is more likely to win.
# Predicted odds of future games
# Show the fixture, both win rates, the three predicted odds and the two
# comparison flags for the first few future games.
summary_columns = [
    'HomeTeam', 'AwayTeam',
    'H_WR', 'A_WR',
    'predicted_odd_home', 'predicted_odd_draw', 'predicted_odd_away',
    'higher_wr', 'lower_odds',
]
df_future_games[summary_columns].head()
Finally, to end this section, we looked at how many predictions pick the same team for the higher win rate and the lower odds. We found this is true 81% of the time, meaning there are games where the team with the lower win rate is nevertheless predicted to have the lower betting odds!
# Share of future games where the higher-win-rate side is also the side
# with the lower predicted odds.
df_future_games['higher_wr'].eq(df_future_games['lower_odds']).mean()
This model also uses Poisson regression, but it replicates the model from David Sheehan's study. First, we reshaped the dataset so that it shows the number of goals each team scored when playing at home or away. We only chose the features that are available to us. In addition to the features that Sheehan used, we also added the betting odds, which for future games come from the predictions in the previous section.
# Take a home team and away team
# Reshape the 2018-2019 results to one row per (team, match): the goals the
# team scored, the odds seen from its side, and a Home flag (1 for the home
# side, 0 for the away side).
long_columns = ['Team', 'Opponent', 'Goals', 'odd_team', 'odd_draw', 'odd_opponent']

# .copy() so renaming and adding a column never touches a slice of data_1819.
team_opponent_data_home = data_1819[
    ['HomeTeam', 'AwayTeam', 'FTHG', 'odd_home', 'odd_draw', 'odd_away']
].copy()
team_opponent_data_home.columns = long_columns
team_opponent_data_home['Home'] = 1

team_opponent_data_away = data_1819[
    ['AwayTeam', 'HomeTeam', 'FTAG', 'odd_away', 'odd_draw', 'odd_home']
].copy()
team_opponent_data_away.columns = long_columns
team_opponent_data_away['Home'] = 0

# Each match must contribute exactly one home row and one away row; stacking
# the home frame twice would double-weight home observations in the fit.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
team_opponent_data = pd.concat([team_opponent_data_home, team_opponent_data_away])
team_opponent_data.head()

#Perform poisson model
poisson_model = smf.glm(
    formula="Goals ~ Home + Team + Opponent + odd_team + odd_draw + odd_opponent",
    data=team_opponent_data,
    family=sm.families.Poisson(),
).fit()
poisson_model.summary()
The GLM table shows that home status and betting odds are both associated with the number of goals each team scores. From the model, the 95% confidence interval for the Home coefficient is [0.160, 0.506]. Because a Poisson GLM uses a log link by default, this interval is on the log scale: a team playing at home is expected to score roughly $e^{0.160} \approx 1.17$ to $e^{0.506} \approx 1.66$ times as many goals as when it is not playing at home. Also, surprisingly, with p-values of less than 0.05, the betting odds are significantly associated with the number of goals a team scores in a game.
The table below shows the ranking of the teams through round 29. At this point, Man City was leading the EPL with 71 points. Using the Poisson regression we created, we will calculate the number of goals each team is expected to score in each remaining game and predict the result of each match.
# League table (read from CurrentRanking.csv) and the remaining fixtures
# (read from betting_odds_prediction.csv).
ranking = pd.read_csv('./data/CurrentRanking.csv')
ranking
future_matches = pd.read_csv('./data/betting_odds_prediction.csv')
This is our input dataset. We listed all the home teams and their opponents (away teams) for all remaining games. For each game, we predicted the number of goals that the home team and the away team might score and gave 3 points to the team that is expected to score more. The limitation of our model is that it is nearly impossible for the two predicted goal counts to be exactly equal, so there were no draws in our results. After predicting the result of each match, we predicted that Manchester City will win the season with 98 points.
future_matches.head()


def _expected_goals(team, opponent, is_home, odd_team, odd_draw, odd_opponent):
    """Return poisson_model's predicted goal count for `team` in one match."""
    match = pd.DataFrame(
        data={
            'Team': team,
            'Opponent': opponent,
            'Home': is_home,
            'odd_team': odd_team,
            'odd_draw': odd_draw,
            'odd_opponent': odd_opponent,
        },
        index=[1],
    )
    # Single-row input, so take the only prediction by position.
    return poisson_model.predict(match).iloc[0]


def _add_points(team, points):
    """Add `points` to the 'Point' value of `team`'s first row in `ranking`.

    Raises IndexError when `team` is not present in ranking['Team'].
    """
    # Look the row up by its index label and update with .loc. Feeding the
    # label to a positional indexer is only right for a default RangeIndex.
    row_label = ranking.index[ranking['Team'] == team][0]
    ranking.loc[row_label, 'Point'] += points


## iterate the future matches and calculate the point
# 3 points to the side with the larger predicted goal count, 1 point each
# when the two predictions are equal.
result = []
for _, row in future_matches.iterrows():
    home_score = _expected_goals(
        row['HomeTeam'], row['AwayTeam'], 1,
        row['predicted_odd_home'], row['predicted_odd_draw'], row['predicted_odd_away'],
    )
    away_score = _expected_goals(
        row['AwayTeam'], row['HomeTeam'], 0,
        row['predicted_odd_away'], row['predicted_odd_draw'], row['predicted_odd_home'],
    )
    if home_score > away_score:
        _add_points(row['HomeTeam'], 3)
        result.append('H')
    elif home_score < away_score:
        _add_points(row['AwayTeam'], 3)
        result.append('A')
    else:
        _add_points(row['HomeTeam'], 1)
        _add_points(row['AwayTeam'], 1)
        result.append('D')
future_matches['predicted_result'] = result

# Prediction
ranking.sort_values('Point', ascending=False)