-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfootball_elo_scrape.py
253 lines (169 loc) · 6.93 KB
/
football_elo_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# -*- coding: utf-8 -*-
"""
Created on 10/2/2017
@author: Jesse Cambon
"""
### The goal of this program is to calculate Elo ratings for an NFL season
# by scraping game results from a website.
# There is a result-level dataframe and a team-level dataframe.
# Elo ratings are adjusted after each result and then stored in the team-level dataframe.
# 10/2/2017: Currently this just calculates Elo ratings for historical results
from bs4 import BeautifulSoup
#import urllib.request
import requests
import pandas as pd
# Elo ratings for NFL teams are computed from game results scraped from the web.
# The first step is simply tallying each team's wins.
# Page that is scraped for the game results:
data_url = "https://www.pro-football-reference.com/years/2017/games.htm#games::none"
def scrape(selection, parent_object, element_type):
    """Collect cell contents from the parsed page.

    Searches the module-level ``soup`` (which must be assigned before this
    function is called) for every ``parent_object`` tag whose ``data-stat``
    attribute equals ``selection``.

    Args:
        selection: Value of the ``data-stat`` attribute to match.
        parent_object: Tag name of the cells to search (e.g. ``'td'``).
        element_type: Tag name of the children to extract from each matching
            cell, or ``'na'`` to take the cell's own contents instead.

    Returns:
        A flat list of the extracted values in document order. With ``'na'``
        the entries are strings and empty cells are skipped; otherwise each
        entry is the first child node of a matching ``element_type`` tag.

    Raises:
        IndexError: If a matching ``element_type`` tag has no children.
    """
    values = []
    for cell in soup.find_all(parent_object, {"data-stat": selection}):
        if element_type != 'na':
            values.extend(
                child.contents[0] for child in cell.find_all(element_type)
            )
            continue
        # Decode the cell's inner markup to text directly. Slicing the repr()
        # of the rendered bytes on apostrophes broke for contents containing
        # an apostrophe and turned non-ASCII text into escape sequences.
        text = cell.decode_contents()
        if text:
            values.append(text)
    return values
# numpy is used from here on for column stacking and the Elo math.
import numpy as np

# Download the page we scrape and parse it; scrape() reads `soup`.
# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
page = requests.get(data_url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Game-level dataframe.
# Every column is cut to the length of the pts_win list, which limits the
# frame to games that have already been played.
winning_points = scrape("pts_win", 'td', 'strong')
length = len(winning_points)

# Drop the 'Week' text entries from the week column.
week = [label for label in scrape("week_num", 'th', 'na') if label != 'Week']

# column_stack turns the scraped values into one 2-D string array, so every
# column starts out as text; the scores are converted to int just below.
season = pd.DataFrame(
    np.column_stack([
        week[:length],
        scrape("winner", 'td', 'a')[:length],
        scrape("loser", 'td', 'a')[:length],
        winning_points[:length],
        scrape("pts_lose", 'td', 'na')[:length],
    ]),
    columns=['week', 'winner', 'loser', "pts_win", 'pts_lose'],
)
season['pts_diff'] = season['pts_win'].astype(int) - season['pts_lose'].astype(int)
# Team-level dataframe: one row per team, indexed by team name.
# Winners and losers are combined to get every team that has played.
# pd.concat() is used because Series.append() was removed in pandas 2.0.
teams_seen = pd.concat([season['winner'], season['loser']], ignore_index=True)
team_ref = (
    pd.DataFrame({'team': teams_seen})
    .drop_duplicates()
    .set_index('team')
    .sort_index()
)

# Starting ratings, typed in from 538.com, with teams in alphabetical order
# so that they line up with the sorted index of team_ref.
starting_elos = [
    1537, 1617, 1491, 1484, 1527, 1384, 1516, 1336,
    1569, 1556, 1501, 1587, 1502, 1514, 1382, 1613,
    1437, 1399, 1509, 1498, 1687, 1498, 1530, 1452,
    1530, 1511, 1599, 1353, 1571, 1506, 1460, 1504,
]
# Each team gets its own one-element list; it serves as that team's rating
# history, with the latest rating always at the end.
elo_list = [[rating] for rating in starting_elos]

# The seed ratings are positional, so a team count that differs from the
# list length would silently pair ratings with the wrong teams.
if len(elo_list) != len(team_ref):
    raise ValueError(
        'Expected {} teams to match the starting Elo list, found {}'.format(
            len(elo_list), len(team_ref)
        )
    )
team_ref['elo'] = elo_list

# Alternative: start every team at 1500
# team_ref['elo'] = [[1500] for _ in range(len(team_ref))]

# Win/loss counters
team_ref['wins'] = 0
team_ref['losses'] = 0

# Per-game rating columns: the ratings going into the match, then the ratings
# adjusted for the match result. Adjusted ratings are fractional, so the
# columns start as floats rather than integer zeros.
for column in ('winner_elo', 'loser_elo', 'elo_diff',
               'winner_adj_elo', 'loser_adj_elo', 'elo_adj_diff'):
    season[column] = 0.0
# Update each team's Elo game by game; team_ref is indexed by team name.
K = 20  # Elo adjustment constant: scales how far one result moves a rating

# The per-game rating columns hold fractional values, so make sure they are
# floats even if they were created with integer zeros.
elo_columns = ['winner_elo', 'loser_elo', 'elo_diff',
               'winner_adj_elo', 'loser_adj_elo', 'elo_adj_diff']
for column in elo_columns:
    season[column] = season[column].astype(float)

# Walk through the season's results in order.
for i in season.index:
    winner = season.at[i, 'winner']
    loser = season.at[i, 'loser']
    pts_diff = season.at[i, 'pts_diff']

    # Update the counters on the team sheet.
    # NOTE(review): a game with a zero point margin would still be tallied as
    # a win and a loss here - confirm how ties appear in the scraped data.
    team_ref.at[winner, 'wins'] += 1
    team_ref.at[loser, 'losses'] += 1

    # Ratings going into the game are the latest entries in each team's history.
    winner_elo = team_ref.at[winner, 'elo'][-1]
    loser_elo = team_ref.at[loser, 'elo'][-1]
    season.at[i, 'winner_elo'] = winner_elo
    season.at[i, 'loser_elo'] = loser_elo
    season.at[i, 'elo_diff'] = winner_elo - loser_elo

    # Expected score of the winner, see
    # https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/
    trans_winner_rating = 10 ** (winner_elo / 400)
    trans_loser_rating = 10 ** (loser_elo / 400)
    expected_winner_score = trans_winner_rating / (trans_winner_rating + trans_loser_rating)

    # Scale the adjustment by the log of the margin. The +1 keeps the factor
    # finite for a zero margin (log(0) is -inf, which would corrupt both
    # teams' ratings) and non-zero for a one-point win.
    elo_adj = np.log(pts_diff + 1) * K * (1 - expected_winner_score)

    winner_adj_elo = winner_elo + elo_adj
    loser_adj_elo = loser_elo - elo_adj
    season.at[i, 'winner_adj_elo'] = winner_adj_elo
    season.at[i, 'loser_adj_elo'] = loser_adj_elo
    season.at[i, 'elo_adj_diff'] = winner_adj_elo - loser_adj_elo

    # Record the new ratings in each team's history on the team sheet.
    team_ref.at[winner, 'elo'].append(winner_adj_elo)
    team_ref.at[loser, 'elo'].append(loser_adj_elo)
# A team's current rating is the final entry of its rating history list.
latest_ratings = []
for history in team_ref['elo']:
    latest_ratings.append(history[-1])
team_ref['Current ELO'] = latest_ratings

# Print the standings, highest current rating first.
standings = team_ref[['wins', 'losses', 'Current ELO']]
print(standings.sort_values(by='Current ELO', ascending=False))
# Predict next week's games.
# Week labels are stored as text, so they are compared numerically: as
# strings '9' sorts above '10' and the latest week would be misidentified.
# NOTE(review): assumes every week label in `season` is numeric - confirm
# for non-numeric labels.
current_week = str(season['week'].astype(int).max())
upcoming_week = str(int(current_week) + 1)

# All listed games (played or not), filtered down to the upcoming week.
next_week = pd.DataFrame(
    np.column_stack([week, scrape("winner", 'td', 'a'), scrape("loser", 'td', 'a')]),
    columns=['week', 'team1', 'team2'],
)
next_week = next_week[next_week['week'] == upcoming_week].reset_index(drop=True)

# Initialize the prediction columns
next_week['team1_win_prob'] = 0.00
next_week['predicted_winner'] = ''

for i in next_week.index:
    team1 = next_week.at[i, 'team1']
    team2 = next_week.at[i, 'team2']

    # Same expected-score formula as the rating update, using each team's
    # latest rating.
    trans_team1_rating = 10 ** (team_ref.at[team1, 'elo'][-1] / 400)
    trans_team2_rating = 10 ** (team_ref.at[team2, 'elo'][-1] / 400)
    team1_win_prob = trans_team1_rating / (trans_team1_rating + trans_team2_rating)

    next_week.at[i, 'team1_win_prob'] = team1_win_prob
    # An exact 50/50 split is awarded to team2.
    next_week.at[i, 'predicted_winner'] = team1 if team1_win_prob > 0.5 else team2
# Write each dataframe out as a JSON file.
for frame, filename in (
    (team_ref, 'team_ref.json'),
    (season, 'season.json'),
    (next_week, 'predictions.json'),
):
    frame.to_json(filename)