Skip to content

Commit

Permalink
Merge pull request #12 from roclark/fix-game-info-parsing
Browse files Browse the repository at this point in the history
Fix game info parsing
  • Loading branch information
roclark authored Nov 13, 2018
2 parents 2ff2725 + 3382625 commit f5ee778
Show file tree
Hide file tree
Showing 13 changed files with 425 additions and 197 deletions.
89 changes: 40 additions & 49 deletions sportsreference/mlb/boxscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,56 +144,50 @@ def _retrieve_html_page(self, uri):
return None
return pq(utils._remove_html_comment_tags(url_data))

def _parse_game_date_and_location(self, field, boxscore):
def _parse_game_date_and_location(self, boxscore):
"""
Retrieve the game's date and location.
The date and location of the game follow a more complicated parsing
scheme and should be handled differently from other tags. Both fields
are separated by a newline character ('\n') with the first line being
the date and the second being the location.
The game's meta information, such as date, location, attendance, and
duration, follows a complex parsing scheme that changes based on the
layout of the page. The information should be able to be parsed and set
regardless of the order and how much information is included. To do
this, the meta information should be iterated through line-by-line and
fields should be determined by the values that are found in each line.
Parameters
----------
field : string
The name of the attribute to parse
boxscore : PyQuery object
A PyQuery object containing all of the HTML data from the boxscore.
Returns
-------
string
Depending on the requested field, returns a text representation of
either the date or location of the game.
"""
scheme = BOXSCORE_SCHEME[field]
scheme = BOXSCORE_SCHEME["game_info"]
items = [i.text() for i in boxscore(scheme).items()]
index = BOXSCORE_ELEMENT_INDEX[field]
game_info = items[0].split('\n')
double_header = False
for item in items:
if 'first game of doubleheader' in item.lower() or \
'second game of doubleheader' in item.lower():
double_header = True
if field == 'date':
return game_info[0]
for element in game_info:
if field == 'time_of_day':
if 'night game' in element.lower() or \
'day game' in element.lower():
return element
continue # pragma: no cover
matcher = DOUBLE_HEADER_INDICES[field]
if matcher in element.lower():
return element
# Triggered for double headers when a specific field is not included
# in the game information summary. For double headers, random fields
# are omitted for no apparent reason and should be parsed differently.
# If the field can't be found, it should return a default value of an
# empty string.
if double_header:
return ''
return game_info[index]
attendance = None
date = None
duration = None
time = None
time_of_day = None
venue = None
if len(game_info) > 0:
date = game_info[0]
for line in game_info:
if 'Start Time: ' in line:
time = line.replace('Start Time: ', '')
if 'Attendance: ' in line:
attendance = line.replace('Attendance: ', '').replace(',', '')
if 'Venue: ' in line:
venue = line.replace('Venue: ', '')
if 'Game Duration: ' in line:
duration = line.replace('Game Duration: ', '')
if 'Night Game' in line or 'Day Game' in line:
time_of_day = line
setattr(self, '_attendance', attendance)
setattr(self, '_date', date)
setattr(self, '_duration', duration)
setattr(self, '_time', time)
setattr(self, '_time_of_day', time_of_day)
setattr(self, '_venue', venue)

def _parse_name(self, field, boxscore):
"""
Expand Down Expand Up @@ -251,17 +245,13 @@ def _parse_game_data(self, uri):
short_field == 'winning_abbr' or \
short_field == 'losing_name' or \
short_field == 'losing_abbr' or \
short_field == 'uri':
continue
if short_field == 'date' or \
short_field == 'uri' or \
short_field == 'date' or \
short_field == 'time' or \
short_field == 'venue' or \
short_field == 'attendance' or \
short_field == 'time_of_day' or \
short_field == 'duration':
value = self._parse_game_date_and_location(short_field,
boxscore)
setattr(self, field, value)
continue
if short_field == 'away_name' or \
short_field == 'home_name':
Expand All @@ -276,6 +266,7 @@ def _parse_game_data(self, uri):
short_field,
index)
setattr(self, field, value)
self._parse_game_date_and_location(boxscore)

@property
def dataframe(self):
Expand Down Expand Up @@ -389,29 +380,29 @@ def time(self):
"""
Returns a ``string`` of the time the game started.
"""
return self._time.replace('Start Time: ', '')
return self._time

@property
def venue(self):
"""
Returns a ``string`` of the name of the ballpark where the game was
played.
"""
return self._venue.replace('Venue: ', '')
return self._venue

@int_property_decorator
def attendance(self):
"""
Returns an ``int`` of the game's listed attendance.
"""
return self._attendance.replace('Attendance: ', '').replace(',', '')
return self._attendance

@property
def duration(self):
"""
Returns a ``string`` of the game's duration in the format 'H:MM'.
"""
return self._duration.replace('Game Duration: ', '')
return self._duration

@property
def time_of_day(self):
Expand Down
7 changes: 1 addition & 6 deletions sportsreference/mlb/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,7 @@
}

BOXSCORE_SCHEME = {
'date': 'div[class="scorebox_meta"]',
'time': 'div[class="scorebox_meta"]',
'attendance': 'div[class="scorebox_meta"]',
'venue': 'div[class="scorebox_meta"]',
'time_of_day': 'div[class="scorebox_meta"]',
'duration': 'div[class="scorebox_meta"]',
'game_info': 'div[class="scorebox_meta"]',
'away_name': 'a[itemprop="name"]:first',
'home_name': 'a[itemprop="name"]:last',
'winner': 'td[data-stat=""]',
Expand Down
2 changes: 2 additions & 0 deletions sportsreference/nba/boxscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ def _parse_game_date_and_location(self, field, boxscore):
scheme = BOXSCORE_SCHEME[field]
items = [i.text() for i in boxscore(scheme).items()]
game_info = items[0].split('\n')
if len(game_info) < 3 and field == 'location':
return None
return game_info[BOXSCORE_ELEMENT_INDEX[field]]

def _parse_name(self, field, boxscore):
Expand Down
2 changes: 2 additions & 0 deletions sportsreference/ncaab/boxscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ def _parse_game_date_and_location(self, field, boxscore):
scheme = BOXSCORE_SCHEME[field]
items = [i.text() for i in boxscore(scheme).items()]
game_info = items[0].split('\n')
if len(game_info) < 3 and field == 'location':
return None
return game_info[BOXSCORE_ELEMENT_INDEX[field]]

def _parse_name(self, field, boxscore):
Expand Down
60 changes: 35 additions & 25 deletions sportsreference/nfl/boxscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,32 +145,45 @@ def _retrieve_html_page(self, uri):
return None
return pq(utils._remove_html_comment_tags(url_data))

def _parse_game_date_and_location(self, field, boxscore):
def _parse_game_date_and_location(self, boxscore):
"""
Retrieve the game's date and location.
The date and location of the game follow a more complicated parsing
scheme and should be handled differently from other tags. Both fields
are separated by a newline character ('\n') with the first line being
the date and the second being the location.
The game's meta information, such as date, location, attendance, and
duration, follows a complex parsing scheme that changes based on the
layout of the page. The information should be able to be parsed and set
regardless of the order and how much information is included. To do
this, the meta information should be iterated through line-by-line and
fields should be determined by the values that are found in each line.
Parameters
----------
field : string
The name of the attribute to parse
boxscore : PyQuery object
A PyQuery object containing all of the HTML data from the boxscore.
Returns
-------
string
Depending on the requested field, returns a text representation of
either the date or location of the game.
"""
scheme = BOXSCORE_SCHEME[field]
scheme = BOXSCORE_SCHEME["game_info"]
items = [i.text() for i in boxscore(scheme).items()]
game_info = items[0].split('\n')
return game_info[BOXSCORE_ELEMENT_INDEX[field]]
attendance = None
date = None
duration = None
stadium = None
time = None
date = game_info[0]
for line in game_info:
if 'Attendance' in line:
attendance = line.replace('Attendance: ', '').replace(',', '')
if 'Time of Game' in line:
duration = line.replace('Time of Game: ', '')
if 'Stadium' in line:
stadium = line.replace('Stadium: ', '')
if 'Start Time' in line:
time = line.replace('Start Time: ', '')
setattr(self, '_attendance', attendance)
setattr(self, '_date', date)
setattr(self, '_duration', duration)
setattr(self, '_stadium', stadium)
setattr(self, '_time', time)

def _parse_name(self, field, boxscore):
"""
Expand Down Expand Up @@ -228,16 +241,12 @@ def _parse_game_data(self, uri):
short_field == 'winning_abbr' or \
short_field == 'losing_name' or \
short_field == 'losing_abbr' or \
short_field == 'uri':
continue
if short_field == 'date' or \
short_field == 'uri' or \
short_field == 'date' or \
short_field == 'time' or \
short_field == 'stadium' or \
short_field == 'attendance' or \
short_field == 'duration':
value = self._parse_game_date_and_location(short_field,
boxscore)
setattr(self, field, value)
continue
if short_field == 'away_name' or \
short_field == 'home_name':
Expand All @@ -252,6 +261,7 @@ def _parse_game_data(self, uri):
short_field,
index)
setattr(self, field, value)
self._parse_game_date_and_location(boxscore)

@property
def dataframe(self):
Expand Down Expand Up @@ -336,29 +346,29 @@ def time(self):
"""
Returns a ``string`` of the time the game started.
"""
return self._time.replace('Start Time: ', '')
return self._time

@property
def stadium(self):
"""
Returns a ``string`` of the name of the stadium where the game was
played.
"""
return self._stadium.replace('Stadium: ', '')
return self._stadium

@int_property_decorator
def attendance(self):
"""
Returns an ``int`` of the game's listed attendance.
"""
return self._attendance.replace('Attendance: ', '').replace(',', '')
return self._attendance

@property
def duration(self):
"""
Returns a ``string`` of the game's duration in the format 'H:MM'.
"""
return self._duration.replace('Time of Game: ', '')
return self._duration

@property
def winner(self):
Expand Down
6 changes: 1 addition & 5 deletions sportsreference/nfl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,7 @@
}

BOXSCORE_SCHEME = {
'date': 'div[class="scorebox_meta"]:first',
'time': 'div[class="scorebox_meta"]:first',
'stadium': 'div[class="scorebox_meta"]:first',
'attendance': 'div[class="scorebox_meta"]:first',
'duration': 'div[class="scorebox_meta"]:first',
'game_info': 'div[class="scorebox_meta"]:first',
'home_name': 'a[itemprop="name"]:first',
'away_name': 'a[itemprop="name"]:last',
'away_points': 'div[class="scorebox"] div[class="score"]',
Expand Down
Loading

0 comments on commit f5ee778

Please sign in to comment.