Skip to content

Commit

Permalink
[ClubElo, FotMob, SoFIFA, FBREF] Fixing and unifying TEAMNAME_REPLACE…
Browse files Browse the repository at this point in the history
…MENTS loading (#755)

This commit fixes how alternative team names are handled in methods that take a `team` parameter. These methods should accept either the team name used by the data source or the standardized team name defined in the teamname_replacements.json file.

Example: 
The teamname_replacements.json file sets "Manchester City" as the default name. However, ClubElo uses "Man City". Hence, the `ClubElo.read_team_history` function should work with both `team="Man City"` and `team="Manchester City"`.

---------

Co-authored-by: root <[email protected]>
Co-authored-by: Pieter Robberechts <[email protected]>
  • Loading branch information
3 people authored Nov 25, 2024
1 parent 6d1044a commit 34c57d7
Show file tree
Hide file tree
Showing 11 changed files with 133 additions and 36 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ Install the package with development requirements:
.. code:: console
$ poetry install
$ poetry self add poetry-plugin-export
You can now run an interactive Python session.

Expand Down
57 changes: 56 additions & 1 deletion soccerdata/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from packaging import version
from selenium.common.exceptions import JavascriptException, WebDriverException

from ._config import DATA_DIR, LEAGUE_DICT, MAXAGE, logger
from ._config import DATA_DIR, LEAGUE_DICT, MAXAGE, TEAMNAME_REPLACEMENTS, logger


class SeasonCode(Enum):
Expand Down Expand Up @@ -684,6 +684,61 @@ def make_game_id(row: pd.Series) -> str:
return game_id


def add_alt_team_names(team: Union[str, list[str]]) -> set[str]:
    """Expand standardized team name(s) with their alternative names.

    Every alternative name that ``TEAMNAME_REPLACEMENTS`` maps onto one of
    the given names is added to the result. A name that no alternative name
    maps onto is returned on its own.

    Parameters
    ----------
    team : str or list of str
        The team name(s) to consider.

    Returns
    -------
    set of str
        A set containing the given team name(s) and their alternative names.
    """
    requested = [team] if isinstance(team, str) else list(team)
    names = set(requested)
    # TEAMNAME_REPLACEMENTS maps alternative name -> standardized name, so
    # collect every key whose value is one of the requested names.
    names.update(
        alt_name
        for alt_name, norm_name in TEAMNAME_REPLACEMENTS.items()
        if norm_name in requested
    )
    return names


def add_standardized_team_name(team: Union[str, list[str]]) -> set[str]:
    """Expand non-standardized team name(s) with their standardized name.

    For every given name that is a key of ``TEAMNAME_REPLACEMENTS``, the
    standardized name it maps to is added to the result. A name that is not
    a key (for example, one that is already standardized) is returned on
    its own.

    Parameters
    ----------
    team : str or list of str
        The team name(s) to consider.

    Returns
    -------
    set of str
        A set containing the given team name(s) and their standardized names.
    """
    requested = [team] if isinstance(team, str) else team
    names: set[str] = set()
    for name in requested:
        names.add(name)
        # The mapping is keyed by alternative name, so a direct lookup
        # replaces a scan over every entry of the mapping.
        if name in TEAMNAME_REPLACEMENTS:
            names.add(TEAMNAME_REPLACEMENTS[name])
    return names


def standardize_colnames(df: pd.DataFrame, cols: Optional[list[str]] = None) -> pd.DataFrame:
"""Convert DataFrame column names to snake case."""

Expand Down
10 changes: 3 additions & 7 deletions soccerdata/clubelo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
from unidecode import unidecode

from ._common import BaseRequestsReader, standardize_colnames
from ._common import BaseRequestsReader, add_alt_team_names, standardize_colnames
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS

CLUB_ELO_DATADIR = DATA_DIR / "ClubElo"
Expand Down Expand Up @@ -148,12 +148,8 @@ def read_team_history(
-------
pd.DataFrame
"""
teams_to_check = [k for k, v in TEAMNAME_REPLACEMENTS.items() if v == team]
teams_to_check.append(team)

for i, _ in enumerate(teams_to_check):
teams_to_check[i] = unidecode(teams_to_check[i])
teams_to_check[i] = re.sub(r"[\s']", "", teams_to_check[i])
teams_to_check = add_alt_team_names(team)
teams_to_check = {re.sub(r"[\s']", "", unidecode(team)) for team in teams_to_check}

for _team in teams_to_check:
filepath = self.data_dir / f"{_team}.csv"
Expand Down
17 changes: 8 additions & 9 deletions soccerdata/fbref.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,13 @@
import pandas as pd
from lxml import etree, html

from ._common import BaseRequestsReader, SeasonCode, make_game_id, standardize_colnames
from ._common import (
BaseRequestsReader,
SeasonCode,
add_alt_team_names,
make_game_id,
standardize_colnames,
)
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger

FBREF_DATADIR = DATA_DIR / "FBref"
Expand Down Expand Up @@ -411,14 +417,7 @@ def read_team_match_stats( # noqa: C901
df_teams = self.read_team_season_stats()

if team is not None:
# get alternative names of the specified team(s)
teams = [team] if isinstance(team, str) else team
teams_to_check = []
for team in teams:
for alt_name, norm_name in TEAMNAME_REPLACEMENTS.items():
if norm_name == team:
teams_to_check.append(alt_name)
teams_to_check.append(team)
teams_to_check = add_alt_team_names(team)

# select requested teams
iterator = df_teams.loc[df_teams.index.isin(teams_to_check, level=2), :]
Expand Down
13 changes: 4 additions & 9 deletions soccerdata/fotmob.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd
import requests

from ._common import BaseRequestsReader, make_game_id
from ._common import BaseRequestsReader, add_standardized_team_name, make_game_id
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger

FOTMOB_DATADIR = DATA_DIR / "FotMob"
Expand Down Expand Up @@ -327,7 +327,7 @@ def read_schedule(self, force_cache: bool = False) -> pd.DataFrame:
df[["home_score", "away_score"]] = df["status.scoreStr"].str.split("-", expand=True)
return df.set_index(["league", "season", "game"]).sort_index()[cols]

def read_team_match_stats( # noqa: C901
def read_team_match_stats(
self,
stat_type: str = "Top stats",
opponent_stats: bool = True,
Expand Down Expand Up @@ -378,13 +378,8 @@ def read_team_match_stats( # noqa: C901

if team is not None:
# get alternative names of the specified team(s)
teams = [team] if isinstance(team, str) else team
teams_to_check = []
for team in teams:
for alt_name, norm_name in TEAMNAME_REPLACEMENTS.items():
if norm_name == team:
teams_to_check.append(alt_name)
teams_to_check.append(team)
teams_to_check = add_standardized_team_name(team)

# select requested teams
iterator = df_complete.loc[
(
Expand Down
11 changes: 2 additions & 9 deletions soccerdata/sofifa.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pandas as pd
from lxml import html

from ._common import BaseRequestsReader, standardize_colnames
from ._common import BaseRequestsReader, add_standardized_team_name, standardize_colnames
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger

SO_FIFA_DATADIR = DATA_DIR / "SoFIFA"
Expand Down Expand Up @@ -240,14 +240,7 @@ def read_players(self, team: Optional[Union[str, list[str]]] = None) -> pd.DataF
df_teams = self.read_teams()

if team is not None:
# get alternative names of the specified team(s)
teams = [team] if isinstance(team, str) else team
teams_to_check = []
for team in teams:
for alt_name, norm_name in TEAMNAME_REPLACEMENTS.items():
if norm_name == team:
teams_to_check.append(alt_name)
teams_to_check.append(team)
teams_to_check = add_standardized_team_name(team)

# select requested teams
iterator = df_teams.loc[df_teams.team.isin(teams_to_check), :]
Expand Down
5 changes: 4 additions & 1 deletion tests/appdata/config/teamname_replacements.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
{
"Manchester City": ["Man City"]
"Manchester City": ["Man City"],
"Olympique Marseille": ["Marseille"],
"Valencia CF": ["Valencia"],
"FC Bayern Munich": ["FC Bayern München"]
}
13 changes: 13 additions & 0 deletions tests/test_FBref.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,19 @@ def test_read_team_match_stats(fbref_ligue1: FBref, stat_type: str) -> None:
assert isinstance(fbref_ligue1.read_team_match_stats(stat_type), pd.DataFrame)


def test_read_team_match_stats_alt_names(fbref_ligue1: FBref) -> None:
    """It should accept both the standardized and the alternative team name."""
    # "Olympique Marseille" is the standardized name in
    # teamname_replacements.json; "Marseille" is its alternative name.
    for team_name in ("Olympique Marseille", "Marseille"):
        stats = fbref_ligue1.read_team_match_stats(stat_type="schedule", team=team_name)
        assert isinstance(stats, pd.DataFrame)


@pytest.mark.parametrize(
"stat_type",
[
Expand Down
13 changes: 13 additions & 0 deletions tests/test_FotMob.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,16 @@ def test_read_team_match_stats(fotmob_laliga: FotMob, stat_type: str) -> None:
assert isinstance(
fotmob_laliga.read_team_match_stats(stat_type, team="Valencia"), pd.DataFrame
)


@pytest.mark.fails_gha()
def test_read_team_match_stats_alt_names(fotmob_laliga: FotMob) -> None:
    """It should accept both the FotMob and the standardized team name."""
    # "Valencia" is the FotMob name; "Valencia CF" is the standardized name.
    for team_name in ("Valencia", "Valencia CF"):
        stats = fotmob_laliga.read_team_match_stats(stat_type="Top stats", team=team_name)
        assert isinstance(stats, pd.DataFrame)
10 changes: 10 additions & 0 deletions tests/test_SoFIFA.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@
from soccerdata.sofifa import SoFIFA


def test_read_players(sofifa_bundesliga: SoFIFA) -> None:
    """It should accept the alternative team name from teamname_replacements.json."""
    players = sofifa_bundesliga.read_players(team="FC Bayern München")
    assert isinstance(players, pd.DataFrame)


def test_read_players_replacement(sofifa_bundesliga: SoFIFA) -> None:
    """It should accept the standardized team name from teamname_replacements.json."""
    players = sofifa_bundesliga.read_players(team="FC Bayern Munich")
    assert isinstance(players, pd.DataFrame)


def test_read_team_ratings(sofifa_bundesliga: SoFIFA) -> None:
"""It should return a dataframe with the team ratings."""
assert isinstance(sofifa_bundesliga.read_team_ratings(), pd.DataFrame)
Expand Down
19 changes: 19 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from soccerdata._common import (
BaseRequestsReader,
SeasonCode,
add_alt_team_names,
add_standardized_team_name,
make_game_id,
standardize_colnames,
)
Expand Down Expand Up @@ -108,6 +110,23 @@ def test_make_game_id():
assert game_id == "1993-07-30 Barcelona-Real Madrid"


# add_alt_team_names


def test_add_alt_team_names():
    """It should add the alternative names of a standardized team name."""
    expected_by_team = {
        # "Valencia" is replaced by "Valencia CF"
        "Valencia CF": {"Valencia", "Valencia CF"},
        # "Real Madrid" is not replaced
        "Real Madrid": {"Real Madrid"},
    }
    for team_name, expected in expected_by_team.items():
        assert add_alt_team_names(team_name) == expected


def test_add_standardize_team_name():
    """It should add the standardized name of a non-standardized team name."""
    expected_by_team = {
        # "Valencia" is replaced by "Valencia CF"
        "Valencia": {"Valencia", "Valencia CF"},
        # "Real Madrid" is not replaced
        "Real Madrid": {"Real Madrid"},
    }
    for team_name, expected in expected_by_team.items():
        assert add_standardized_team_name(team_name) == expected


# standardize_colnames


Expand Down

0 comments on commit 34c57d7

Please sign in to comment.