Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MatchHistory] Different encoding since 2024/25 #788

Merged
merged 2 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions soccerdata/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
},
}
logging.config.dictConfig(logging_config)
logging.captureWarnings(True)
logger = logging.getLogger("root")
logger.handlers[0] = RichHandler(markup=True)

Expand Down
28 changes: 22 additions & 6 deletions soccerdata/match_history.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,35 @@

import itertools
from pathlib import Path
from typing import Callable, Optional, Union
from typing import IO, Callable, Optional, Union

import pandas as pd

from ._common import BaseRequestsReader, make_game_id
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger

MATCH_HISTORY_DATA_DIR = DATA_DIR / "MatchHistory"
MATCH_HISTORY_API = "https://www.football-data.co.uk"


def _parse_csv(raw_data: IO[bytes], lkey: str, skey: str) -> pd.DataFrame:
    """Parse one raw season CSV into a DataFrame using the season's encoding.

    Parameters
    ----------
    raw_data : IO[bytes]
        Binary file-like object holding the CSV contents.
    lkey : str
        League key; only used in the log message.
    skey : str
        Season code made of the two-digit start and end years (e.g. "2425").

    Returns
    -------
    pd.DataFrame
        The parsed games. Malformed rows are skipped with a warning instead
        of raising.

    Raises
    ------
    ValueError
        If ``skey`` cannot be converted to an integer.
    """
    logger.info("Parsing league=%s season=%s", lkey, skey)
    season_code = int(skey)
    # Season codes wrap at the century: "9394" (1993-94) is numerically larger
    # than "2425" although it is older, so a bare ``>= 2425`` comparison would
    # select UTF-8-SIG for 1990s files.
    # NOTE(review): assumes codes >= 9000 always denote 1990s seasons -- confirm.
    is_pre_2000 = season_code >= 9000
    if season_code >= 2425 and not is_pre_2000:
        # Since 2024-25, the CSV files are encoded in UTF-8-SIG
        encoding = "UTF-8-SIG"
    else:
        encoding = "latin-1"
    return pd.read_csv(raw_data, encoding=encoding, on_bad_lines="warn")


class MatchHistory(BaseRequestsReader):
"""Provides pd.DataFrames from CSV files available at http://www.football-data.co.uk/data.php.

Expand Down Expand Up @@ -92,12 +110,10 @@ def read_games(self) -> pd.DataFrame:
filepath = self.data_dir / filemask.format(lkey, skey)
url = urlmask.format(skey, lkey)
current_season = not self._is_complete(lkey, skey)

reader = self.get(url, filepath, no_cache=current_season)
df_games = _parse_csv(reader, lkey, skey).assign(season=skey)

df_games = pd.read_csv(
reader,
encoding="ISO-8859-1",
).assign(season=skey)
if "Time" not in df_games.columns:
df_games["Time"] = "12:00"
df_games["Time"] = df_games["Time"].fillna("12:00")
Expand Down
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def elo() -> sd.ClubElo:


@pytest.fixture()
def match_epl_2y() -> sd.MatchHistory:
"""Return a MatchHistory instance for the last 2 years of the EPL."""
return sd.MatchHistory("ENG-Premier League", list(range(2018, 2020)))
def match_epl_5y() -> sd.MatchHistory:
"""Return a MatchHistory instance for the last 5 years of the EPL."""
return sd.MatchHistory("ENG-Premier League", list(range(2019, 2025)))


@pytest.fixture()
Expand Down
9 changes: 5 additions & 4 deletions tests/test_MatchHistory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from soccerdata.match_history import MatchHistory


def test_read_games(match_epl_2y: MatchHistory) -> None:
def test_read_games(match_epl_5y: MatchHistory) -> None:
"""It should return a DataFrame with all games from the selected leagues and seasons."""
df = match_epl_2y.read_games()
df = match_epl_5y.read_games()
assert isinstance(df, pd.DataFrame)
assert len(df.index.get_level_values("season").unique()) == 2
assert len(df) == 760
assert len(df.index.get_level_values("season").unique()) == 5
assert len(df) == 2107
assert not any("" in c for c in df.columns)
Loading