Skip to content

data

Data loading and schema definitions.

FPLDataLoader

FPLDataLoader(cache_dir: Optional[Path] = None)

Load and manage FPL data from various sources (API, CSV, cache).

PARAMETER DESCRIPTION
cache_dir

Directory to cache downloaded data

TYPE: Optional[Path] DEFAULT: None

Source code in fplx/data/loaders.py
def __init__(self, cache_dir: Optional[Path] = None):
    """Initialise the loader and make sure the cache directory exists."""
    default_dir = Path.home() / ".fplx" / "cache"
    self.cache_dir = default_dir if cache_dir is None else cache_dir
    self.cache_dir.mkdir(parents=True, exist_ok=True)
    # Populated lazily by fetch_bootstrap_data.
    self._bootstrap_data = None

fetch_bootstrap_data

fetch_bootstrap_data(force_refresh: bool = False) -> dict

Fetch main FPL data (players, teams, gameweeks).

PARAMETER DESCRIPTION
force_refresh

Force refresh even if cached

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
dict

Bootstrap data containing players, teams, events

Source code in fplx/data/loaders.py
def fetch_bootstrap_data(self, force_refresh: bool = False) -> dict:
    """
    Fetch main FPL data (players, teams, gameweeks).

    Parameters
    ----------
    force_refresh : bool
        Force refresh even if cached

    Returns
    -------
    dict
        Bootstrap data containing players, teams, events
    """
    import json

    cache_file = self.cache_dir / "bootstrap.json"

    if not force_refresh and cache_file.exists():
        logger.info("Loading bootstrap data from cache")
        with open(cache_file) as f:
            data = json.load(f)
        # Keep the in-memory copy in sync regardless of where the data
        # came from (previously only the API path populated it).
        self._bootstrap_data = data
        return data

    logger.info("Fetching bootstrap data from FPL API")
    # A timeout prevents the call from hanging forever on a stalled API.
    response = requests.get(self.BOOTSTRAP_URL, timeout=30)
    response.raise_for_status()

    data = response.json()

    # Cache the data for subsequent runs.
    with open(cache_file, "w") as f:
        json.dump(data, f)

    self._bootstrap_data = data
    return data

load_players

load_players(force_refresh: bool = False) -> list[Player]

Load all players with basic info.

PARAMETER DESCRIPTION
force_refresh

Force refresh from API

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
list[Player]

List of Player objects

Source code in fplx/data/loaders.py
def load_players(self, force_refresh: bool = False) -> list[Player]:
    """
    Load all players with basic info.

    Parameters
    ----------
    force_refresh : bool
        Force refresh from API

    Returns
    -------
    list[Player]
        List of Player objects
    """
    data = self.fetch_bootstrap_data(force_refresh)

    # Build id -> name mappings used to denormalise the element records.
    teams = {t["id"]: t["name"] for t in data["teams"]}
    positions = {1: "GK", 2: "DEF", 3: "MID", 4: "FWD"}

    players = []
    for element in data["elements"]:
        # Create minimal timeseries (can be enriched later)
        ts_data = {
            "gameweek": [0],
            "points": [element.get("total_points", 0)],
            "minutes": [element.get("minutes", 0)],
            "form": [float(element.get("form", 0))],
        }

        # The API reports availability as a 0-100 percentage; None means
        # "no news", i.e. fully available.  Look the value up once.
        chance = element.get("chance_of_playing_next_round")
        availability = 1.0 if chance is None else chance / 100.0

        player = Player(
            id=element["id"],
            name=element["web_name"],
            team=teams[element["team"]],
            position=positions[element["element_type"]],
            price=element["now_cost"] / 10.0,  # Convert to £m
            timeseries=pd.DataFrame(ts_data),
            news={
                "text": element.get("news", ""),
                "availability": availability,
            },
        )
        players.append(player)

    # Lazy %-style args, consistent with logging elsewhere in this module.
    logger.info("Loaded %d players", len(players))
    return players

load_player_history

load_player_history(player_id: int) -> DataFrame

Load detailed historical data for a specific player.

PARAMETER DESCRIPTION
player_id

Player ID

TYPE: int

RETURNS DESCRIPTION
DataFrame

Historical gameweek stats

Source code in fplx/data/loaders.py
def load_player_history(self, player_id: int) -> pd.DataFrame:
    """
    Load detailed historical data for a specific player.

    Parameters
    ----------
    player_id : int
        Player ID

    Returns
    -------
    pd.DataFrame
        Historical gameweek stats
    """
    url = self.PLAYER_DETAIL_URL.format(player_id=player_id)
    # Bound the request so a stalled API cannot hang the caller.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    data = response.json()
    history = pd.DataFrame(data["history"])

    # Rename columns for consistency with the rest of the package
    # (identity renames such as "minutes" -> "minutes" are omitted).
    if not history.empty:
        history = history.rename(
            columns={
                "round": "gameweek",
                "total_points": "points",
                "goals_scored": "goals",
                "expected_goals": "xG",
                "expected_assists": "xA",
            }
        )

    return history

load_fixtures

load_fixtures() -> DataFrame

Load all fixtures.

RETURNS DESCRIPTION
DataFrame

Fixtures data

Source code in fplx/data/loaders.py
def load_fixtures(self) -> pd.DataFrame:
    """
    Load all fixtures.

    Returns
    -------
    pd.DataFrame
        Fixtures data
    """
    # Bound the request so a stalled API cannot hang the caller.
    response = requests.get(self.FIXTURES_URL, timeout=30)
    response.raise_for_status()

    return pd.DataFrame(response.json())

load_from_csv

load_from_csv(filepath: Path) -> DataFrame

Load data from CSV file.

PARAMETER DESCRIPTION
filepath

Path to CSV file

TYPE: Path

RETURNS DESCRIPTION
DataFrame

Loaded data

Source code in fplx/data/loaders.py
def load_from_csv(self, filepath: Path) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame.

    Parameters
    ----------
    filepath : Path
        Path to CSV file

    Returns
    -------
    pd.DataFrame
        Loaded data
    """
    logger.info("Loading data from %s", filepath)
    return pd.read_csv(filepath)

enrich_player_history

enrich_player_history(
    players: list[Player],
) -> list[Player]

Enrich players with full historical data.

PARAMETER DESCRIPTION
players

List of players to enrich

TYPE: list[Player]

RETURNS DESCRIPTION
list[Player]

Players with enriched timeseries

Source code in fplx/data/loaders.py
def enrich_player_history(self, players: list[Player]) -> list[Player]:
    """
    Enrich players with full historical data.

    Parameters
    ----------
    players : list[Player]
        List of players to enrich

    Returns
    -------
    list[Player]
        Players with enriched timeseries
    """
    enriched = []
    for player in players:
        try:
            history = self.load_player_history(player.id)
            if not history.empty:
                player.timeseries = history
        except Exception as e:
            # Best-effort enrichment: keep the player with its minimal
            # timeseries rather than failing the whole batch.
            # (Removed the spurious f-prefix on a lazy %-style log string.)
            logger.warning("Could not load history for %s: %s", player.name, e)
        # Every player is returned, enriched or not.
        enriched.append(player)

    return enriched

VaastavLoader

VaastavLoader(
    season: str = "2023-24",
    data_dir: Optional[str | Path] = None,
    cache_dir: Optional[str | Path] = None,
)

Load historical FPL data from the vaastav dataset.

PARAMETER DESCRIPTION
season

Season string, e.g. "2023-24".

TYPE: str DEFAULT: '2023-24'

data_dir

Path to a local clone. If None, fetches from GitHub.

TYPE: str or Path DEFAULT: None

cache_dir

Where to cache downloaded CSVs. Defaults to ~/.fplx/vaastav/.

TYPE: str or Path DEFAULT: None

Source code in fplx/data/vaastav_loader.py
def __init__(
    self,
    season: str = "2023-24",
    data_dir: Optional[str | Path] = None,
    cache_dir: Optional[str | Path] = None,
):
    """Set up paths and lazy caches for the vaastav dataset loader."""
    self.season = self._validate_season(season)
    self.data_dir = Path(data_dir) if data_dir else None

    # Cache inside the project tree by default so downloaded artifacts
    # stay within the workspace rather than the user's home directory.
    if cache_dir:
        self.cache_dir = Path(cache_dir)
    else:
        project_root = Path(__file__).resolve().parents[2]
        self.cache_dir = project_root / ".fplx" / "vaastav"
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # Memoised DataFrames, filled on first load.
    self._merged_gw: Optional[pd.DataFrame] = None
    self._player_raw: Optional[pd.DataFrame] = None

load_merged_gw

load_merged_gw() -> DataFrame

Load the merged gameweek file (all GWs, all players, one CSV).

RETURNS DESCRIPTION
DataFrame

One row per player-gameweek appearance.

Source code in fplx/data/vaastav_loader.py
def load_merged_gw(self) -> pd.DataFrame:
    """
    Load the merged gameweek file (all GWs, all players, one CSV).

    Returns
    -------
    pd.DataFrame
        One row per player-gameweek appearance.
    """
    if self._merged_gw is not None:
        return self._merged_gw

    df = self._read_csv("gws/merged_gw.csv")
    # DataFrame.rename ignores labels absent from df, so the map can be
    # passed directly instead of pre-filtering it per column.
    df = df.rename(columns=COLUMN_MAP)
    df = self._coalesce_duplicate_columns(df)

    if "gameweek" in df.columns:
        df["gameweek"] = pd.to_numeric(df["gameweek"], errors="coerce")

    self._merged_gw = df

    # Guard the summary log: the code above tolerates a missing
    # "gameweek" column, so the log must not assume it (or "element");
    # %d on an all-NaN min/max would also raise.
    n_players = df["element"].nunique() if "element" in df.columns else 0
    if "gameweek" in df.columns and df["gameweek"].notna().any():
        gw_lo = int(df["gameweek"].min())
        gw_hi = int(df["gameweek"].max())
    else:
        gw_lo = gw_hi = 0
    logger.info(
        "Loaded merged_gw: %d rows, %d players, GW %d-%d",
        len(df),
        n_players,
        gw_lo,
        gw_hi,
    )
    return df

load_player_raw

load_player_raw() -> DataFrame

Load season-level player metadata.

Source code in fplx/data/vaastav_loader.py
def load_player_raw(self) -> pd.DataFrame:
    """Return season-level player metadata, reading the CSV on first use."""
    if self._player_raw is None:
        self._player_raw = self._read_csv("players_raw.csv")
    return self._player_raw

load_gameweek

load_gameweek(gw: int) -> DataFrame

Load a single gameweek from merged data.

Source code in fplx/data/vaastav_loader.py
def load_gameweek(self, gw: int) -> pd.DataFrame:
    """Return a copy of the merged-gameweek rows for gameweek ``gw``."""
    merged = self.load_merged_gw()
    mask = merged["gameweek"] == gw
    return merged.loc[mask].copy()

build_player_objects

build_player_objects(
    up_to_gw: Optional[int] = None,
) -> list[Player]

Build Player objects with timeseries up to a given gameweek.

PARAMETER DESCRIPTION
up_to_gw

Only include gameweeks 1..up_to_gw. If None, include all.

TYPE: int DEFAULT: None

RETURNS DESCRIPTION
list[Player]
Source code in fplx/data/vaastav_loader.py
def build_player_objects(
    self,
    up_to_gw: Optional[int] = None,
) -> list[Player]:
    """
    Build Player objects with timeseries up to a given gameweek.

    Parameters
    ----------
    up_to_gw : int, optional
        Only include gameweeks 1..up_to_gw. If None, include all.

    Returns
    -------
    list[Player]
    """
    frame = self.load_merged_gw()
    if up_to_gw is not None:
        frame = frame[frame["gameweek"] <= up_to_gw]
    if frame.empty:
        return []

    # Per-GW columns kept in each player's timeseries (when present).
    wanted = (
        "gameweek", "points", "minutes", "starts", "goals", "assists",
        "xG", "xA", "bonus", "bps", "clean_sheets", "goals_conceded",
        "saves", "yellow_cards", "red_cards", "own_goals",
        "penalties_missed", "penalties_saved", "influence", "creativity",
        "threat", "ict_index", "was_home", "opponent_team",
        "expected_goals_conceded", "xP", "value", "selected",
        "transfers_in", "transfers_out",
    )

    players: list[Player] = []
    for pid, rows in frame.groupby("element"):
        pid = int(pid)
        rows = rows.sort_values("gameweek").reset_index(drop=True)

        # Metadata is read straight off the player's own rows.
        name = str(rows["name"].iloc[0]) if "name" in rows.columns else f"Player_{pid}"
        team = str(rows["team"].iloc[0]) if "team" in rows.columns else "Unknown"
        pos_raw = rows["position"].iloc[0] if "position" in rows.columns else "MID"
        price = rows["value"].iloc[-1] / 10.0 if "value" in rows.columns else 5.0
        position = POSITION_MAP.get(pos_raw, POSITION_MAP.get(str(pos_raw), "MID"))

        keep = [c for c in wanted if c in rows.columns]
        timeseries = rows[keep].copy()
        for col in timeseries.columns:
            timeseries[col] = pd.to_numeric(timeseries[col], errors="coerce")

        # Collapse Double Gameweeks to one row per decision period so the
        # inference pipeline (HMM, enriched, KF) sees single-game-
        # equivalent observations. See double_gameweek.py for details.
        timeseries = aggregate_dgw_timeseries(timeseries)

        players.append(
            Player(
                id=pid,
                name=name,
                team=team,
                position=position,
                price=float(price),
                timeseries=timeseries,
            )
        )

    logger.info("Built %d Player objects (up_to_gw=%s).", len(players), up_to_gw)
    return players

get_actual_points

get_actual_points(gw: int) -> dict[int, float]

Get actual points scored by each player in a specific gameweek.

For Double Gameweek players (two fixtures in the same round) the points from both fixtures are summed, which is the correct FPL score for that gameweek. The previous implementation used dict(zip(…)) which silently discarded the first fixture row when a player appeared twice, underreporting DGW scores.

RETURNS DESCRIPTION
dict[int, float]

{player_id: actual_points} (summed across fixtures for DGW players)

Source code in fplx/data/vaastav_loader.py
def get_actual_points(self, gw: int) -> dict[int, float]:
    """
    Get actual points scored by each player in a specific gameweek.

    Double Gameweek players appear once per fixture; summing their rows
    yields the true FPL score for the round, whereas building the dict
    row-by-row would keep only the last fixture.

    Returns
    -------
    dict[int, float]
        {player_id: actual_points}  (summed across fixtures for DGW players)
    """
    df = self.load_gameweek(gw)
    pts_col = "points" if "points" in df.columns else "total_points"
    # groupby/sum handles SGW (one row) and DGW (two rows) uniformly.
    totals = df.groupby("element")[pts_col].sum()
    return {int(pid): float(pts) for pid, pts in totals.items()}

get_fixture_info

get_fixture_info(gw: int) -> dict[int, dict]

Get fixture context (opponent, home/away, xP) per player for a GW.

Source code in fplx/data/vaastav_loader.py
def get_fixture_info(self, gw: int) -> dict[int, dict]:
    """Get fixture context (opponent, home/away, xP) per player for a GW."""
    df = self.load_gameweek(gw)
    # Column presence is a per-frame fact; hoist it out of the row loop.
    has_opponent = "opponent_team" in df.columns
    has_xp = "xP" in df.columns
    info: dict[int, dict] = {}
    for _, row in df.iterrows():
        pid = int(row.get("element", 0))
        info[pid] = {
            "was_home": bool(row.get("was_home", False)),
            "opponent_team": int(row.get("opponent_team", 0)) if has_opponent else 0,
            "xP": float(row.get("xP", 0.0)) if has_xp else 0.0,
        }
    return info