Skip to content

data

Data loading and schema definitions.

FPLDataLoader

FPLDataLoader(cache_dir: Optional[Path] = None)

Load and manage FPL data from various sources (API, CSV, cache).

PARAMETER DESCRIPTION
cache_dir

Directory to cache downloaded data

TYPE: Optional[Path] DEFAULT: None

Source code in fplx/data/loaders.py
def __init__(self, cache_dir: Optional[Path] = None):
    """Initialise the loader and make sure the cache directory exists."""
    default_dir = Path.home() / ".fplx" / "cache"
    self.cache_dir = default_dir if cache_dir is None else cache_dir
    self.cache_dir.mkdir(parents=True, exist_ok=True)
    # Populated lazily by fetch_bootstrap_data.
    self._bootstrap_data = None

fetch_bootstrap_data

fetch_bootstrap_data(force_refresh: bool = False) -> dict

Fetch main FPL data (players, teams, gameweeks).

PARAMETER DESCRIPTION
force_refresh

Force refresh even if cached

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
dict

Bootstrap data containing players, teams, events

Source code in fplx/data/loaders.py
def fetch_bootstrap_data(self, force_refresh: bool = False) -> dict:
    """
    Fetch main FPL data (players, teams, gameweeks).

    Parameters
    ----------
    force_refresh : bool
        Force refresh even if cached

    Returns
    -------
    dict
        Bootstrap data containing players, teams, events
    """
    import json

    cache_file = self.cache_dir / "bootstrap.json"

    if not force_refresh and cache_file.exists():
        logger.info("Loading bootstrap data from cache")
        with open(cache_file) as f:
            data = json.load(f)
        # Keep the in-memory copy in sync regardless of where the data
        # came from (previously only the API path populated it).
        self._bootstrap_data = data
        return data

    logger.info("Fetching bootstrap data from FPL API")
    # A timeout prevents the call from hanging forever on a stalled API.
    response = requests.get(self.BOOTSTRAP_URL, timeout=30)
    response.raise_for_status()

    data = response.json()

    # Cache the data for subsequent runs.
    with open(cache_file, "w") as f:
        json.dump(data, f)

    self._bootstrap_data = data
    return data

load_players

load_players(force_refresh: bool = False) -> list[Player]

Load all players with basic info.

PARAMETER DESCRIPTION
force_refresh

Force refresh from API

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
list[Player]

List of Player objects

Source code in fplx/data/loaders.py
def load_players(self, force_refresh: bool = False) -> list[Player]:
    """
    Load all players with basic info.

    Parameters
    ----------
    force_refresh : bool
        Force refresh from API

    Returns
    -------
    list[Player]
        List of Player objects
    """
    data = self.fetch_bootstrap_data(force_refresh)

    # Build id -> name mappings used to denormalise the element records.
    teams = {t["id"]: t["name"] for t in data["teams"]}
    positions = {1: "GK", 2: "DEF", 3: "MID", 4: "FWD"}

    players = []
    for element in data["elements"]:
        # Create minimal timeseries (can be enriched later)
        ts_data = {
            "gameweek": [0],
            "points": [element.get("total_points", 0)],
            "minutes": [element.get("minutes", 0)],
            "form": [float(element.get("form", 0))],
        }

        # The API reports availability as a 0-100 percentage; None means
        # "no news", i.e. fully available.  Look the value up once.
        chance = element.get("chance_of_playing_next_round")
        availability = 1.0 if chance is None else chance / 100.0

        player = Player(
            id=element["id"],
            name=element["web_name"],
            team=teams[element["team"]],
            position=positions[element["element_type"]],
            price=element["now_cost"] / 10.0,  # Convert to £m
            timeseries=pd.DataFrame(ts_data),
            news={
                "text": element.get("news", ""),
                "availability": availability,
            },
        )
        players.append(player)

    # Lazy %-style args, consistent with logging elsewhere in this module.
    logger.info("Loaded %d players", len(players))
    return players

load_player_history

load_player_history(player_id: int) -> DataFrame

Load detailed historical data for a specific player.

PARAMETER DESCRIPTION
player_id

Player ID

TYPE: int

RETURNS DESCRIPTION
DataFrame

Historical gameweek stats

Source code in fplx/data/loaders.py
def load_player_history(self, player_id: int) -> pd.DataFrame:
    """
    Load detailed historical data for a specific player.

    Parameters
    ----------
    player_id : int
        Player ID

    Returns
    -------
    pd.DataFrame
        Historical gameweek stats
    """
    url = self.PLAYER_DETAIL_URL.format(player_id=player_id)
    # Bound the request so a stalled API cannot hang the caller.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    data = response.json()
    history = pd.DataFrame(data["history"])

    # Rename columns for consistency with the rest of the package
    # (identity renames such as "minutes" -> "minutes" are omitted).
    if not history.empty:
        history = history.rename(
            columns={
                "round": "gameweek",
                "total_points": "points",
                "goals_scored": "goals",
                "expected_goals": "xG",
                "expected_assists": "xA",
            }
        )

    return history

load_fixtures

load_fixtures() -> DataFrame

Load all fixtures.

RETURNS DESCRIPTION
DataFrame

Fixtures data

Source code in fplx/data/loaders.py
def load_fixtures(self) -> pd.DataFrame:
    """
    Load all fixtures.

    Returns
    -------
    pd.DataFrame
        Fixtures data
    """
    # Bound the request so a stalled API cannot hang the caller.
    response = requests.get(self.FIXTURES_URL, timeout=30)
    response.raise_for_status()

    return pd.DataFrame(response.json())

load_from_csv

load_from_csv(filepath: Path) -> DataFrame

Load data from CSV file.

PARAMETER DESCRIPTION
filepath

Path to CSV file

TYPE: Path

RETURNS DESCRIPTION
DataFrame

Loaded data

Source code in fplx/data/loaders.py
def load_from_csv(self, filepath: Path) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame.

    Parameters
    ----------
    filepath : Path
        Path to CSV file

    Returns
    -------
    pd.DataFrame
        Loaded data
    """
    logger.info("Loading data from %s", filepath)
    return pd.read_csv(filepath)

enrich_player_history

enrich_player_history(
    players: list[Player],
) -> list[Player]

Enrich players with full historical data.

PARAMETER DESCRIPTION
players

List of players to enrich

TYPE: list[Player]

RETURNS DESCRIPTION
list[Player]

Players with enriched timeseries

Source code in fplx/data/loaders.py
def enrich_player_history(self, players: list[Player]) -> list[Player]:
    """
    Enrich players with full historical data.

    Parameters
    ----------
    players : list[Player]
        List of players to enrich

    Returns
    -------
    list[Player]
        Players with enriched timeseries
    """
    enriched = []
    for player in players:
        try:
            history = self.load_player_history(player.id)
            if not history.empty:
                player.timeseries = history
        except Exception as e:
            # Best-effort enrichment: keep the player with its minimal
            # timeseries rather than failing the whole batch.
            # (Removed the spurious f-prefix on a lazy %-style log string.)
            logger.warning("Could not load history for %s: %s", player.name, e)
        # Every player is returned, enriched or not.
        enriched.append(player)

    return enriched

VaastavLoader

VaastavLoader(
    season: str = "2023-24",
    data_dir: Optional[str | Path] = None,
    cache_dir: Optional[str | Path] = None,
)

Load historical FPL data from the vaastav dataset.

PARAMETER DESCRIPTION
season

Season string, e.g. "2023-24".

TYPE: str DEFAULT: '2023-24'

data_dir

Path to a local clone. If None, fetches from GitHub.

TYPE: str or Path DEFAULT: None

cache_dir

Where to cache downloaded CSVs. Defaults to ~/.fplx/vaastav/.

TYPE: str or Path DEFAULT: None

Source code in fplx/data/vaastav_loader.py
def __init__(
    self,
    season: str = "2023-24",
    data_dir: Optional[str | Path] = None,
    cache_dir: Optional[str | Path] = None,
):
    """Set up paths and lazy caches for the vaastav dataset loader."""
    self.season = self._validate_season(season)
    self.data_dir = Path(data_dir) if data_dir else None

    # Cache inside the project tree by default so downloaded artifacts
    # stay within the workspace rather than the user's home directory.
    if cache_dir:
        self.cache_dir = Path(cache_dir)
    else:
        project_root = Path(__file__).resolve().parents[2]
        self.cache_dir = project_root / ".fplx" / "vaastav"
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # Memoised DataFrames, filled on first load.
    self._merged_gw: Optional[pd.DataFrame] = None
    self._player_raw: Optional[pd.DataFrame] = None

load_merged_gw

load_merged_gw() -> DataFrame

Load the merged gameweek file (all GWs, all players, one CSV).

RETURNS DESCRIPTION
DataFrame

One row per player-gameweek appearance.

Source code in fplx/data/vaastav_loader.py
def load_merged_gw(self) -> pd.DataFrame:
    """
    Load the merged gameweek file (all GWs, all players, one CSV).

    Returns
    -------
    pd.DataFrame
        One row per player-gameweek appearance.
    """
    if self._merged_gw is not None:
        return self._merged_gw

    df = self._read_csv("gws/merged_gw.csv")
    # DataFrame.rename ignores labels absent from df, so the map can be
    # passed directly instead of pre-filtering it per column.
    df = df.rename(columns=COLUMN_MAP)
    df = self._coalesce_duplicate_columns(df)

    if "gameweek" in df.columns:
        df["gameweek"] = pd.to_numeric(df["gameweek"], errors="coerce")

    self._merged_gw = df

    # Guard the summary log: the code above tolerates a missing
    # "gameweek" column, so the log must not assume it (or "element");
    # %d on an all-NaN min/max would also raise.
    n_players = df["element"].nunique() if "element" in df.columns else 0
    if "gameweek" in df.columns and df["gameweek"].notna().any():
        gw_lo = int(df["gameweek"].min())
        gw_hi = int(df["gameweek"].max())
    else:
        gw_lo = gw_hi = 0
    logger.info(
        "Loaded merged_gw: %d rows, %d players, GW %d-%d",
        len(df),
        n_players,
        gw_lo,
        gw_hi,
    )
    return df

load_player_raw

load_player_raw() -> DataFrame

Load season-level player metadata.

Source code in fplx/data/vaastav_loader.py
def load_player_raw(self) -> pd.DataFrame:
    """Return season-level player metadata, reading the CSV on first use."""
    if self._player_raw is None:
        self._player_raw = self._read_csv("players_raw.csv")
    return self._player_raw

load_gameweek

load_gameweek(gw: int) -> DataFrame

Load a single gameweek from merged data.

Source code in fplx/data/vaastav_loader.py
def load_gameweek(self, gw: int) -> pd.DataFrame:
    """Return a copy of the merged-gameweek rows for gameweek ``gw``."""
    merged = self.load_merged_gw()
    mask = merged["gameweek"] == gw
    return merged.loc[mask].copy()

build_player_objects

build_player_objects(
    up_to_gw: Optional[int] = None,
) -> list[Player]

Build Player objects with timeseries up to a given gameweek.

PARAMETER DESCRIPTION
up_to_gw

Only include gameweeks 1..up_to_gw. If None, include all.

TYPE: int DEFAULT: None

RETURNS DESCRIPTION
list[Player]
Source code in fplx/data/vaastav_loader.py
def build_player_objects(
    self,
    up_to_gw: Optional[int] = None,
) -> list[Player]:
    """
    Build Player objects with timeseries up to a given gameweek.

    Parameters
    ----------
    up_to_gw : int, optional
        Only include gameweeks 1..up_to_gw. If None, include all.

    Returns
    -------
    list[Player]
    """
    frame = self.load_merged_gw()
    if up_to_gw is not None:
        frame = frame[frame["gameweek"] <= up_to_gw]
    if frame.empty:
        return []

    # Per-GW columns kept in each player's timeseries (when present).
    wanted = (
        "gameweek", "points", "minutes", "starts", "goals", "assists",
        "xG", "xA", "bonus", "bps", "clean_sheets", "goals_conceded",
        "saves", "yellow_cards", "red_cards", "own_goals",
        "penalties_missed", "penalties_saved", "influence", "creativity",
        "threat", "ict_index", "was_home", "opponent_team",
        "expected_goals_conceded", "xP", "value", "selected",
        "transfers_in", "transfers_out",
    )

    players: list[Player] = []
    for pid, rows in frame.groupby("element"):
        pid = int(pid)
        rows = rows.sort_values("gameweek").reset_index(drop=True)

        # Metadata is read straight off the player's own rows.
        name = str(rows["name"].iloc[0]) if "name" in rows.columns else f"Player_{pid}"
        team = str(rows["team"].iloc[0]) if "team" in rows.columns else "Unknown"
        pos_raw = rows["position"].iloc[0] if "position" in rows.columns else "MID"
        price = rows["value"].iloc[-1] / 10.0 if "value" in rows.columns else 5.0
        position = POSITION_MAP.get(pos_raw, POSITION_MAP.get(str(pos_raw), "MID"))

        keep = [c for c in wanted if c in rows.columns]
        timeseries = rows[keep].copy()
        for col in timeseries.columns:
            timeseries[col] = pd.to_numeric(timeseries[col], errors="coerce")

        # Collapse Double Gameweeks to one row per decision period so the
        # inference pipeline (HMM, enriched, KF) sees single-game-
        # equivalent observations. See double_gameweek.py for details.
        timeseries = aggregate_dgw_timeseries(timeseries)

        players.append(
            Player(
                id=pid,
                name=name,
                team=team,
                position=position,
                price=float(price),
                timeseries=timeseries,
            )
        )

    logger.info("Built %d Player objects (up_to_gw=%s).", len(players), up_to_gw)
    return players

get_actual_points

get_actual_points(gw: int) -> dict[int, float]

Get actual points scored by each player in a specific gameweek.

For Double Gameweek players (two fixtures in the same round) the points from both fixtures are summed, which is the correct FPL score for that gameweek. The previous implementation used dict(zip(…)) which silently discarded the first fixture row when a player appeared twice, underreporting DGW scores.

RETURNS DESCRIPTION
dict[int, float]

{player_id: actual_points} (summed across fixtures for DGW players)

Source code in fplx/data/vaastav_loader.py
def get_actual_points(self, gw: int) -> dict[int, float]:
    """
    Get actual points scored by each player in a specific gameweek.

    Double Gameweek players appear once per fixture; summing their rows
    yields the true FPL score for the round, whereas building the dict
    row-by-row would keep only the last fixture.

    Returns
    -------
    dict[int, float]
        {player_id: actual_points}  (summed across fixtures for DGW players)
    """
    df = self.load_gameweek(gw)
    pts_col = "points" if "points" in df.columns else "total_points"
    # groupby/sum handles SGW (one row) and DGW (two rows) uniformly.
    totals = df.groupby("element")[pts_col].sum()
    return {int(pid): float(pts) for pid, pts in totals.items()}

get_fixture_info

get_fixture_info(gw: int) -> dict[int, dict]

Get fixture context (opponent, home/away, xP) per player for a GW.

Source code in fplx/data/vaastav_loader.py
def get_fixture_info(self, gw: int) -> dict[int, dict]:
    """Get fixture context (opponent, home/away, xP) per player for a GW."""
    df = self.load_gameweek(gw)
    # Column presence is a per-frame fact; hoist it out of the row loop.
    has_opponent = "opponent_team" in df.columns
    has_xp = "xP" in df.columns
    info: dict[int, dict] = {}
    for _, row in df.iterrows():
        pid = int(row.get("element", 0))
        info[pid] = {
            "was_home": bool(row.get("was_home", False)),
            "opponent_team": int(row.get("opponent_team", 0)) if has_opponent else 0,
            "xP": float(row.get("xP", 0.0)) if has_xp else 0.0,
        }
    return info