Skip to content

vaastav_loader

vaastav_loader

Loader for the vaastav/Fantasy-Premier-League dataset.

Supports two modes: 1. Remote: fetch CSVs directly from GitHub (no clone needed). 2. Local: read from a cloned repo directory.

Usage (remote): loader = VaastavLoader(season="2023-24") players = loader.build_player_objects(up_to_gw=20)

Usage (local): loader = VaastavLoader(season="2023-24", data_dir="./Fantasy-Premier-League") players = loader.build_player_objects(up_to_gw=20)

Dataset: https://github.com/vaastav/Fantasy-Premier-League

Double Gameweek handling

build_player_objects automatically calls aggregate_dgw_timeseries on every player's raw timeseries before constructing the Player object. This means all downstream consumers (inference pipeline, MV-HMM, enriched predictor, Kalman Filter) always receive exactly one row per FPL decision period.

For DGW gameweeks, the resulting row contains: `points` — raw total (both fixtures summed; used for scoring / oracle); `points_norm` — per-fixture average (used by inference components); `n_fixtures` — number of fixtures played (1 for SGW, 2 for DGW).

The inference pipeline uses points_norm so that HMM emission distributions remain calibrated on single-game-equivalent observations. The ILP objective then scales back via scale_predictions_for_dgw to reflect the full DGW opportunity.

VaastavLoader

VaastavLoader(
    season: str = "2023-24",
    data_dir: Optional[str | Path] = None,
    cache_dir: Optional[str | Path] = None,
)

Load historical FPL data from the vaastav dataset.

PARAMETER DESCRIPTION
season

Season string, e.g. "2023-24".

TYPE: str DEFAULT: '2023-24'

data_dir

Path to a local clone. If None, fetches from GitHub.

TYPE: str or Path DEFAULT: None

cache_dir

Where to cache downloaded CSVs. Defaults to `.fplx/vaastav/` under the project root (kept project-local so artifacts stay within the workspace).

TYPE: str or Path DEFAULT: None

Source code in fplx/data/vaastav_loader.py
def __init__(
    self,
    season: str = "2023-24",
    data_dir: Optional[str | Path] = None,
    cache_dir: Optional[str | Path] = None,
):
    """Initialise the loader, validate the season, and prepare the cache dir."""
    self.season = self._validate_season(season)
    self.data_dir = Path(data_dir) if data_dir else None

    # Default cache is project-local (two levels above this module) so that
    # downloaded artifacts stay inside the workspace rather than $HOME.
    if cache_dir:
        self.cache_dir = Path(cache_dir)
    else:
        self.cache_dir = Path(__file__).resolve().parents[2] / ".fplx" / "vaastav"
    self.cache_dir.mkdir(parents=True, exist_ok=True)

    # Lazy per-instance caches, populated on first load_* call.
    self._merged_gw: Optional[pd.DataFrame] = None
    self._player_raw: Optional[pd.DataFrame] = None

load_merged_gw

load_merged_gw() -> DataFrame

Load the merged gameweek file (all GWs, all players, one CSV).

RETURNS DESCRIPTION
DataFrame

One row per player-gameweek appearance.

Source code in fplx/data/vaastav_loader.py
def load_merged_gw(self) -> pd.DataFrame:
    """
    Load the merged gameweek file (all GWs, all players, one CSV).

    The frame is cached on the instance, so repeated calls do not re-read.

    Returns
    -------
    pd.DataFrame
        One row per player-gameweek appearance.
    """
    if self._merged_gw is not None:
        return self._merged_gw

    df = self._read_csv("gws/merged_gw.csv")
    # Normalise vaastav column names to the canonical schema.
    df = df.rename(columns={c: COLUMN_MAP.get(c, c) for c in df.columns})
    df = self._coalesce_duplicate_columns(df)

    if "gameweek" in df.columns:
        df["gameweek"] = pd.to_numeric(df["gameweek"], errors="coerce")

    self._merged_gw = df
    # Fix: the summary log previously evaluated df["element"].nunique() and
    # df["gameweek"].min()/max() unconditionally. Log arguments are built
    # eagerly, so a missing column raised KeyError (the guard above shows
    # "gameweek" can be absent), and an all-NaN coerced column yields NaN,
    # which breaks the %d format. Only emit the detailed summary when it is
    # well-defined; otherwise fall back to a row-count line.
    if {"element", "gameweek"}.issubset(df.columns) and df["gameweek"].notna().any():
        logger.info(
            "Loaded merged_gw: %d rows, %d players, GW %d-%d",
            len(df),
            df["element"].nunique(),
            int(df["gameweek"].min()),
            int(df["gameweek"].max()),
        )
    else:
        logger.info("Loaded merged_gw: %d rows", len(df))
    return df

load_player_raw

load_player_raw() -> DataFrame

Load season-level player metadata.

Source code in fplx/data/vaastav_loader.py
def load_player_raw(self) -> pd.DataFrame:
    """Return season-level player metadata, reading the CSV on first access."""
    if self._player_raw is None:
        # Cache on the instance so repeated calls hit memory, not disk/network.
        self._player_raw = self._read_csv("players_raw.csv")
    return self._player_raw

load_gameweek

load_gameweek(gw: int) -> DataFrame

Load a single gameweek from merged data.

Source code in fplx/data/vaastav_loader.py
def load_gameweek(self, gw: int) -> pd.DataFrame:
    """Return a copy of the merged-gameweek rows belonging to one gameweek."""
    merged = self.load_merged_gw()
    mask = merged["gameweek"] == gw
    return merged.loc[mask].copy()

build_player_objects

build_player_objects(
    up_to_gw: Optional[int] = None,
) -> list[Player]

Build Player objects with timeseries up to a given gameweek.

PARAMETER DESCRIPTION
up_to_gw

Only include gameweeks 1..up_to_gw. If None, include all.

TYPE: int DEFAULT: None

RETURNS DESCRIPTION
list[Player]
Source code in fplx/data/vaastav_loader.py
def build_player_objects(
    self,
    up_to_gw: Optional[int] = None,
) -> list[Player]:
    """
    Build Player objects with timeseries up to a given gameweek.

    Each player's per-fixture rows are collapsed to one row per gameweek
    via ``aggregate_dgw_timeseries`` before the ``Player`` is constructed,
    so every consumer sees exactly one row per FPL decision period.

    Parameters
    ----------
    up_to_gw : int, optional
        Only include gameweeks 1..up_to_gw. If None, include all.

    Returns
    -------
    list[Player]
    """
    all_gw = self.load_merged_gw()

    if up_to_gw is not None:
        all_gw = all_gw[all_gw["gameweek"] <= up_to_gw]

    if all_gw.empty:
        return []

    players: list[Player] = []
    # One group per player id ("element" is the FPL player identifier).
    grouped = all_gw.groupby("element")

    for pid, grp in grouped:
        pid = int(pid)
        # Chronological order so .iloc[0] is the earliest row and
        # .iloc[-1] the latest.
        grp = grp.sort_values("gameweek").reset_index(drop=True)

        # Player metadata from the row itself.
        # Name/team/position come from the first row; price uses the
        # latest "value" because FPL prices drift over the season.
        # value is in tenths of a million, hence the /10.0.
        name = str(grp["name"].iloc[0]) if "name" in grp.columns else f"Player_{pid}"
        team = str(grp["team"].iloc[0]) if "team" in grp.columns else "Unknown"
        pos_raw = grp["position"].iloc[0] if "position" in grp.columns else "MID"
        price = grp["value"].iloc[-1] / 10.0 if "value" in grp.columns else 5.0

        # Try the raw value first (may be a non-string code), then its
        # string form; unmapped positions fall back to midfielder.
        position = POSITION_MAP.get(pos_raw, POSITION_MAP.get(str(pos_raw), "MID"))

        # Build timeseries with available columns (skip any the season's
        # CSV schema does not provide).
        keep = [
            c
            for c in [
                "gameweek",
                "points",
                "minutes",
                "starts",
                "goals",
                "assists",
                "xG",
                "xA",
                "bonus",
                "bps",
                "clean_sheets",
                "goals_conceded",
                "saves",
                "yellow_cards",
                "red_cards",
                "own_goals",
                "penalties_missed",
                "penalties_saved",
                "influence",
                "creativity",
                "threat",
                "ict_index",
                "was_home",
                "opponent_team",
                "expected_goals_conceded",
                "xP",
                "value",
                "selected",
                "transfers_in",
                "transfers_out",
            ]
            if c in grp.columns
        ]
        timeseries = grp[keep].copy()
        # Coerce everything numeric; unparsable entries become NaN rather
        # than raising.
        for col in timeseries.columns:
            timeseries[col] = pd.to_numeric(timeseries[col], errors="coerce")

        # ── DGW aggregation ───────────────────────────────────────────
        # Always collapse to one row per GW decision period.
        # DGW gameweeks receive per-fixture normalised scores so that the
        # inference pipeline (HMM, enriched, KF) operates on single-game-
        # equivalent observations. See double_gameweek.py for details.
        timeseries = aggregate_dgw_timeseries(timeseries)

        player = Player(
            id=pid,
            name=name,
            team=team,
            position=position,
            price=float(price),
            timeseries=timeseries,
        )
        players.append(player)

    logger.info("Built %d Player objects (up_to_gw=%s).", len(players), up_to_gw)
    return players

get_actual_points

get_actual_points(gw: int) -> dict[int, float]

Get actual points scored by each player in a specific gameweek.

For Double Gameweek players (two fixtures in the same round) the points from both fixtures are summed, which is the correct FPL score for that gameweek. The previous implementation used dict(zip(…)) which silently discarded the first fixture row when a player appeared twice, underreporting DGW scores.

RETURNS DESCRIPTION
dict[int, float]

{player_id: actual_points} (summed across fixtures for DGW players)

Source code in fplx/data/vaastav_loader.py
def get_actual_points(self, gw: int) -> dict[int, float]:
    """
    Actual points scored by each player in one gameweek.

    Double Gameweek players appear on two fixture rows in the raw data;
    those rows are **summed**, which is the correct FPL score for the
    gameweek. The previous implementation used ``dict(zip(…))``
    which silently discarded the first fixture row when a player appeared
    twice, underreporting DGW scores.

    Returns
    -------
    dict[int, float]
        {player_id: actual_points}  (summed across fixtures for DGW players)
    """
    rows = self.load_gameweek(gw)
    col = "points" if "points" in rows.columns else "total_points"
    # groupby + sum is correct for both SGW (one row) and DGW (two rows).
    totals = rows.groupby("element")[col].sum()
    return {int(pid): float(pts) for pid, pts in totals.items()}

get_fixture_info

get_fixture_info(gw: int) -> dict[int, dict]

Get fixture context (opponent, home/away, xP) per player for a GW.

Source code in fplx/data/vaastav_loader.py
def get_fixture_info(self, gw: int) -> dict[int, dict]:
    """
    Get fixture context (opponent, home/away, xP) per player for a GW.

    Note: for Double Gameweek players (two rows in the same GW) the dict
    keeps the last fixture row encountered, since later rows overwrite
    earlier ones keyed on the same player id. Callers needing both
    fixtures should use load_gameweek directly.

    Returns
    -------
    dict[int, dict]
        {player_id: {"was_home": bool, "opponent_team": int, "xP": float}}
    """
    df = self.load_gameweek(gw)
    # Hoist column-membership checks out of the per-row loop.
    has_opp = "opponent_team" in df.columns
    has_xp = "xP" in df.columns
    info: dict[int, dict] = {}
    for _, row in df.iterrows():
        pid = int(row.get("element", 0))
        opp = row.get("opponent_team", 0) if has_opp else 0
        xp = row.get("xP", 0.0) if has_xp else 0.0
        info[pid] = {
            "was_home": bool(row.get("was_home", False)),
            # Fix: int(NaN) raises ValueError; CSV cells can be blank, so
            # NaN entries are mapped to the 0 / 0.0 defaults instead.
            "opponent_team": 0 if pd.isna(opp) else int(opp),
            "xP": 0.0 if pd.isna(xp) else float(xp),
        }
    return info