# perfect-postcode/pipeline/download/gias.py

"""Download the Get Information About Schools (GIAS) full establishments extract.

GIAS is the DfE register of all educational establishments in England, updated
daily. The CSV is generated on-demand via a four-step interaction with the
public Downloads page (there is no static URL):

1.  GET /Downloads — extract anti-forgery token, the `all.edubase.data` tag,
    and the FileGeneratedDate that the server expects for that tag today.
2.  POST /Downloads/Collate — submit the form to start file generation. The
    redirect URL contains a generation UUID.
3.  Poll /Downloads/GenerateAjax/{id} until status:true.
4.  GET the Azure blob URL with ?id={id} — returns a ZIP containing
    `edubasealldataYYYYMMDD.csv`.

The CSV is cp1252-encoded with 135 columns. We keep the fields useful for a
schools map (identification, status, phase, age range, religious character,
admissions policy, headline figures, contact details) and project Easting/
Northing (EPSG:27700) to WGS84 lat/lng.
"""

import argparse
import io
import json
import re
import time
import zipfile
from pathlib import Path

import httpx
import polars as pl
from pyproj import Transformer

from pipeline.local_temp import local_tmp_dir

# --- GIAS endpoints ---------------------------------------------------------
# Every step of the generation flow hangs off the public Downloads page.
BASE_URL = "https://get-information-schools.service.gov.uk"
DOWNLOADS_URL = BASE_URL + "/Downloads"
COLLATE_URL = DOWNLOADS_URL + "/Collate"
AJAX_URL = DOWNLOADS_URL + "/GenerateAjax"
# The finished archive is fetched from a separate Azure-hosted endpoint.
AZURE_FILE_URL = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/File.xhtml"
# Tag submitted in the Collate form to select the full establishments extract.
EXTRACT_TAG = "all.edubase.data"

# Browser-like User-Agent sent with every request.
USER_AGENT = " ".join(
    [
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]
)

# --- Polling ----------------------------------------------------------------
# Seconds between status polls, and the overall budget before giving up.
POLL_INTERVAL_S = 2.0
POLL_TIMEOUT_S = 300.0

# --- HTML / URL scraping patterns -------------------------------------------
# Anti-forgery token: hidden <input> on the Downloads page.
_TOKEN_RE = re.compile(
    r'name="__RequestVerificationToken"[^>]*value="([^"]+)"',
    flags=re.I,
)
# FileGeneratedDate of the first (index 0) download entry on the page.
_GEN_DATE_RE = re.compile(
    r'Downloads_0__FileGeneratedDate"[^>]*value="([^"]+)"',
    flags=re.I,
)
# 36-character generation UUID as it appears in the post-Collate URL/body.
_GEN_ID_RE = re.compile(r"/Downloads/Generated/([0-9a-f-]{36})", flags=re.I)

# Columns to read from the CSV (the file has 135; we keep what is useful for a
# schools map and contact card). Names must match the CSV header verbatim.
_CSV_COLUMNS: list[str] = [
    # Identification, type and status
    "URN",
    "EstablishmentName",
    "TypeOfEstablishment (name)",
    "EstablishmentTypeGroup (name)",
    "EstablishmentStatus (name)",
    "PhaseOfEducation (name)",
    # Age range and provision
    "StatutoryLowAge",
    "StatutoryHighAge",
    "NurseryProvision (name)",
    "OfficialSixthForm (name)",
    # Character and admissions
    "Gender (name)",
    "ReligiousCharacter (name)",
    "AdmissionsPolicy (name)",
    # Headline figures
    "SchoolCapacity",
    "NumberOfPupils",
    "PercentageFSM",
    "Trusts (name)",
    # Address
    "Street",
    "Locality",
    "Town",
    "County (name)",
    "Postcode",
    # Contact details
    "SchoolWebsite",
    "TelephoneNum",
    "HeadTitle (name)",
    "HeadFirstName",
    "HeadLastName",
    # Location (projected to lat/lng by _project_easting_northing)
    "Easting",
    "Northing",
    "LA (name)",
]

# Cell values treated as null when parsing the CSV (passed to `pl.read_csv`
# as `null_values`).
_NULL_VALUES = ["", "NULL", "Not applicable", "Does not apply"]

# Shared transformer from EPSG:27700 (Easting/Northing) to EPSG:4326 (WGS84).
# `always_xy=True` fixes the axis order to (x, y): callers pass
# (easting, northing) and receive (lng, lat).
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)


def _extract_token(html: str) -> str:
    """Return the anti-forgery token embedded in the Downloads page HTML.

    Raises:
        RuntimeError: If no ``__RequestVerificationToken`` input is present.
    """
    if (found := _TOKEN_RE.search(html)) is not None:
        return found.group(1)
    raise RuntimeError("Could not find __RequestVerificationToken on GIAS page")


def _extract_file_generated_date(html: str) -> str:
    """Return the FileGeneratedDate form value scraped from the Downloads page.

    The value is echoed back verbatim when the Collate form is submitted.

    Raises:
        RuntimeError: If the hidden FileGeneratedDate field is not present.
    """
    if (found := _GEN_DATE_RE.search(html)) is not None:
        return found.group(1)
    raise RuntimeError(
        "Could not find FileGeneratedDate for the establishments extract"
    )


def _start_generation(client: httpx.Client) -> str:
    """Kick off extract generation and return the generation UUID.

    Loads the Downloads page to scrape the anti-forgery token and the
    FileGeneratedDate, posts the Collate form, then looks for the UUID first
    in the final (post-redirect) URL and, failing that, in the response body.

    Raises:
        httpx.HTTPStatusError: If either request returns an error status.
        RuntimeError: If the token, the date, or the UUID cannot be found.
    """
    page = client.get(DOWNLOADS_URL)
    page.raise_for_status()
    html = page.text

    form = {
        "__RequestVerificationToken": _extract_token(html),
        "Downloads[0].Tag": EXTRACT_TAG,
        "Downloads[0].FileGeneratedDate": _extract_file_generated_date(html),
        "Downloads[0].Selected": "true",
    }
    collated = client.post(COLLATE_URL, data=form, follow_redirects=True)
    collated.raise_for_status()

    # Prefer the redirect target; only scan the body when the URL has no UUID.
    found = _GEN_ID_RE.search(str(collated.url))
    if found is None:
        found = _GEN_ID_RE.search(collated.text)
    if found is None:
        raise RuntimeError("GIAS Collate did not yield a generation UUID")
    return found.group(1)


def _wait_for_generation(client: httpx.Client, generation_id: str) -> None:
    """Poll the GenerateAjax endpoint until the extract is reported ready.

    Returns as soon as a poll reports ``status: true``; otherwise sleeps
    ``POLL_INTERVAL_S`` seconds between polls.

    Raises:
        httpx.HTTPStatusError: If a poll returns an error status.
        RuntimeError: If ``POLL_TIMEOUT_S`` elapses before the file is ready.
    """
    status_url = f"{AJAX_URL}/{generation_id}"
    ajax_headers = {"X-Requested-With": "XMLHttpRequest"}
    give_up_at = time.monotonic() + POLL_TIMEOUT_S

    while True:
        if time.monotonic() >= give_up_at:
            raise RuntimeError(
                f"GIAS extract generation timed out after {POLL_TIMEOUT_S:.0f}s"
            )
        reply = client.get(status_url, headers=ajax_headers)
        reply.raise_for_status()
        # The body is double-encoded: the outer JSON document is a string that
        # itself contains JSON (e.g. `{"status":true,...}`), so decode twice.
        state = json.loads(reply.json())
        if state.get("status") is True:
            return
        time.sleep(POLL_INTERVAL_S)


def _download_zip(client: httpx.Client, generation_id: str) -> bytes:
    """Fetch the generated archive and return its raw bytes.

    Raises:
        httpx.HTTPStatusError: If the request returns an error status.
        RuntimeError: If the body does not begin with the ZIP magic ``PK``.
    """
    reply = client.get(AZURE_FILE_URL, params={"id": generation_id})
    reply.raise_for_status()
    body = reply.content
    # Guard against an HTML error page being served with a success status.
    if body[:2] != b"PK":
        raise RuntimeError("GIAS Azure response was not a ZIP archive")
    return body


def fetch_extract_zip() -> bytes:
    """Drive the whole GIAS flow (start, poll, download) and return the ZIP bytes.

    A single HTTP client is shared across all steps and is closed before
    returning.
    """
    timeouts = httpx.Timeout(30.0, read=120.0)
    with httpx.Client(headers={"User-Agent": USER_AGENT}, timeout=timeouts) as session:
        job_id = _start_generation(session)
        _wait_for_generation(session, job_id)
        archive = _download_zip(session, job_id)
    return archive


def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
    """Read the selected columns of the first CSV in the archive as strings.

    Every column is returned as ``pl.String``. Dtype inference is switched
    off on purpose:

    * identifier-like text made only of digits (telephone numbers in
      particular) can otherwise be inferred as integers, which silently drops
      the leading zero;
    * inference only samples a prefix of the file, so a non-numeric value
      appearing after the sample in a numeric-looking column aborts the read.

    Callers are expected to cast the numeric columns they need explicitly.

    Args:
        zip_bytes: Raw bytes of the GIAS extract ZIP.

    Returns:
        DataFrame restricted to ``_CSV_COLUMNS``, with the markers in
        ``_NULL_VALUES`` parsed as null.

    Raises:
        RuntimeError: If the archive contains no ``.csv`` member.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        csv_names = [name for name in archive.namelist() if name.lower().endswith(".csv")]
        if not csv_names:
            raise RuntimeError("GIAS ZIP did not contain a CSV file")
        data = archive.read(csv_names[0])

    # The extract is cp1252-encoded; decode up front and hand polars text.
    text = data.decode("cp1252")
    return pl.read_csv(
        io.StringIO(text),
        columns=_CSV_COLUMNS,
        # 0 disables inference: every column is read as pl.String.
        infer_schema_length=0,
        null_values=_NULL_VALUES,
        truncate_ragged_lines=True,
    )


def _project_easting_northing(
    easting: pl.Series, northing: pl.Series
) -> tuple[list[float | None], list[float | None]]:
    """Project Easting/Northing (EPSG:27700) to WGS84 latitude/longitude.

    A row yields ``None`` for both coordinates when:

    * either input is missing (polars nulls surface as NaN after
      ``to_numpy()``, so an ``is None`` test alone never fires);
    * the easting is 0, which GIAS uses for "no location"; or
    * the projection itself fails (pyproj reports this as ``inf`` rather
      than raising).

    Args:
        easting: Eastings, one per establishment.
        northing: Northings, aligned with ``easting``.

    Returns:
        ``(lat, lng)``: two lists aligned with the inputs.
    """
    import math

    def _unusable(value: object) -> bool:
        # True for None, NaN and +/-inf.
        return value is None or not math.isfinite(value)

    e = easting.to_numpy()
    n = northing.to_numpy()
    # always_xy transformer: (easting, northing) in, (lng, lat) out.
    lng, lat = _to_wgs84.transform(e, n)

    lat_out: list[float | None] = []
    lng_out: list[float | None] = []
    for east_val, north_val, lat_val, lng_val in zip(e, n, lat, lng):
        if (
            _unusable(east_val)
            or _unusable(north_val)
            or float(east_val) == 0.0
            or _unusable(lat_val)
            or _unusable(lng_val)
        ):
            lat_out.append(None)
            lng_out.append(None)
        else:
            lat_out.append(float(lat_val))
            lng_out.append(float(lng_val))
    return lat_out, lng_out


def _format_age_range(low: int | None, high: int | None) -> str | None:
    if low is None and high is None:
        return None
    if low is None:
        return f"up to {high}"
    if high is None:
        return f"{low}+"
    return f"{low}–{high}"


def _format_address(street: str | None, locality: str | None, town: str | None) -> str | None:
    parts = [part.strip() for part in (street, locality, town) if part]
    parts = [part for part in parts if part]
    return ", ".join(parts) if parts else None


def _format_head_name(title: str | None, first: str | None, last: str | None) -> str | None:
    parts = [part.strip() for part in (title, first, last) if part]
    parts = [part for part in parts if part]
    return " ".join(parts) if parts else None


def transform(zip_bytes: bytes) -> pl.DataFrame:
    """Convert the GIAS extract ZIP into a clean schools DataFrame.

    Keeps only establishments whose status is exactly ``"Open"`` and whose
    Easting/Northing are both present and positive, projects them to WGS84,
    and returns one row per establishment with snake_case columns (``urn``,
    ``name``, ``lat``, ``lng``, ``phase``, ... ``head_name``).

    Args:
        zip_bytes: Raw bytes of the GIAS extract ZIP.

    Returns:
        The cleaned DataFrame. Rows whose projected ``lat``/``lng`` are null
        or non-finite are dropped.
    """
    raw = _read_csv_from_zip(zip_bytes)

    # Filter to currently-open establishments; the CSV also includes closed,
    # proposed-to-open, and proposed-to-close rows we do not want on a map.
    df = raw.filter(pl.col("EstablishmentStatus (name)") == "Open")

    # Normalise dtypes. Non-strict casts turn unparseable cells into null
    # instead of failing the whole run; URN is cast strictly because every row
    # must have a valid identifier.
    df = df.with_columns(
        pl.col("URN").cast(pl.Int64),
        pl.col("StatutoryLowAge").cast(pl.Int32, strict=False),
        pl.col("StatutoryHighAge").cast(pl.Int32, strict=False),
        pl.col("SchoolCapacity").cast(pl.Int32, strict=False),
        pl.col("NumberOfPupils").cast(pl.Int32, strict=False),
        pl.col("Easting").cast(pl.Float64, strict=False),
        pl.col("Northing").cast(pl.Float64, strict=False),
        # Tolerate an optional trailing "%" on the free-school-meals figure.
        pl.col("PercentageFSM")
        .cast(pl.String)
        .str.replace_all("%", "", literal=True)
        .str.strip_chars()
        .cast(pl.Float32, strict=False),
    )

    # Drop rows without coordinates — a small number of historic/dummy entries
    # have Easting=0 which would map to the Atlantic.
    df = df.filter(
        pl.col("Easting").is_not_null()
        & pl.col("Northing").is_not_null()
        & (pl.col("Easting") > 0)
        & (pl.col("Northing") > 0)
    )

    lat, lng = _project_easting_northing(df["Easting"], df["Northing"])

    # Derived display strings are built row by row with the small formatters.
    age_range = [
        _format_age_range(low, high)
        for low, high in zip(df["StatutoryLowAge"].to_list(), df["StatutoryHighAge"].to_list())
    ]
    address = [
        _format_address(street, locality, town)
        for street, locality, town in zip(
            df["Street"].to_list(),
            df["Locality"].to_list(),
            df["Town"].to_list(),
        )
    ]
    head_name = [
        _format_head_name(title, first, last)
        for title, first, last in zip(
            df["HeadTitle (name)"].to_list(),
            df["HeadFirstName"].to_list(),
            df["HeadLastName"].to_list(),
        )
    ]

    out = pl.DataFrame(
        {
            "urn": df["URN"],
            "name": df["EstablishmentName"],
            "lat": pl.Series(lat, dtype=pl.Float64),
            "lng": pl.Series(lng, dtype=pl.Float64),
            "phase": df["PhaseOfEducation (name)"],
            "type": df["TypeOfEstablishment (name)"],
            "type_group": df["EstablishmentTypeGroup (name)"],
            "age_range": pl.Series(age_range, dtype=pl.String),
            "gender": df["Gender (name)"],
            "religious_character": df["ReligiousCharacter (name)"],
            "admissions_policy": df["AdmissionsPolicy (name)"],
            "nursery_provision": df["NurseryProvision (name)"],
            "sixth_form": df["OfficialSixthForm (name)"],
            "capacity": df["SchoolCapacity"],
            "pupils": df["NumberOfPupils"],
            "fsm_percent": df["PercentageFSM"],
            "trust": df["Trusts (name)"],
            "address": pl.Series(address, dtype=pl.String),
            "postcode": df["Postcode"],
            "local_authority": df["LA (name)"],
            "website": df["SchoolWebsite"],
            "telephone": df["TelephoneNum"],
            "head_name": pl.Series(head_name, dtype=pl.String),
        }
    )

    # Drop any remaining rows where projection failed (extremely rare). A
    # failed projection can surface as inf/NaN rather than null, so test for
    # finiteness: `is_finite()` is null for null inputs and `filter` discards
    # rows whose predicate is not True, so nulls are dropped as well.
    return out.filter(pl.col("lat").is_finite() & pl.col("lng").is_finite())


def main() -> None:
    """CLI entry point: download the GIAS extract and write it as parquet.

    The raw ZIP is also written to ``<local tmp>/gias/edubase.zip`` before the
    transform runs.
    """
    cli = argparse.ArgumentParser(
        description="Download the GIAS full establishments extract → parquet"
    )
    cli.add_argument("--output", type=Path, required=True, help="Output parquet file path")
    output_path = cli.parse_args().output

    zip_copy = local_tmp_dir() / "gias" / "edubase.zip"
    zip_copy.parent.mkdir(parents=True, exist_ok=True)

    print("Fetching GIAS extract...")
    payload = fetch_extract_zip()
    zip_copy.write_bytes(payload)
    size_mb = len(payload) / (1024 * 1024)
    print(f"Downloaded {size_mb:.1f} MB to {zip_copy}")

    print("Transforming...")
    schools = transform(payload)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    schools.write_parquet(output_path, compression="zstd")
    print(f"Wrote {output_path} ({len(schools):,} open establishments)")


if __name__ == "__main__":
    main()