perfect-postcode/pipeline/download/gias.py
2026-05-25 13:20:17 +01:00

349 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Download the Get Information About Schools (GIAS) full establishments extract.
GIAS is the DfE register of all educational establishments in England, updated
daily. The CSV is generated on-demand via a four-step interaction with the
public Downloads page (there is no static URL):
1. GET /Downloads — extract anti-forgery token, the `all.edubase.data` tag,
and the FileGeneratedDate that the server expects for that tag today.
2. POST /Downloads/Collate — submit the form to start file generation. The
redirect URL contains a generation UUID.
3. Poll /Downloads/GenerateAjax/{id} until status:true.
4. GET the Azure blob URL with ?id={id} — returns a ZIP containing
`edubasealldataYYYYMMDD.csv`.
The CSV is cp1252-encoded with 135 columns. We keep the fields useful for a
schools map (identification, status, phase, age range, religious character,
admissions policy, headline figures, contact details) and project Easting/
Northing (EPSG:27700) to WGS84 lat/lng.
"""
import argparse
import io
import json
import math
import re
import time
import zipfile
from pathlib import Path

import httpx
import polars as pl
from pyproj import Transformer

from pipeline.local_temp import local_tmp_dir
# Root of the public GIAS service; the three endpoints below are built from it.
BASE_URL = "https://get-information-schools.service.gov.uk"
# Downloads page, fetched first to scrape the anti-forgery token and the
# FileGeneratedDate form value.
DOWNLOADS_URL = f"{BASE_URL}/Downloads"
# Form target that the Downloads form is POSTed to in order to start generation.
COLLATE_URL = f"{BASE_URL}/Downloads/Collate"
# Polled as f"{AJAX_URL}/{generation_id}" until the payload reports status true.
AJAX_URL = f"{BASE_URL}/Downloads/GenerateAjax"
# Endpoint queried with ?id=<generation id> to fetch the finished ZIP.
AZURE_FILE_URL = (
    "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/File.xhtml"
)
# Value submitted as Downloads[0].Tag to select the full establishments extract.
EXTRACT_TAG = "all.edubase.data"
# Browser-like User-Agent sent on every request made by fetch_extract_zip().
# NOTE(review): presumably the service rejects non-browser clients — confirm.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
# Seconds slept between two polls of the generation-status endpoint.
POLL_INTERVAL_S = 2.0
# Total seconds to keep polling before giving up with a RuntimeError.
POLL_TIMEOUT_S = 300.0
# Captures the value attribute of the __RequestVerificationToken input. The
# pattern requires `name=` to appear before `value=` inside the same tag.
_TOKEN_RE = re.compile(
    r'name="__RequestVerificationToken"[^>]*value="([^"]+)"', re.IGNORECASE
)
# Captures the value attribute of the tag carrying an attribute that ends in
# Downloads_0__FileGeneratedDate" (the first entry of the Downloads form list).
_GEN_DATE_RE = re.compile(
    r'Downloads_0__FileGeneratedDate"[^>]*value="([^"]+)"', re.IGNORECASE
)
# Captures the 36-character hex-and-hyphen generation id that follows
# /Downloads/Generated/ in a URL or page body.
_GEN_ID_RE = re.compile(
    r"/Downloads/Generated/([0-9a-f-]{36})", re.IGNORECASE
)
# Columns to read from the CSV (the file has 135; we keep what is useful for a
# schools map and contact card). Names must match the CSV header verbatim.
_CSV_COLUMNS: list[str] = [
    # Identification and classification
    "URN",
    "EstablishmentName",
    "TypeOfEstablishment (name)",
    "EstablishmentTypeGroup (name)",
    "EstablishmentStatus (name)",
    "PhaseOfEducation (name)",
    # Age range and provision
    "StatutoryLowAge",
    "StatutoryHighAge",
    "NurseryProvision (name)",
    "OfficialSixthForm (name)",
    "Gender (name)",
    "ReligiousCharacter (name)",
    "AdmissionsPolicy (name)",
    # Headline figures
    "SchoolCapacity",
    "NumberOfPupils",
    "PercentageFSM",
    "Trusts (name)",
    # Address and contact details
    "Street",
    "Locality",
    "Town",
    "County (name)",
    "Postcode",
    "SchoolWebsite",
    "TelephoneNum",
    "HeadTitle (name)",
    "HeadFirstName",
    "HeadLastName",
    # British National Grid coordinates (projected to lat/lng in transform())
    "Easting",
    "Northing",
    "LA (name)",
]
# Cell contents that pl.read_csv should load as null instead of as text.
_NULL_VALUES = ["", "NULL", "Not applicable", "Does not apply"]
# EPSG:27700 -> EPSG:4326 transformer. always_xy=True fixes the axis order so
# that transform(easting, northing) returns (lng, lat).
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
def _extract_token(html: str) -> str:
    """Return the anti-forgery token embedded in the GIAS Downloads page.

    Args:
        html: Markup of the Downloads page.

    Returns:
        The ``value`` attribute of the ``__RequestVerificationToken`` input.

    Raises:
        RuntimeError: If ``_TOKEN_RE`` finds no such input in *html*.
    """
    if (found := _TOKEN_RE.search(html)) is not None:
        return found.group(1)
    raise RuntimeError("Could not find __RequestVerificationToken on GIAS page")
def _extract_file_generated_date(html: str) -> str:
    """Return the FileGeneratedDate form value for the first download entry.

    Args:
        html: Markup of the Downloads page.

    Returns:
        The ``value`` attribute captured by ``_GEN_DATE_RE``, unmodified.

    Raises:
        RuntimeError: If the pattern does not occur in *html*.
    """
    found = _GEN_DATE_RE.search(html)
    if found is not None:
        return found[1]
    raise RuntimeError(
        "Could not find FileGeneratedDate for the establishments extract"
    )
def _start_generation(client: httpx.Client) -> str:
    """Submit the Downloads form and return the generation UUID.

    Fetches the Downloads page, scrapes the anti-forgery token and the
    FileGeneratedDate value from it, then POSTs the form to the Collate
    endpoint, following redirects.

    Args:
        client: HTTP client used for both requests.

    Returns:
        The generation id found in the final URL or, failing that, in the
        response body.

    Raises:
        httpx.HTTPStatusError: If either request returns an error status.
        RuntimeError: If the token, the date, or the generation id is missing.
    """
    landing = client.get(DOWNLOADS_URL)
    landing.raise_for_status()
    page = landing.text

    form_fields = {
        "__RequestVerificationToken": _extract_token(page),
        "Downloads[0].Tag": EXTRACT_TAG,
        "Downloads[0].FileGeneratedDate": _extract_file_generated_date(page),
        "Downloads[0].Selected": "true",
    }
    submitted = client.post(COLLATE_URL, data=form_fields, follow_redirects=True)
    submitted.raise_for_status()

    # Prefer the post-redirect URL; only scan the body when the URL has no id.
    found = _GEN_ID_RE.search(str(submitted.url))
    if found is None:
        found = _GEN_ID_RE.search(submitted.text)
    if found is None:
        raise RuntimeError("GIAS Collate did not yield a generation UUID")
    return found.group(1)
def _wait_for_generation(client: httpx.Client, generation_id: str) -> None:
    """Block until GIAS reports that the requested extract has been generated.

    Polls the GenerateAjax endpoint every ``POLL_INTERVAL_S`` seconds for at
    most ``POLL_TIMEOUT_S`` seconds.

    Args:
        client: HTTP client used for the status requests.
        generation_id: Generation UUID returned by the Collate step.

    Raises:
        httpx.HTTPStatusError: If a status request returns an error status.
        RuntimeError: If the status payload has an unexpected shape, or if
            generation has not completed before the timeout elapses.
    """
    status_url = f"{AJAX_URL}/{generation_id}"
    deadline = time.monotonic() + POLL_TIMEOUT_S
    while time.monotonic() < deadline:
        response = client.get(
            status_url,
            headers={"X-Requested-With": "XMLHttpRequest"},
        )
        response.raise_for_status()
        # The endpoint returns JSON whose payload is itself a JSON-encoded
        # string, e.g. response.json() returns the string `{"status":true,...}`
        # which needs decoding a second time. Accept a plain JSON object as
        # well so a change in the server's encoding does not surface as an
        # opaque TypeError from json.loads().
        payload = response.json()
        if isinstance(payload, str):
            payload = json.loads(payload)
        if not isinstance(payload, dict):
            raise RuntimeError(
                f"Unexpected GIAS generation status payload: {payload!r}"
            )
        if payload.get("status") is True:
            return
        time.sleep(POLL_INTERVAL_S)
    raise RuntimeError(
        f"GIAS extract generation timed out after {POLL_TIMEOUT_S:.0f}s"
    )
def _download_zip(client: httpx.Client, generation_id: str) -> bytes:
    """Fetch the generated extract and return its raw bytes.

    Args:
        client: HTTP client used for the download.
        generation_id: Generation UUID, sent as the ``id`` query parameter.

    Returns:
        The response body.

    Raises:
        httpx.HTTPStatusError: If the request returns an error status.
        RuntimeError: If the body does not begin with the ``PK`` signature
            that ZIP archives start with.
    """
    blob = client.get(AZURE_FILE_URL, params={"id": generation_id})
    blob.raise_for_status()
    body = blob.content
    if body[:2] != b"PK":
        raise RuntimeError("GIAS Azure response was not a ZIP archive")
    return body
def fetch_extract_zip() -> bytes:
    """Run the full GIAS download flow and return the raw ZIP bytes.

    One HTTP client is shared by all three steps (start generation, wait for
    it to finish, download the result) and is closed when the flow ends.
    """
    # 30 s default for every timeout phase, with a longer 120 s read timeout.
    timeout = httpx.Timeout(30.0, read=120.0)
    with httpx.Client(headers={"User-Agent": USER_AGENT}, timeout=timeout) as session:
        job_id = _start_generation(session)
        _wait_for_generation(session, job_id)
        return _download_zip(session, job_id)
def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
    """Load the first CSV member of the GIAS ZIP into a DataFrame.

    Only the columns named in ``_CSV_COLUMNS`` are read, and the strings in
    ``_NULL_VALUES`` are loaded as nulls.

    Args:
        zip_bytes: Raw bytes of the downloaded archive.

    Returns:
        The selected CSV columns as a polars DataFrame.

    Raises:
        RuntimeError: If the archive has no member ending in ``.csv``.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        member = next(
            (name for name in archive.namelist() if name.lower().endswith(".csv")),
            None,
        )
        if member is None:
            raise RuntimeError("GIAS ZIP did not contain a CSV file")
        payload = archive.read(member)

    # The extract is cp1252-encoded, so decode it before handing it to polars.
    decoded = payload.decode("cp1252")
    return pl.read_csv(
        io.StringIO(decoded),
        columns=_CSV_COLUMNS,
        infer_schema_length=20000,
        null_values=_NULL_VALUES,
        truncate_ragged_lines=True,
    )
def _project_easting_northing(
    easting: pl.Series, northing: pl.Series
) -> tuple[list[float | None], list[float | None]]:
    """Project British National Grid coordinates to WGS84 latitude/longitude.

    Args:
        easting: EPSG:27700 eastings, one per establishment.
        northing: EPSG:27700 northings, aligned with *easting*.

    Returns:
        ``(lat, lng)`` lists aligned with the inputs. An entry is ``None`` in
        both lists when either input coordinate is missing, non-finite or
        zero, or when the projected point is not finite.
    """
    e = easting.to_numpy()
    n = northing.to_numpy()
    # always_xy=True on the transformer means (x, y) in and (lng, lat) out.
    lng, lat = _to_wgs84.transform(e, n)

    def _is_finite(value: object) -> bool:
        # Missing inputs can arrive as NaN rather than None once the series is
        # a float array, and pyproj reports a failed projection as inf.
        return value is not None and math.isfinite(value)

    lat_out: list[float | None] = []
    lng_out: list[float | None] = []
    for east_val, north_val, lat_val, lng_val in zip(e, n, lat, lng):
        usable = (
            _is_finite(east_val)
            and _is_finite(north_val)
            # A zero coordinate is a placeholder, not a real location.
            and float(east_val) != 0.0
            and float(north_val) != 0.0
            and _is_finite(lat_val)
            and _is_finite(lng_val)
        )
        if usable:
            lat_out.append(float(lat_val))
            lng_out.append(float(lng_val))
        else:
            lat_out.append(None)
            lng_out.append(None)
    return lat_out, lng_out
def _format_age_range(low: int | None, high: int | None) -> str | None:
if low is None and high is None:
return None
if low is None:
return f"up to {high}"
if high is None:
return f"{low}+"
return f"{low}{high}"
def _format_address(street: str | None, locality: str | None, town: str | None) -> str | None:
parts = [part.strip() for part in (street, locality, town) if part]
parts = [part for part in parts if part]
return ", ".join(parts) if parts else None
def _format_head_name(title: str | None, first: str | None, last: str | None) -> str | None:
parts = [part.strip() for part in (title, first, last) if part]
parts = [part for part in parts if part]
return " ".join(parts) if parts else None
def transform(zip_bytes: bytes) -> pl.DataFrame:
    """Convert the GIAS extract ZIP into a clean schools DataFrame.

    Keeps only rows whose status is exactly "Open" and whose Easting and
    Northing are both present and positive, casts the numeric columns,
    projects the coordinates to lat/lng, and derives the ``age_range``,
    ``address`` and ``head_name`` text columns.

    Args:
        zip_bytes: Raw bytes of the downloaded GIAS archive.

    Returns:
        One row per open establishment, with snake_case column names and no
        null ``lat`` or ``lng``.
    """
    extract = _read_csv_from_zip(zip_bytes)

    # Filter to currently-open establishments; the CSV also includes closed,
    # proposed-to-open, and proposed-to-close rows we do not want on a map.
    is_open = pl.col("EstablishmentStatus (name)") == "Open"
    int32_columns = (
        "StatutoryLowAge",
        "StatutoryHighAge",
        "SchoolCapacity",
        "NumberOfPupils",
    )
    # PercentageFSM may carry a "%" suffix; strip it before the numeric cast.
    fsm_percent = (
        pl.col("PercentageFSM")
        .cast(pl.String)
        .str.replace_all("%", "", literal=True)
        .str.strip_chars()
        .cast(pl.Float32, strict=False)
    )
    schools = extract.filter(is_open).with_columns(
        pl.col("URN").cast(pl.Int64),
        *(pl.col(name).cast(pl.Int32, strict=False) for name in int32_columns),
        pl.col("Easting").cast(pl.Float64, strict=False),
        pl.col("Northing").cast(pl.Float64, strict=False),
        fsm_percent,
    )

    # Drop rows without coordinates — a small number of historic/dummy entries
    # have Easting=0 which would map to the Atlantic.
    east, north = pl.col("Easting"), pl.col("Northing")
    has_coordinates = (
        east.is_not_null() & north.is_not_null() & (east > 0) & (north > 0)
    )
    schools = schools.filter(has_coordinates)

    lat, lng = _project_easting_northing(schools["Easting"], schools["Northing"])

    # The text columns are derived row by row with the module's helpers.
    age_range = [
        _format_age_range(*bounds)
        for bounds in schools.select("StatutoryLowAge", "StatutoryHighAge").iter_rows()
    ]
    address = [
        _format_address(*parts)
        for parts in schools.select("Street", "Locality", "Town").iter_rows()
    ]
    head_name = [
        _format_head_name(*parts)
        for parts in schools.select(
            "HeadTitle (name)", "HeadFirstName", "HeadLastName"
        ).iter_rows()
    ]

    result = pl.DataFrame(
        {
            "urn": schools["URN"],
            "name": schools["EstablishmentName"],
            "lat": pl.Series(lat, dtype=pl.Float64),
            "lng": pl.Series(lng, dtype=pl.Float64),
            "phase": schools["PhaseOfEducation (name)"],
            "type": schools["TypeOfEstablishment (name)"],
            "type_group": schools["EstablishmentTypeGroup (name)"],
            "age_range": pl.Series(age_range, dtype=pl.String),
            "gender": schools["Gender (name)"],
            "religious_character": schools["ReligiousCharacter (name)"],
            "admissions_policy": schools["AdmissionsPolicy (name)"],
            "nursery_provision": schools["NurseryProvision (name)"],
            "sixth_form": schools["OfficialSixthForm (name)"],
            "capacity": schools["SchoolCapacity"],
            "pupils": schools["NumberOfPupils"],
            "fsm_percent": schools["PercentageFSM"],
            "trust": schools["Trusts (name)"],
            "address": pl.Series(address, dtype=pl.String),
            "postcode": schools["Postcode"],
            "local_authority": schools["LA (name)"],
            "website": schools["SchoolWebsite"],
            "telephone": schools["TelephoneNum"],
            "head_name": pl.Series(head_name, dtype=pl.String),
        }
    )

    # Drop any remaining rows where projection failed (extremely rare).
    located = pl.col("lat").is_not_null() & pl.col("lng").is_not_null()
    return result.filter(located)
def main() -> None:
    """Command-line entry point: download the GIAS extract and write parquet.

    Parses ``--output``, downloads the extract, saves the raw ZIP under the
    local temp directory, transforms it, and writes the result to the output
    path as zstd-compressed parquet, creating parent directories as needed.
    """
    cli = argparse.ArgumentParser(
        description="Download the GIAS full establishments extract → parquet"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path: Path = cli.parse_args().output

    # The raw archive is kept on disk next to other pipeline temp files.
    cache_dir = local_tmp_dir() / "gias"
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / "edubase.zip"

    print("Fetching GIAS extract...")
    zip_bytes = fetch_extract_zip()
    cache_path.write_bytes(zip_bytes)
    size_mb = len(zip_bytes) / (1024 * 1024)
    print(f"Downloaded {size_mb:.1f} MB to {cache_path}")

    print("Transforming...")
    schools = transform(zip_bytes)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    schools.write_parquet(output_path, compression="zstd")
    print(f"Wrote {output_path} ({len(schools):,} open establishments)")


if __name__ == "__main__":
    main()