349 lines
12 KiB
Python
349 lines
12 KiB
Python
"""Download the Get Information About Schools (GIAS) full establishments extract.

GIAS is the DfE register of all educational establishments in England, updated
daily. The CSV is generated on-demand via a four-step interaction with the
public Downloads page (there is no static URL):

1. GET /Downloads — extract anti-forgery token, the `all.edubase.data` tag,
   and the FileGeneratedDate that the server expects for that tag today.
2. POST /Downloads/Collate — submit the form to start file generation. The
   redirect URL contains a generation UUID.
3. Poll /Downloads/GenerateAjax/{id} until status:true.
4. GET the Azure blob URL with ?id={id} — returns a ZIP containing
   `edubasealldataYYYYMMDD.csv`.

The CSV is cp1252-encoded with 135 columns. We keep the fields useful for a
schools map (identification, status, phase, age range, religious character,
admissions policy, headline figures, contact details) and project Easting/
Northing (EPSG:27700) to WGS84 lat/lng.
"""
|
||
|
||
import argparse
|
||
import io
|
||
import json
|
||
import re
|
||
import time
|
||
import zipfile
|
||
from pathlib import Path
|
||
|
||
import httpx
|
||
import polars as pl
|
||
from pyproj import Transformer
|
||
|
||
from pipeline.local_temp import local_tmp_dir
|
||
|
||
# Endpoints of the public GIAS site used by the download flow.
BASE_URL = "https://get-information-schools.service.gov.uk"
DOWNLOADS_URL = f"{BASE_URL}/Downloads"  # landing page holding the form + token
COLLATE_URL = f"{BASE_URL}/Downloads/Collate"  # form target that starts generation
AJAX_URL = f"{BASE_URL}/Downloads/GenerateAjax"  # polled as {AJAX_URL}/{generation_id}
# Host that serves the generated ZIP; requested with ?id={generation_id}.
AZURE_FILE_URL = (
    "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/File.xhtml"
)
# Value posted as Downloads[0].Tag to select the full establishments extract.
EXTRACT_TAG = "all.edubase.data"

# Sent as the User-Agent header on every request made by fetch_extract_zip().
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Seconds slept between status polls, and the overall polling budget in seconds.
POLL_INTERVAL_S = 2.0
POLL_TIMEOUT_S = 300.0

# Captures the anti-forgery token value. The pattern requires the `name`
# attribute to appear before `value` inside the same tag.
_TOKEN_RE = re.compile(
    r'name="__RequestVerificationToken"[^>]*value="([^"]+)"', re.IGNORECASE
)
# Captures the FileGeneratedDate value of download entry 0.
# NOTE(review): index 0 is hard-coded here and in the POSTed form fields —
# presumably the establishments extract is always the first entry; confirm.
_GEN_DATE_RE = re.compile(
    r'Downloads_0__FileGeneratedDate"[^>]*value="([^"]+)"', re.IGNORECASE
)
# Captures the 36-character generation id from a /Downloads/Generated/<id> path.
_GEN_ID_RE = re.compile(
    r"/Downloads/Generated/([0-9a-f-]{36})", re.IGNORECASE
)

# Columns to read from the CSV (the file has 135; we keep what is useful for a
# schools map and contact card). Names must match the CSV header verbatim.
_CSV_COLUMNS: list[str] = [
    "URN",
    "EstablishmentName",
    "TypeOfEstablishment (name)",
    "EstablishmentTypeGroup (name)",
    "EstablishmentStatus (name)",
    "PhaseOfEducation (name)",
    "StatutoryLowAge",
    "StatutoryHighAge",
    "NurseryProvision (name)",
    "OfficialSixthForm (name)",
    "Gender (name)",
    "ReligiousCharacter (name)",
    "AdmissionsPolicy (name)",
    "SchoolCapacity",
    "NumberOfPupils",
    "PercentageFSM",
    "Trusts (name)",
    "Street",
    "Locality",
    "Town",
    "County (name)",
    "Postcode",
    "SchoolWebsite",
    "TelephoneNum",
    "HeadTitle (name)",
    "HeadFirstName",
    "HeadLastName",
    "Easting",
    "Northing",
    "LA (name)",
]

# Cell values that are read as null when parsing the CSV.
_NULL_VALUES = ["", "NULL", "Not applicable", "Does not apply"]

# British National Grid -> WGS84. always_xy=True fixes the axis order to
# (easting, northing) in and (lng, lat) out.
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||
|
||
|
||
def _extract_token(html: str) -> str:
    """Return the anti-forgery token embedded in the Downloads page HTML.

    Args:
        html: Markup of the GIAS Downloads page.

    Returns:
        The value captured by ``_TOKEN_RE``.

    Raises:
        RuntimeError: If the page contains no matching token input.
    """
    found = _TOKEN_RE.search(html)
    if found is not None:
        return found.group(1)
    raise RuntimeError("Could not find __RequestVerificationToken on GIAS page")
|
||
|
||
|
||
def _extract_file_generated_date(html: str) -> str:
    """Return the FileGeneratedDate form value from the Downloads page HTML.

    Args:
        html: Markup of the GIAS Downloads page.

    Returns:
        The value captured by ``_GEN_DATE_RE``.

    Raises:
        RuntimeError: If the page contains no matching field.
    """
    found = _GEN_DATE_RE.search(html)
    if found is not None:
        return found.group(1)
    raise RuntimeError(
        "Could not find FileGeneratedDate for the establishments extract"
    )
|
||
|
||
|
||
def _start_generation(client: httpx.Client) -> str:
    """Submit the Downloads form and return the generation UUID.

    Fetches the Downloads page, lifts the anti-forgery token and the
    FileGeneratedDate out of its HTML, then POSTs the selection form to the
    Collate endpoint with redirects followed.

    Args:
        client: HTTP client used for both requests.

    Returns:
        The generation id found in the final response URL, or failing that in
        the response body.

    Raises:
        RuntimeError: If the token, the date, or the generation id is missing.
    """
    landing = client.get(DOWNLOADS_URL)
    landing.raise_for_status()
    page_html = landing.text

    form_fields = {
        "__RequestVerificationToken": _extract_token(page_html),
        "Downloads[0].Tag": EXTRACT_TAG,
        "Downloads[0].FileGeneratedDate": _extract_file_generated_date(page_html),
        "Downloads[0].Selected": "true",
    }
    submitted = client.post(COLLATE_URL, data=form_fields, follow_redirects=True)
    submitted.raise_for_status()

    # Prefer the post-redirect URL; only fall back to scanning the body.
    found = _GEN_ID_RE.search(str(submitted.url))
    if found is None:
        found = _GEN_ID_RE.search(submitted.text)
    if found is None:
        raise RuntimeError("GIAS Collate did not yield a generation UUID")
    return found.group(1)
|
||
|
||
|
||
def _wait_for_generation(client: httpx.Client, generation_id: str) -> None:
    """Poll the GenerateAjax endpoint until the extract is ready.

    Args:
        client: HTTP client used for the status requests.
        generation_id: Generation UUID returned by the Collate step.

    Raises:
        httpx.HTTPStatusError: If a poll returns an error status
            (via ``raise_for_status``).
        RuntimeError: If the status payload is not a JSON object, or if the
            extract is not ready within ``POLL_TIMEOUT_S`` seconds.
    """
    status_url = f"{AJAX_URL}/{generation_id}"
    deadline = time.monotonic() + POLL_TIMEOUT_S
    while time.monotonic() < deadline:
        response = client.get(
            status_url,
            headers={"X-Requested-With": "XMLHttpRequest"},
        )
        response.raise_for_status()
        # The endpoint returns JSON whose payload is itself a JSON-encoded string,
        # e.g. response.json() returns the string `{"status":true,...}` which we
        # then need to decode a second time. Accept an already-decoded object
        # too, so a server-side change to plain JSON does not raise TypeError.
        payload = response.json()
        if isinstance(payload, str):
            payload = json.loads(payload)
        if not isinstance(payload, dict):
            raise RuntimeError(
                f"Unexpected GIAS generation status payload: {payload!r}"
            )
        if payload.get("status") is True:
            return
        time.sleep(POLL_INTERVAL_S)
    raise RuntimeError(
        f"GIAS extract generation timed out after {POLL_TIMEOUT_S:.0f}s"
    )
|
||
|
||
|
||
def _download_zip(client: httpx.Client, generation_id: str) -> bytes:
    """Fetch the generated extract and return its raw bytes.

    Args:
        client: HTTP client used for the download.
        generation_id: Generation UUID, sent as the ``id`` query parameter.

    Returns:
        The response body.

    Raises:
        RuntimeError: If the body does not begin with the ``PK`` ZIP signature.
    """
    response = client.get(AZURE_FILE_URL, params={"id": generation_id})
    response.raise_for_status()
    body = response.content
    # Cheap sanity check that we received an archive rather than an error page.
    if body[:2] != b"PK":
        raise RuntimeError("GIAS Azure response was not a ZIP archive")
    return body
|
||
|
||
|
||
def fetch_extract_zip() -> bytes:
    """Run the full GIAS download flow and return the raw ZIP bytes.

    One client is shared across the three steps (start generation, wait for
    completion, download) and is closed when the flow finishes or fails.
    """
    timeouts = httpx.Timeout(30.0, read=120.0)
    with httpx.Client(headers={"User-Agent": USER_AGENT}, timeout=timeouts) as session:
        job_id = _start_generation(session)
        _wait_for_generation(session, job_id)
        return _download_zip(session, job_id)
|
||
|
||
|
||
def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
    """Parse the first CSV member of the extract ZIP into a DataFrame.

    Args:
        zip_bytes: Raw bytes of the downloaded archive.

    Returns:
        A DataFrame restricted to ``_CSV_COLUMNS``, with ``_NULL_VALUES`` read
        as null.

    Raises:
        RuntimeError: If the archive holds no ``.csv`` member.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        first_csv = next(
            (member for member in archive.namelist() if member.lower().endswith(".csv")),
            None,
        )
        if first_csv is None:
            raise RuntimeError("GIAS ZIP did not contain a CSV file")
        payload = archive.read(first_csv)

    # The extract is cp1252; decode up front and hand polars a text buffer.
    # NOTE(review): dtypes are inferred here, so digit-only text columns such
    # as TelephoneNum may come back as integers and lose leading zeros —
    # confirm against real output.
    decoded = io.StringIO(payload.decode("cp1252"))
    return pl.read_csv(
        decoded,
        columns=_CSV_COLUMNS,
        infer_schema_length=20000,
        null_values=_NULL_VALUES,
        truncate_ragged_lines=True,
    )
|
||
|
||
|
||
def _project_easting_northing(easting: pl.Series, northing: pl.Series) -> tuple[list[float | None], list[float | None]]:
    """Project British National Grid coordinates to WGS84.

    Args:
        easting: Easting values (EPSG:27700 x), one per row.
        northing: Northing values (EPSG:27700 y), aligned with ``easting``.

    Returns:
        ``(lat, lng)`` lists aligned with the inputs. A row yields ``None`` in
        both lists when either input is missing, non-finite or zero, or when
        the projected result is not finite.
    """
    import math  # function-scope: the module has no file-level math import

    def _usable(value: object) -> bool:
        # Nulls reach us as NaN once the Series becomes a float array (or as
        # None in an object array); 0 is the placeholder used by dummy rows.
        if value is None:
            return False
        number = float(value)
        return math.isfinite(number) and number != 0.0

    e = easting.to_numpy()
    n = northing.to_numpy()
    # always_xy transformer: (easting, northing) in, (lng, lat) out.
    lng, lat = _to_wgs84.transform(e, n)

    lat_out: list[float | None] = []
    lng_out: list[float | None] = []
    for east_val, north_val, lat_val, lng_val in zip(e, n, lat, lng):
        lat_f = float(lat_val)
        lng_f = float(lng_val)
        # A failed projection is reported as inf rather than an exception, so
        # the outputs are checked as well as the inputs.
        if (
            _usable(east_val)
            and _usable(north_val)
            and math.isfinite(lat_f)
            and math.isfinite(lng_f)
        ):
            lat_out.append(lat_f)
            lng_out.append(lng_f)
        else:
            lat_out.append(None)
            lng_out.append(None)
    return lat_out, lng_out
|
||
|
||
|
||
def _format_age_range(low: int | None, high: int | None) -> str | None:
|
||
if low is None and high is None:
|
||
return None
|
||
if low is None:
|
||
return f"up to {high}"
|
||
if high is None:
|
||
return f"{low}+"
|
||
return f"{low}–{high}"
|
||
|
||
|
||
def _format_address(street: str | None, locality: str | None, town: str | None) -> str | None:
|
||
parts = [part.strip() for part in (street, locality, town) if part]
|
||
parts = [part for part in parts if part]
|
||
return ", ".join(parts) if parts else None
|
||
|
||
|
||
def _format_head_name(title: str | None, first: str | None, last: str | None) -> str | None:
|
||
parts = [part.strip() for part in (title, first, last) if part]
|
||
parts = [part for part in parts if part]
|
||
return " ".join(parts) if parts else None
|
||
|
||
|
||
def transform(zip_bytes: bytes) -> pl.DataFrame:
    """Convert the GIAS extract ZIP into a clean schools DataFrame.

    Keeps only establishments whose status is "Open" and that have positive
    Easting/Northing, casts the numeric columns, projects the grid
    coordinates to lat/lng, and builds display strings for age range, address
    and head teacher name.

    Args:
        zip_bytes: Raw bytes of the downloaded extract archive.

    Returns:
        One row per open establishment with finite coordinates, using the
        snake_case column names assembled below.
    """
    raw = _read_csv_from_zip(zip_bytes)

    # Filter to currently-open establishments; the CSV also includes closed,
    # proposed-to-open, and proposed-to-close rows we do not want on a map.
    df = raw.filter(pl.col("EstablishmentStatus (name)") == "Open")

    # Non-strict casts turn unparsable cells into nulls instead of raising.
    df = df.with_columns(
        pl.col("URN").cast(pl.Int64),
        pl.col("StatutoryLowAge").cast(pl.Int32, strict=False),
        pl.col("StatutoryHighAge").cast(pl.Int32, strict=False),
        pl.col("SchoolCapacity").cast(pl.Int32, strict=False),
        pl.col("NumberOfPupils").cast(pl.Int32, strict=False),
        pl.col("Easting").cast(pl.Float64, strict=False),
        pl.col("Northing").cast(pl.Float64, strict=False),
        # Go through String so a trailing "%" can be stripped whatever dtype
        # was inferred for the column.
        pl.col("PercentageFSM")
        .cast(pl.String)
        .str.replace_all("%", "", literal=True)
        .str.strip_chars()
        .cast(pl.Float32, strict=False),
    )

    # Drop rows without coordinates — a small number of historic/dummy entries
    # have Easting=0 which would map to the Atlantic.
    df = df.filter(
        pl.col("Easting").is_not_null()
        & pl.col("Northing").is_not_null()
        & (pl.col("Easting") > 0)
        & (pl.col("Northing") > 0)
    )

    lat, lng = _project_easting_northing(df["Easting"], df["Northing"])

    age_range = [
        _format_age_range(low, high)
        for low, high in zip(df["StatutoryLowAge"].to_list(), df["StatutoryHighAge"].to_list())
    ]
    address = [
        _format_address(street, locality, town)
        for street, locality, town in zip(
            df["Street"].to_list(),
            df["Locality"].to_list(),
            df["Town"].to_list(),
        )
    ]
    head_name = [
        _format_head_name(title, first, last)
        for title, first, last in zip(
            df["HeadTitle (name)"].to_list(),
            df["HeadFirstName"].to_list(),
            df["HeadLastName"].to_list(),
        )
    ]

    out = pl.DataFrame(
        {
            "urn": df["URN"],
            "name": df["EstablishmentName"],
            "lat": pl.Series(lat, dtype=pl.Float64),
            "lng": pl.Series(lng, dtype=pl.Float64),
            "phase": df["PhaseOfEducation (name)"],
            "type": df["TypeOfEstablishment (name)"],
            "type_group": df["EstablishmentTypeGroup (name)"],
            "age_range": pl.Series(age_range, dtype=pl.String),
            "gender": df["Gender (name)"],
            "religious_character": df["ReligiousCharacter (name)"],
            "admissions_policy": df["AdmissionsPolicy (name)"],
            "nursery_provision": df["NurseryProvision (name)"],
            "sixth_form": df["OfficialSixthForm (name)"],
            "capacity": df["SchoolCapacity"],
            "pupils": df["NumberOfPupils"],
            "fsm_percent": df["PercentageFSM"],
            "trust": df["Trusts (name)"],
            "address": pl.Series(address, dtype=pl.String),
            "postcode": df["Postcode"],
            "local_authority": df["LA (name)"],
            "website": df["SchoolWebsite"],
            "telephone": df["TelephoneNum"],
            "head_name": pl.Series(head_name, dtype=pl.String),
        }
    )

    # Drop any remaining rows where projection failed (extremely rare). A
    # failed projection can surface as inf/NaN rather than null, so require
    # finite values as well as non-null ones.
    return out.filter(
        pl.col("lat").is_not_null()
        & pl.col("lng").is_not_null()
        & pl.col("lat").is_finite()
        & pl.col("lng").is_finite()
    )
|
||
|
||
|
||
def main() -> None:
    """CLI entry point: download the extract, cache the ZIP, write parquet.

    Requires ``--output``. The raw archive is also written to
    ``<local tmp dir>/gias/edubase.zip``.
    """
    cli = argparse.ArgumentParser(
        description="Download the GIAS full establishments extract → parquet"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path = cli.parse_args().output

    # Keep a copy of the raw download beside other pipeline temp files.
    gias_tmp = local_tmp_dir() / "gias"
    gias_tmp.mkdir(parents=True, exist_ok=True)
    zip_path = gias_tmp / "edubase.zip"

    print("Fetching GIAS extract...")
    archive_bytes = fetch_extract_zip()
    zip_path.write_bytes(archive_bytes)
    size_mb = len(archive_bytes) / (1024 * 1024)
    print(f"Downloaded {size_mb:.1f} MB to {zip_path}")

    print("Transforming...")
    schools = transform(archive_bytes)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    schools.write_parquet(output_path, compression="zstd")
    print(f"Wrote {output_path} ({len(schools):,} open establishments)")
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|