has issues

2026-05-25 13:20:17 +01:00 · 2026-05-25 13:20:17 +01:00 · c645b0f1d4
commit c645b0f1d4
parent 2e112d7398
96 changed files with 2147083 additions and 5787 deletions
--- a/pipeline/download/conservation_areas.py
+++ b/pipeline/download/conservation_areas.py
@ -0,0 +1,51 @@
+"""Download Historic England conservation area polygons.
+
+Source: Historic England Conservation Areas
+License: Open Government Licence v3.0
+"""
+
+import argparse
+from pathlib import Path
+
+import httpx
+import pyogrio
+
+# ArcGIS Hub download endpoint for the Historic England conservation-areas
+# item; the path and query string request layer 0 exported as a GeoPackage.
+URL = (
+    "https://opendata-historicengland.hub.arcgis.com/api/download/v1/items/"
+    "446bc9bf8b5b440386d0c504caa3dac5/geoPackage?layers=0"
+)
+
+
+def main() -> None:
+    """Download the conservation-area GeoPackage to the ``--output`` path.
+
+    The response is streamed into a sibling ``<stem>.tmp<suffix>`` file,
+    inspected with ``pyogrio.read_info`` (at least one feature, polygon
+    geometry) and only then moved over the final path. The temp file is
+    removed whenever the download or the validation fails, so an aborted run
+    leaves no stale partial file next to the output.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with an error status.
+        ValueError: If the file has no features or is not polygon geometry.
+    """
+    parser = argparse.ArgumentParser(
+        description="Download Historic England conservation area polygons"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output GeoPackage file path"
+    )
+    args = parser.parse_args()
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    # Keep the real suffix last so the temp file is still recognised by format.
+    tmp_path = args.output.with_name(f"{args.output.stem}.tmp{args.output.suffix}")
+
+    print("Downloading Historic England conservation areas...")
+    try:
+        with httpx.stream("GET", URL, follow_redirects=True, timeout=300) as response:
+            response.raise_for_status()
+            with tmp_path.open("wb") as fh:
+                for chunk in response.iter_bytes():
+                    fh.write(chunk)
+
+        info = pyogrio.read_info(tmp_path)
+        features = info.get("features", 0)
+        geometry_type = info.get("geometry_type")
+        if features <= 0:
+            raise ValueError("Downloaded conservation areas file contains no features")
+        if "Polygon" not in str(geometry_type):
+            raise ValueError(f"Expected polygon geometry, got {geometry_type!r}")
+
+        tmp_path.replace(args.output)
+    finally:
+        # After a successful replace the temp path no longer exists, so this
+        # is a no-op; on any failure it removes the partial/invalid download.
+        tmp_path.unlink(missing_ok=True)
+
+    size_mb = args.output.stat().st_size / (1024 * 1024)
+    print(f"Saved {features} conservation areas to {args.output} ({size_mb:.1f} MB)")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/gias.py
+++ b/pipeline/download/gias.py
@ -0,0 +1,349 @@
+"""Download the Get Information About Schools (GIAS) full establishments extract.
+
+GIAS is the DfE register of all educational establishments in England, updated
+daily. The CSV is generated on-demand via a four-step interaction with the
+public Downloads page (there is no static URL):
+
+1.  GET /Downloads — extract anti-forgery token, the `all.edubase.data` tag,
+    and the FileGeneratedDate that the server expects for that tag today.
+2.  POST /Downloads/Collate — submit the form to start file generation. The
+    redirect URL contains a generation UUID.
+3.  Poll /Downloads/GenerateAjax/{id} until status:true.
+4.  GET the Azure blob URL with ?id={id} — returns a ZIP containing
+    `edubasealldataYYYYMMDD.csv`.
+
+The CSV is cp1252-encoded with 135 columns. We keep the fields useful for a
+schools map (identification, status, phase, age range, religious character,
+admissions policy, headline figures, contact details) and project Easting/
+Northing (EPSG:27700) to WGS84 lat/lng.
+"""
+
+import argparse
+import io
+import json
+import re
+import time
+import zipfile
+from pathlib import Path
+
+import httpx
+import polars as pl
+from pyproj import Transformer
+
+from pipeline.local_temp import local_tmp_dir
+
+# GIAS endpoints used by the download flow: the Downloads page, the form
+# submission target, and the generation-status polling endpoint.
+BASE_URL = "https://get-information-schools.service.gov.uk"
+DOWNLOADS_URL = f"{BASE_URL}/Downloads"
+COLLATE_URL = f"{BASE_URL}/Downloads/Collate"
+AJAX_URL = f"{BASE_URL}/Downloads/GenerateAjax"
+# Endpoint the generated ZIP is fetched from (queried with ?id=<generation id>).
+AZURE_FILE_URL = (
+    "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/File.xhtml"
+)
+# Tag submitted as Downloads[0].Tag to select the full establishments extract.
+EXTRACT_TAG = "all.edubase.data"
+
+# Browser-like User-Agent sent on every request made by the shared client.
+USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+)
+
+# Seconds between status polls, and the overall polling budget in seconds.
+POLL_INTERVAL_S = 2.0
+POLL_TIMEOUT_S = 300.0
+
+# Captures the value attribute of the anti-forgery token input.
+_TOKEN_RE = re.compile(
+    r'name="__RequestVerificationToken"[^>]*value="([^"]+)"', re.IGNORECASE
+)
+# Captures the value attribute of the Downloads_0__FileGeneratedDate input.
+_GEN_DATE_RE = re.compile(
+    r'Downloads_0__FileGeneratedDate"[^>]*value="([^"]+)"', re.IGNORECASE
+)
+# Captures the 36-character generation id from a /Downloads/Generated/<id> URL.
+_GEN_ID_RE = re.compile(
+    r"/Downloads/Generated/([0-9a-f-]{36})", re.IGNORECASE
+)
+
+# Columns to read from the CSV (the file has 135; we keep what is useful for a
+# schools map and contact card). Names must match the CSV header verbatim.
+_CSV_COLUMNS: list[str] = [
+    "URN",
+    "EstablishmentName",
+    "TypeOfEstablishment (name)",
+    "EstablishmentTypeGroup (name)",
+    "EstablishmentStatus (name)",
+    "PhaseOfEducation (name)",
+    "StatutoryLowAge",
+    "StatutoryHighAge",
+    "NurseryProvision (name)",
+    "OfficialSixthForm (name)",
+    "Gender (name)",
+    "ReligiousCharacter (name)",
+    "AdmissionsPolicy (name)",
+    "SchoolCapacity",
+    "NumberOfPupils",
+    "PercentageFSM",
+    "Trusts (name)",
+    "Street",
+    "Locality",
+    "Town",
+    "County (name)",
+    "Postcode",
+    "SchoolWebsite",
+    "TelephoneNum",
+    "HeadTitle (name)",
+    "HeadFirstName",
+    "HeadLastName",
+    "Easting",
+    "Northing",
+    "LA (name)",
+]
+
+# Cell values that are read as null when the CSV is parsed.
+_NULL_VALUES = ["", "NULL", "Not applicable", "Does not apply"]
+
+# British National Grid (EPSG:27700) -> WGS84 (EPSG:4326). always_xy=True
+# fixes the axis order: (easting, northing) in, (longitude, latitude) out.
+_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
+
+
+def _extract_token(html: str) -> str:
+    """Return the anti-forgery token value found in the Downloads page HTML.
+
+    Raises:
+        RuntimeError: If no ``__RequestVerificationToken`` input is present.
+    """
+    if (found := _TOKEN_RE.search(html)) is not None:
+        return found.group(1)
+    raise RuntimeError("Could not find __RequestVerificationToken on GIAS page")
+
+
+def _extract_file_generated_date(html: str) -> str:
+    """Return the FileGeneratedDate form value found in the Downloads page HTML.
+
+    Raises:
+        RuntimeError: If the ``Downloads_0__FileGeneratedDate`` input is absent.
+    """
+    if (found := _GEN_DATE_RE.search(html)) is not None:
+        return found.group(1)
+    raise RuntimeError(
+        "Could not find FileGeneratedDate for the establishments extract"
+    )
+
+
+def _start_generation(client: httpx.Client) -> str:
+    """Submit the Downloads form and return the generation UUID.
+
+    Loads the Downloads page to scrape the anti-forgery token and the
+    FileGeneratedDate, posts the form to the Collate endpoint (following
+    redirects), then looks for the generation id first in the final URL and
+    otherwise in the response body.
+
+    Raises:
+        httpx.HTTPStatusError: If either request returns an error status.
+        RuntimeError: If the token, date or generation id cannot be found.
+    """
+    page = client.get(DOWNLOADS_URL)
+    page.raise_for_status()
+    html = page.text
+
+    form = {
+        "__RequestVerificationToken": _extract_token(html),
+        "Downloads[0].Tag": EXTRACT_TAG,
+        "Downloads[0].FileGeneratedDate": _extract_file_generated_date(html),
+        "Downloads[0].Selected": "true",
+    }
+    collated = client.post(COLLATE_URL, data=form, follow_redirects=True)
+    collated.raise_for_status()
+
+    found = _GEN_ID_RE.search(str(collated.url))
+    if found is None:
+        found = _GEN_ID_RE.search(collated.text)
+    if found is None:
+        raise RuntimeError("GIAS Collate did not yield a generation UUID")
+    return found.group(1)
+
+
+def _wait_for_generation(client: httpx.Client, generation_id: str) -> None:
+    """Poll the generation-status endpoint until the extract is ready.
+
+    Args:
+        client: HTTP client used for the polling requests.
+        generation_id: Generation UUID returned by the Collate step.
+
+    Raises:
+        httpx.HTTPStatusError: If a poll returns an error status.
+        RuntimeError: If the extract is not ready within ``POLL_TIMEOUT_S``.
+    """
+    deadline = time.monotonic() + POLL_TIMEOUT_S
+    while time.monotonic() < deadline:
+        response = client.get(
+            f"{AJAX_URL}/{generation_id}",
+            headers={"X-Requested-With": "XMLHttpRequest"},
+        )
+        response.raise_for_status()
+        # The endpoint returns JSON whose payload is itself a JSON-encoded string,
+        # e.g. response.json() returns the string `{"status":true,...}` which we
+        # then need to decode a second time. Decode the second layer only when
+        # the first really is a string, so a plain JSON object is accepted too
+        # instead of crashing json.loads with a TypeError.
+        payload = response.json()
+        if isinstance(payload, str):
+            payload = json.loads(payload)
+        if isinstance(payload, dict) and payload.get("status") is True:
+            return
+        time.sleep(POLL_INTERVAL_S)
+    raise RuntimeError(
+        f"GIAS extract generation timed out after {POLL_TIMEOUT_S:.0f}s"
+    )
+
+
+def _download_zip(client: httpx.Client, generation_id: str) -> bytes:
+    """Fetch the generated extract and return its raw bytes.
+
+    Raises:
+        httpx.HTTPStatusError: If the request returns an error status.
+        RuntimeError: If the body does not begin with the ZIP magic ``PK``.
+    """
+    reply = client.get(AZURE_FILE_URL, params={"id": generation_id})
+    reply.raise_for_status()
+    body = reply.content
+    if body[:2] != b"PK":
+        raise RuntimeError("GIAS Azure response was not a ZIP archive")
+    return body
+
+
+def fetch_extract_zip() -> bytes:
+    """Run the full GIAS download flow and return the raw ZIP bytes.
+
+    One client (browser-like User-Agent, 30s default timeout, 120s read
+    timeout) is shared by the start, poll and download steps.
+    """
+    timeout = httpx.Timeout(30.0, read=120.0)
+    with httpx.Client(headers={"User-Agent": USER_AGENT}, timeout=timeout) as client:
+        generation_id = _start_generation(client)
+        _wait_for_generation(client, generation_id)
+        zip_bytes = _download_zip(client, generation_id)
+    return zip_bytes
+
+
+def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
+    """Parse the first ``.csv`` member of the ZIP into a DataFrame.
+
+    The member is decoded as cp1252 and only the columns named in
+    ``_CSV_COLUMNS`` are loaded.
+
+    Raises:
+        RuntimeError: If the archive has no member ending in ``.csv``.
+    """
+    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
+        first_csv = next(
+            (member for member in archive.namelist() if member.lower().endswith(".csv")),
+            None,
+        )
+        if first_csv is None:
+            raise RuntimeError("GIAS ZIP did not contain a CSV file")
+        payload = archive.read(first_csv)
+
+    decoded = io.StringIO(payload.decode("cp1252"))
+    return pl.read_csv(
+        decoded,
+        columns=_CSV_COLUMNS,
+        infer_schema_length=20000,
+        null_values=_NULL_VALUES,
+        truncate_ragged_lines=True,
+    )
+
+
+def _project_easting_northing(easting: pl.Series, northing: pl.Series) -> tuple[list[float | None], list[float | None]]:
+    """Project British National Grid coordinates to WGS84.
+
+    Args:
+        easting: Easting values (EPSG:27700).
+        northing: Northing values (EPSG:27700), aligned with ``easting``.
+
+    Returns:
+        ``(lat, lng)`` lists aligned with the inputs. A row is ``None`` in both
+        lists when its easting is 0, when either input is missing/NaN, or when
+        the projected result is not finite.
+    """
+    # Function-scope import: this module does not import math at top level.
+    import math
+
+    e = easting.to_numpy()
+    n = northing.to_numpy()
+    lng, lat = _to_wgs84.transform(e, n)
+    lng_out: list[float | None] = []
+    lat_out: list[float | None] = []
+    for east_val, north_val, lat_val, lng_val in zip(e, n, lat, lng):
+        usable = (
+            east_val is not None
+            and north_val is not None
+            and float(east_val) != 0.0
+            # Nulls arrive from NumPy as NaN rather than None, and pyproj
+            # reports a failed transform as inf instead of raising, so a
+            # finiteness test is what actually detects both cases.
+            and math.isfinite(float(east_val))
+            and math.isfinite(float(north_val))
+            and math.isfinite(float(lat_val))
+            and math.isfinite(float(lng_val))
+        )
+        if usable:
+            lng_out.append(float(lng_val))
+            lat_out.append(float(lat_val))
+        else:
+            lng_out.append(None)
+            lat_out.append(None)
+    return lat_out, lng_out
+
+
+def _format_age_range(low: int | None, high: int | None) -> str | None:
+    """Render a statutory age range as display text.
+
+    Returns ``None`` when both bounds are missing, ``"up to <high>"`` when
+    only the upper bound is known, ``"<low>+"`` when only the lower bound is
+    known, and ``"<low>–<high>"`` otherwise.
+    """
+    match (low, high):
+        case (None, None):
+            return None
+        case (None, _):
+            return f"up to {high}"
+        case (_, None):
+            return f"{low}+"
+        case _:
+            return f"{low}–{high}"
+
+
+def _format_address(street: str | None, locality: str | None, town: str | None) -> str | None:
+    """Join the non-blank address parts with ``", "``.
+
+    Each part is stripped of surrounding whitespace; missing and blank parts
+    are skipped. Returns ``None`` when nothing remains.
+    """
+    stripped = (piece.strip() for piece in (street, locality, town) if piece)
+    joined = ", ".join(piece for piece in stripped if piece)
+    return joined or None
+
+
+def _format_head_name(title: str | None, first: str | None, last: str | None) -> str | None:
+    """Join the non-blank name parts with single spaces.
+
+    Each part is stripped of surrounding whitespace; missing and blank parts
+    are skipped. Returns ``None`` when nothing remains.
+    """
+    stripped = (piece.strip() for piece in (title, first, last) if piece)
+    joined = " ".join(piece for piece in stripped if piece)
+    return joined or None
+
+
+def transform(zip_bytes: bytes) -> pl.DataFrame:
+    """Convert the GIAS extract ZIP into a clean schools DataFrame.
+
+    Steps: read the CSV from the ZIP, keep only rows whose status is "Open",
+    cast the numeric columns, drop rows without positive Easting/Northing,
+    project the coordinates to lat/lng, build the derived text columns
+    (age range, address, head name) and assemble the output frame.
+
+    Args:
+        zip_bytes: Raw bytes of the GIAS extract ZIP.
+
+    Returns:
+        One row per open establishment that has coordinates, with the
+        snake_case columns listed in the ``out`` frame below.
+    """
+    raw = _read_csv_from_zip(zip_bytes)
+
+    # Filter to currently-open establishments; the CSV also includes closed,
+    # proposed-to-open, and proposed-to-close rows we do not want on a map.
+    df = raw.filter(pl.col("EstablishmentStatus (name)") == "Open")
+
+    # URN is cast strictly; the other numeric casts use strict=False so values
+    # that do not parse become null instead of raising.
+    df = df.with_columns(
+        pl.col("URN").cast(pl.Int64),
+        pl.col("StatutoryLowAge").cast(pl.Int32, strict=False),
+        pl.col("StatutoryHighAge").cast(pl.Int32, strict=False),
+        pl.col("SchoolCapacity").cast(pl.Int32, strict=False),
+        pl.col("NumberOfPupils").cast(pl.Int32, strict=False),
+        pl.col("Easting").cast(pl.Float64, strict=False),
+        pl.col("Northing").cast(pl.Float64, strict=False),
+        # Remove any "%" sign and surrounding whitespace before the float cast.
+        pl.col("PercentageFSM")
+        .cast(pl.String)
+        .str.replace_all("%", "", literal=True)
+        .str.strip_chars()
+        .cast(pl.Float32, strict=False),
+    )
+
+    # Drop rows without coordinates — a small number of historic/dummy entries
+    # have Easting=0 which would map to the Atlantic.
+    df = df.filter(
+        pl.col("Easting").is_not_null()
+        & pl.col("Northing").is_not_null()
+        & (pl.col("Easting") > 0)
+        & (pl.col("Northing") > 0)
+    )
+
+    lat, lng = _project_easting_northing(df["Easting"], df["Northing"])
+
+    # The derived text columns are built row by row in Python; each list is
+    # aligned with df's row order.
+    age_range = [
+        _format_age_range(low, high)
+        for low, high in zip(df["StatutoryLowAge"].to_list(), df["StatutoryHighAge"].to_list())
+    ]
+    address = [
+        _format_address(street, locality, town)
+        for street, locality, town in zip(
+            df["Street"].to_list(),
+            df["Locality"].to_list(),
+            df["Town"].to_list(),
+        )
+    ]
+    head_name = [
+        _format_head_name(title, first, last)
+        for title, first, last in zip(
+            df["HeadTitle (name)"].to_list(),
+            df["HeadFirstName"].to_list(),
+            df["HeadLastName"].to_list(),
+        )
+    ]
+
+    out = pl.DataFrame(
+        {
+            "urn": df["URN"],
+            "name": df["EstablishmentName"],
+            "lat": pl.Series(lat, dtype=pl.Float64),
+            "lng": pl.Series(lng, dtype=pl.Float64),
+            "phase": df["PhaseOfEducation (name)"],
+            "type": df["TypeOfEstablishment (name)"],
+            "type_group": df["EstablishmentTypeGroup (name)"],
+            "age_range": pl.Series(age_range, dtype=pl.String),
+            "gender": df["Gender (name)"],
+            "religious_character": df["ReligiousCharacter (name)"],
+            "admissions_policy": df["AdmissionsPolicy (name)"],
+            "nursery_provision": df["NurseryProvision (name)"],
+            "sixth_form": df["OfficialSixthForm (name)"],
+            "capacity": df["SchoolCapacity"],
+            "pupils": df["NumberOfPupils"],
+            "fsm_percent": df["PercentageFSM"],
+            "trust": df["Trusts (name)"],
+            "address": pl.Series(address, dtype=pl.String),
+            "postcode": df["Postcode"],
+            "local_authority": df["LA (name)"],
+            "website": df["SchoolWebsite"],
+            "telephone": df["TelephoneNum"],
+            "head_name": pl.Series(head_name, dtype=pl.String),
+        }
+    )
+
+    # Drop any remaining rows where projection failed (extremely rare).
+    # This only removes rows for which _project_easting_northing produced None.
+    return out.filter(pl.col("lat").is_not_null() & pl.col("lng").is_not_null())
+
+
+def main() -> None:
+    """CLI entry point: fetch the GIAS extract and write it as parquet.
+
+    The raw ZIP is also saved under the local temp directory
+    (``gias/edubase.zip``) before being transformed.
+    """
+    cli = argparse.ArgumentParser(
+        description="Download the GIAS full establishments extract → parquet"
+    )
+    cli.add_argument(
+        "--output", type=Path, required=True, help="Output parquet file path"
+    )
+    output_path = cli.parse_args().output
+
+    zip_cache = local_tmp_dir() / "gias" / "edubase.zip"
+    zip_cache.parent.mkdir(parents=True, exist_ok=True)
+
+    print("Fetching GIAS extract...")
+    archive = fetch_extract_zip()
+    zip_cache.write_bytes(archive)
+    downloaded_mb = len(archive) / (1024 * 1024)
+    print(f"Downloaded {downloaded_mb:.1f} MB to {zip_cache}")
+
+    print("Transforming...")
+    schools = transform(archive)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    schools.write_parquet(output_path, compression="zstd")
+    print(f"Wrote {output_path} ({len(schools):,} open establishments)")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/listed_buildings.py
+++ b/pipeline/download/listed_buildings.py
@ -0,0 +1,53 @@
+"""Download Historic England listed-building point data.
+
+Source: Historic England National Heritage List for England (NHLE)
+License: Open Government Licence v3.0
+"""
+
+import argparse
+from pathlib import Path
+
+import httpx
+import pyogrio
+
+# ArcGIS Hub download endpoint for the Historic England listed-building
+# points item; the path and query string request layer 0 as a GeoPackage.
+URL = (
+    "https://opendata-historicengland.hub.arcgis.com/api/download/v1/items/"
+    "767f279327a24845bf47dfe5eae9862b/geoPackage?layers=0"
+)
+
+
+def main() -> None:
+    """Download the listed-building points GeoPackage to the ``--output`` path.
+
+    The response is streamed into a sibling ``<stem>.tmp<suffix>`` file,
+    inspected with ``pyogrio.read_info`` (at least one feature, point
+    geometry) and only then moved over the final path. The temp file is
+    removed whenever the download or the validation fails, so an aborted run
+    leaves no stale partial file next to the output.
+
+    Raises:
+        httpx.HTTPStatusError: If the server answers with an error status.
+        ValueError: If the file has no features or is not point geometry.
+    """
+    parser = argparse.ArgumentParser(
+        description="Download Historic England NHLE listed-building points"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output GeoPackage file path"
+    )
+    args = parser.parse_args()
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    # Keep the real suffix last so the temp file is still recognised by format.
+    tmp_path = args.output.with_name(f"{args.output.stem}.tmp{args.output.suffix}")
+
+    print("Downloading Historic England listed-building points...")
+    try:
+        with httpx.stream("GET", URL, follow_redirects=True, timeout=300) as response:
+            response.raise_for_status()
+            with tmp_path.open("wb") as fh:
+                for chunk in response.iter_bytes():
+                    fh.write(chunk)
+
+        info = pyogrio.read_info(tmp_path)
+        features = info.get("features", 0)
+        geometry_type = str(info.get("geometry_type") or "")
+        if features <= 0:
+            raise ValueError("Downloaded listed-buildings file contains no features")
+        if "Point" not in geometry_type:
+            raise ValueError(f"Expected point geometry, got {geometry_type!r}")
+
+        tmp_path.replace(args.output)
+    finally:
+        # After a successful replace the temp path no longer exists, so this
+        # is a no-op; on any failure it removes the partial/invalid download.
+        tmp_path.unlink(missing_ok=True)
+
+    size_mb = args.output.stat().st_size / (1024 * 1024)
+    print(
+        f"Saved {features} listed-building points to {args.output} ({size_mb:.1f} MB)"
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/lsoa_2011_to_2021.py
+++ b/pipeline/download/lsoa_2011_to_2021.py
@ -0,0 +1,85 @@
+"""Download the ONS LSOA 2011 → LSOA 2021 lookup.
+
+Source: ONS Open Geography Portal (LSOA11_LSOA21_LAD22_EW_LU_v5)
+License: Open Government Licence v3.0
+
+The lookup tells us how 2011 LSOA boundaries map to 2021 ones. We use it to
+remap older crime data (police.uk reported in 2011 codes pre-2022) into the
+2021 codes the rest of the pipeline keys on, so the crime-over-time chart can
+show the full history instead of only post-boundary-change years.
+
+CHGIND values: U (unchanged), S (split into multiple 2021), M (multiple 2011
+merged into one 2021), X (irregular reshape).
+"""
+
+import argparse
+from pathlib import Path
+
+import httpx
+import polars as pl
+
+# ArcGIS FeatureServer query endpoint (layer 0) for the lookup table.
+BASE_URL = (
+    "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
+    "LSOA11_LSOA21_LAD22_EW_LU_v5/FeatureServer/0/query"
+)
+# Rows requested per page (sent as resultRecordCount).
+PAGE_SIZE = 2000
+
+
+def download(output_path: Path) -> None:
+    """Page through the ONS lookup service and write the England rows as parquet.
+
+    Args:
+        output_path: Destination parquet file; parent directories are created.
+
+    Raises:
+        httpx.HTTPStatusError: If a page request returns an error status.
+        RuntimeError: If the service reports an error for a page, or if no
+            rows are returned at all.
+    """
+    print("Downloading ONS LSOA 2011 → 2021 lookup...")
+    rows: list[dict[str, str]] = []
+    offset = 0
+    # One client for every page so the connection is reused between requests.
+    with httpx.Client(timeout=60) as client:
+        while True:
+            params = {
+                "where": "1=1",
+                "outFields": "LSOA11CD,LSOA21CD,CHGIND",
+                "returnGeometry": "false",
+                # A stable ordering is needed for offset paging to be consistent.
+                "orderByFields": "LSOA11CD",
+                "f": "json",
+                "resultRecordCount": str(PAGE_SIZE),
+                "resultOffset": str(offset),
+            }
+            response = client.get(BASE_URL, params=params)
+            response.raise_for_status()
+            data = response.json()
+            # ArcGIS REST reports failures as HTTP 200 with an "error" object.
+            # Without this check such a page looks empty, paging stops, and a
+            # silently truncated lookup would be written.
+            if "error" in data:
+                raise RuntimeError(
+                    f"ONS lookup query failed at offset {offset}: {data['error']}"
+                )
+            features = data.get("features", [])
+            if not features:
+                break
+            for feat in features:
+                attrs = feat.get("attributes", {})
+                if attrs.get("LSOA11CD") and attrs.get("LSOA21CD"):
+                    rows.append(
+                        {
+                            "lsoa11": attrs["LSOA11CD"],
+                            "lsoa21": attrs["LSOA21CD"],
+                            # A missing change indicator is recorded as "U".
+                            "chgind": attrs.get("CHGIND") or "U",
+                        }
+                    )
+            print(f"  Fetched {len(features)} rows (offset={offset})")
+            # Stop on a short page unless the server says more rows remain.
+            if not data.get("exceededTransferLimit") and len(features) < PAGE_SIZE:
+                break
+            offset += len(features)
+
+    if not rows:
+        raise RuntimeError("ONS lookup returned no rows")
+
+    df = pl.DataFrame(rows)
+    # England-only matches the rest of the pipeline.
+    df = df.filter(pl.col("lsoa11").str.starts_with("E"))
+    print(f"England LSOA mappings: {df.height}")
+    print(f"  CHGIND breakdown: {df.group_by('chgind').len().sort('chgind').to_dicts()}")
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    df.write_parquet(output_path, compression="zstd")
+    print(f"Saved to {output_path}")
+
+
+def main() -> None:
+    """Parse ``--output`` from the command line and run the download."""
+    cli = argparse.ArgumentParser(description="Download ONS LSOA 2011 → 2021 lookup")
+    cli.add_argument("--output", type=Path, required=True)
+    download(cli.parse_args().output)
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/noise.py
+++ b/pipeline/download/noise.py
@ -148,6 +148,20 @@ def _looks_like_tiff(response: httpx.Response) -> bool:
    return "tiff" in content_type or response.content[:4] in (b"II*\x00", b"MM\x00*")


+def _validate_geotiff(path: Path) -> None:
+    """Open the raster and decode band 1 to catch truncated/corrupt downloads.
+
+    The WCS endpoint occasionally returns a TIFF that opens cleanly (valid
+    header + IFD) but whose encoded pixel data is truncated. The corruption
+    only surfaces when rasterio actually decodes a strip/tile.
+
+    Only band 1 is read; any further bands in the file are not decoded here.
+
+    Raises:
+        NoGeoTiffError: If rasterio raises while opening or reading the file;
+            the rasterio error is chained as the cause.
+    """
+    try:
+        with rasterio.open(path) as src:
+            # The decoded array is discarded; reading it is the validation.
+            src.read(1)
+    except (rasterio.errors.RasterioIOError, rasterio.errors.RasterioError) as e:
+        raise NoGeoTiffError(f"Downloaded TIFF failed to decode: {e}") from e
+
+
 def _fetch_tile_bytes(
    wcs_base: str,
    coverage_id: str,
@ -216,7 +230,17 @@ def _download_tile(
            content = _fetch_tile_bytes(
                wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
            )
-            tile_path.write_bytes(content)
+            # Write to a sibling temp file and rename atomically so partial
+            # writes (or truncated bodies that pass the magic-byte sniff but
+            # fail full decode) never poison the cache.
+            tmp_path = tile_path.with_suffix(tile_path.suffix + ".part")
+            tmp_path.write_bytes(content)
+            try:
+                _validate_geotiff(tmp_path)
+            except NoGeoTiffError:
+                tmp_path.unlink(missing_ok=True)
+                raise
+            tmp_path.replace(tile_path)
            return [tile_path], []
        except (
            NoGeoTiffError,