has issues

This commit is contained in:
Andras Schmelczer 2026-05-25 13:20:17 +01:00
parent 2e112d7398
commit c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions

View file

@ -0,0 +1,51 @@
"""Download Historic England conservation area polygons.
Source: Historic England Conservation Areas
License: Open Government Licence v3.0
"""
import argparse
from pathlib import Path
import httpx
import pyogrio
# ArcGIS Hub download endpoint for the conservation-areas item; the path and
# query string request layer 0 exported as a GeoPackage.
URL = (
    "https://opendata-historicengland.hub.arcgis.com/api/download/v1/items/"
    "446bc9bf8b5b440386d0c504caa3dac5/geoPackage?layers=0"
)
def main() -> None:
    """Download the conservation-area GeoPackage and validate it before saving.

    The response is streamed to a sibling ``*.tmp`` file, inspected with
    ``pyogrio.read_info`` and only then moved over ``--output``, so a failed
    or invalid download never replaces an existing good file. The temporary
    file is removed if any step fails.

    Raises:
        ValueError: If the downloaded file reports no features or its
            geometry type does not contain "Polygon".
    """
    parser = argparse.ArgumentParser(
        description="Download Historic England conservation area polygons"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoPackage file path"
    )
    args = parser.parse_args()

    args.output.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = args.output.with_name(f"{args.output.stem}.tmp{args.output.suffix}")

    print("Downloading Historic England conservation areas...")
    try:
        with httpx.stream("GET", URL, follow_redirects=True, timeout=300) as response:
            response.raise_for_status()
            with tmp_path.open("wb") as fh:
                for chunk in response.iter_bytes():
                    fh.write(chunk)

        info = pyogrio.read_info(tmp_path)
        features = info.get("features", 0)
        geometry_type = info.get("geometry_type")
        if features <= 0:
            raise ValueError("Downloaded conservation areas file contains no features")
        if "Polygon" not in str(geometry_type):
            raise ValueError(f"Expected polygon geometry, got {geometry_type!r}")

        tmp_path.replace(args.output)
    finally:
        # After a successful replace() the temp path no longer exists, so this
        # only removes leftovers from a failed download or failed validation.
        tmp_path.unlink(missing_ok=True)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Saved {features} conservation areas to {args.output} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()

349
pipeline/download/gias.py Normal file
View file

@ -0,0 +1,349 @@
"""Download the Get Information About Schools (GIAS) full establishments extract.
GIAS is the DfE register of all educational establishments in England, updated
daily. The CSV is generated on-demand via a four-step interaction with the
public Downloads page (there is no static URL):
1. GET /Downloads — extract the anti-forgery token, the `all.edubase.data` tag,
   and the FileGeneratedDate that the server expects for that tag today.
2. POST /Downloads/Collate — submit the form to start file generation. The
   redirect URL contains a generation UUID.
3. Poll /Downloads/GenerateAjax/{id} until status:true.
4. GET the Azure blob URL with ?id={id} — returns a ZIP containing
   `edubasealldataYYYYMMDD.csv`.
The CSV is cp1252-encoded with 135 columns. We keep the fields useful for a
schools map (identification, status, phase, age range, religious character,
admissions policy, headline figures, contact details) and project Easting/
Northing (EPSG:27700) to WGS84 lat/lng.
"""
import argparse
import io
import json
import math
import re
import time
import zipfile
from pathlib import Path

import httpx
import polars as pl
from pyproj import Transformer

from pipeline.local_temp import local_tmp_dir
# Endpoints used by the four-step download flow described in the module docstring.
BASE_URL = "https://get-information-schools.service.gov.uk"
DOWNLOADS_URL = f"{BASE_URL}/Downloads"
COLLATE_URL = f"{BASE_URL}/Downloads/Collate"
AJAX_URL = f"{BASE_URL}/Downloads/GenerateAjax"
# Fetched with ?id=<generation id> once generation has finished.
AZURE_FILE_URL = (
    "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/File.xhtml"
)
# Value posted as Downloads[0].Tag to select the full establishments extract.
EXTRACT_TAG = "all.edubase.data"
# Browser-style User-Agent sent on every request made by fetch_extract_zip().
# NOTE(review): presumably the service rejects default client UAs — confirm.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
# Seconds to sleep between status polls, and the total polling budget in seconds.
POLL_INTERVAL_S = 2.0
POLL_TIMEOUT_S = 300.0
# Captures the value of the __RequestVerificationToken form field. The pattern
# requires the name attribute to appear before the value attribute.
_TOKEN_RE = re.compile(
    r'name="__RequestVerificationToken"[^>]*value="([^"]+)"', re.IGNORECASE
)
# Captures the FileGeneratedDate value of the form entry at index 0.
# NOTE(review): assumes the establishments extract is always entry 0 — confirm.
_GEN_DATE_RE = re.compile(
    r'Downloads_0__FileGeneratedDate"[^>]*value="([^"]+)"', re.IGNORECASE
)
# Captures the 36-character generation id that follows /Downloads/Generated/.
_GEN_ID_RE = re.compile(
    r"/Downloads/Generated/([0-9a-f-]{36})", re.IGNORECASE
)
# Columns to read from the CSV (the file has 135; we keep what is useful for a
# schools map and contact card). Names must match the CSV header verbatim.
_CSV_COLUMNS: list[str] = [
    "URN",
    "EstablishmentName",
    "TypeOfEstablishment (name)",
    "EstablishmentTypeGroup (name)",
    "EstablishmentStatus (name)",
    "PhaseOfEducation (name)",
    "StatutoryLowAge",
    "StatutoryHighAge",
    "NurseryProvision (name)",
    "OfficialSixthForm (name)",
    "Gender (name)",
    "ReligiousCharacter (name)",
    "AdmissionsPolicy (name)",
    "SchoolCapacity",
    "NumberOfPupils",
    "PercentageFSM",
    "Trusts (name)",
    "Street",
    "Locality",
    "Town",
    "County (name)",
    "Postcode",
    "SchoolWebsite",
    "TelephoneNum",
    "HeadTitle (name)",
    "HeadFirstName",
    "HeadLastName",
    "Easting",
    "Northing",
    "LA (name)",
]
# Cell values that pl.read_csv should load as null.
_NULL_VALUES = ["", "NULL", "Not applicable", "Does not apply"]
# British National Grid (EPSG:27700) to WGS84 (EPSG:4326). With always_xy=True
# the transformer takes (easting, northing) and returns (lng, lat).
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
def _extract_token(html: str) -> str:
    """Return the anti-forgery token found in the Downloads page HTML.

    Raises:
        RuntimeError: If no ``__RequestVerificationToken`` field is present.
    """
    found = _TOKEN_RE.search(html)
    if found is not None:
        return found.group(1)
    raise RuntimeError("Could not find __RequestVerificationToken on GIAS page")
def _extract_file_generated_date(html: str) -> str:
    """Return the FileGeneratedDate form value found in the Downloads page HTML.

    Raises:
        RuntimeError: If the page does not contain the expected field.
    """
    found = _GEN_DATE_RE.search(html)
    if found is not None:
        return found.group(1)
    raise RuntimeError(
        "Could not find FileGeneratedDate for the establishments extract"
    )
def _start_generation(client: httpx.Client) -> str:
    """Request generation of the extract and return its generation UUID.

    Loads the Downloads page to read the anti-forgery token and the
    FileGeneratedDate, posts the selection form, then looks for the generation
    id first in the final (post-redirect) URL and then in the response body.

    Raises:
        RuntimeError: If the token, the date or the generation id is missing.
    """
    landing = client.get(DOWNLOADS_URL)
    landing.raise_for_status()
    page_html = landing.text

    form_fields = {
        "__RequestVerificationToken": _extract_token(page_html),
        "Downloads[0].Tag": EXTRACT_TAG,
        "Downloads[0].FileGeneratedDate": _extract_file_generated_date(page_html),
        "Downloads[0].Selected": "true",
    }
    collated = client.post(COLLATE_URL, data=form_fields, follow_redirects=True)
    collated.raise_for_status()

    found = _GEN_ID_RE.search(str(collated.url))
    if found is None:
        # Fall back to the body only when the URL carries no id.
        found = _GEN_ID_RE.search(collated.text)
    if found is None:
        raise RuntimeError("GIAS Collate did not yield a generation UUID")
    return found.group(1)
def _wait_for_generation(client: httpx.Client, generation_id: str) -> None:
    """Poll the GenerateAjax endpoint until the extract reports ``status: true``.

    Polls every ``POLL_INTERVAL_S`` seconds for at most ``POLL_TIMEOUT_S``
    seconds.

    Raises:
        RuntimeError: If the extract is not ready before the timeout elapses.
    """
    deadline = time.monotonic() + POLL_TIMEOUT_S
    while time.monotonic() < deadline:
        response = client.get(
            f"{AJAX_URL}/{generation_id}",
            headers={"X-Requested-With": "XMLHttpRequest"},
        )
        response.raise_for_status()
        # The endpoint returns JSON whose payload is itself a JSON-encoded string,
        # e.g. response.json() returns the string `{"status":true,...}` which we
        # then need to decode a second time. Decode the second layer only when it
        # is a string, so a plain JSON object does not raise TypeError.
        payload = response.json()
        if isinstance(payload, str):
            payload = json.loads(payload)
        # Anything that is not an object is treated as "not ready yet".
        if isinstance(payload, dict) and payload.get("status") is True:
            return
        time.sleep(POLL_INTERVAL_S)
    raise RuntimeError(
        f"GIAS extract generation timed out after {POLL_TIMEOUT_S:.0f}s"
    )
def _download_zip(client: httpx.Client, generation_id: str) -> bytes:
    """Fetch the generated extract and return its bytes.

    Raises:
        RuntimeError: If the body does not start with the ``PK`` ZIP signature.
    """
    reply = client.get(AZURE_FILE_URL, params={"id": generation_id})
    reply.raise_for_status()
    if reply.content.startswith(b"PK"):
        return reply.content
    raise RuntimeError("GIAS Azure response was not a ZIP archive")
def fetch_extract_zip() -> bytes:
    """Start generation, wait for it to finish, and return the ZIP bytes.

    All three steps share one HTTP client configured with the module's
    User-Agent header.
    """
    timeout = httpx.Timeout(30.0, read=120.0)
    with httpx.Client(headers={"User-Agent": USER_AGENT}, timeout=timeout) as session:
        extract_id = _start_generation(session)
        _wait_for_generation(session, extract_id)
        return _download_zip(session, extract_id)
def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
    """Load the first ``.csv`` member of the archive into a DataFrame.

    The member is decoded as cp1252 and only the columns listed in
    ``_CSV_COLUMNS`` are read.

    Raises:
        RuntimeError: If the archive contains no ``.csv`` member.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        first_csv = next(
            (member for member in archive.namelist() if member.lower().endswith(".csv")),
            None,
        )
        if first_csv is None:
            raise RuntimeError("GIAS ZIP did not contain a CSV file")
        payload = archive.read(first_csv)

    decoded = payload.decode("cp1252")
    return pl.read_csv(
        io.StringIO(decoded),
        columns=_CSV_COLUMNS,
        infer_schema_length=20000,
        null_values=_NULL_VALUES,
        truncate_ragged_lines=True,
    )
def _project_easting_northing(
    easting: pl.Series, northing: pl.Series
) -> tuple[list[float | None], list[float | None]]:
    """Project British National Grid coordinates to WGS84.

    Args:
        easting: Eastings (EPSG:27700).
        northing: Northings (EPSG:27700), aligned with ``easting``.

    Returns:
        ``(lat, lng)`` lists aligned with the inputs. An entry is ``None`` in
        both lists when the easting is missing or zero, or when the projected
        latitude/longitude is not a finite number.
    """
    e = easting.to_numpy()
    n = northing.to_numpy()
    lng, lat = _to_wgs84.transform(e, n)

    lat_out: list[float | None] = []
    lng_out: list[float | None] = []
    for east_val, lat_val, lng_val in zip(e, lat, lng):
        missing = east_val is None or float(east_val) == 0.0
        # A NaN input or a failed projection yields a non-finite float rather
        # than None (pyproj returns inf on errors unless errcheck=True), so it
        # must be mapped to None here for callers' null filters to see it.
        if missing or not (math.isfinite(lat_val) and math.isfinite(lng_val)):
            lat_out.append(None)
            lng_out.append(None)
        else:
            lat_out.append(float(lat_val))
            lng_out.append(float(lng_val))
    return lat_out, lng_out
def _format_age_range(low: int | None, high: int | None) -> str | None:
if low is None and high is None:
return None
if low is None:
return f"up to {high}"
if high is None:
return f"{low}+"
return f"{low}{high}"
def _format_address(street: str | None, locality: str | None, town: str | None) -> str | None:
parts = [part.strip() for part in (street, locality, town) if part]
parts = [part for part in parts if part]
return ", ".join(parts) if parts else None
def _format_head_name(title: str | None, first: str | None, last: str | None) -> str | None:
parts = [part.strip() for part in (title, first, last) if part]
parts = [part for part in parts if part]
return " ".join(parts) if parts else None
def transform(zip_bytes: bytes) -> pl.DataFrame:
    """Build the cleaned schools table from the raw GIAS extract ZIP.

    Keeps only establishments whose status is "Open" and that have positive
    Easting/Northing values, casts the numeric columns, projects the
    coordinates to lat/lng, and adds formatted age range, address and head
    name columns. Rows with a null lat or lng are dropped from the result.
    """
    lenient_int_columns = (
        "StatutoryLowAge",
        "StatutoryHighAge",
        "SchoolCapacity",
        "NumberOfPupils",
    )

    schools = (
        _read_csv_from_zip(zip_bytes)
        # The CSV also includes closed, proposed-to-open and proposed-to-close
        # establishments, none of which belong on a map.
        .filter(pl.col("EstablishmentStatus (name)") == "Open")
        .with_columns(
            pl.col("URN").cast(pl.Int64),
            *(pl.col(name).cast(pl.Int32, strict=False) for name in lenient_int_columns),
            *(pl.col(name).cast(pl.Float64, strict=False) for name in ("Easting", "Northing")),
            pl.col("PercentageFSM")
            .cast(pl.String)
            .str.replace_all("%", "", literal=True)
            .str.strip_chars()
            .cast(pl.Float32, strict=False),
        )
        # A small number of historic/dummy entries have Easting=0, which would
        # otherwise land in the Atlantic.
        .filter(
            pl.col("Easting").is_not_null()
            & pl.col("Northing").is_not_null()
            & (pl.col("Easting") > 0)
            & (pl.col("Northing") > 0)
        )
    )

    def column_values(*names: str):
        """Yield one tuple per row holding the values of the named columns."""
        return zip(*(schools[name].to_list() for name in names))

    latitudes, longitudes = _project_easting_northing(
        schools["Easting"], schools["Northing"]
    )
    age_ranges = [
        _format_age_range(low, high)
        for low, high in column_values("StatutoryLowAge", "StatutoryHighAge")
    ]
    addresses = [
        _format_address(street, locality, town)
        for street, locality, town in column_values("Street", "Locality", "Town")
    ]
    head_names = [
        _format_head_name(title, first, last)
        for title, first, last in column_values(
            "HeadTitle (name)", "HeadFirstName", "HeadLastName"
        )
    ]

    result = pl.DataFrame(
        {
            "urn": schools["URN"],
            "name": schools["EstablishmentName"],
            "lat": pl.Series(latitudes, dtype=pl.Float64),
            "lng": pl.Series(longitudes, dtype=pl.Float64),
            "phase": schools["PhaseOfEducation (name)"],
            "type": schools["TypeOfEstablishment (name)"],
            "type_group": schools["EstablishmentTypeGroup (name)"],
            "age_range": pl.Series(age_ranges, dtype=pl.String),
            "gender": schools["Gender (name)"],
            "religious_character": schools["ReligiousCharacter (name)"],
            "admissions_policy": schools["AdmissionsPolicy (name)"],
            "nursery_provision": schools["NurseryProvision (name)"],
            "sixth_form": schools["OfficialSixthForm (name)"],
            "capacity": schools["SchoolCapacity"],
            "pupils": schools["NumberOfPupils"],
            "fsm_percent": schools["PercentageFSM"],
            "trust": schools["Trusts (name)"],
            "address": pl.Series(addresses, dtype=pl.String),
            "postcode": schools["Postcode"],
            "local_authority": schools["LA (name)"],
            "website": schools["SchoolWebsite"],
            "telephone": schools["TelephoneNum"],
            "head_name": pl.Series(head_names, dtype=pl.String),
        }
    )

    # Discard any row whose projection produced no coordinates.
    has_coordinates = pl.col("lat").is_not_null() & pl.col("lng").is_not_null()
    return result.filter(has_coordinates)
def main() -> None:
    """Fetch the GIAS extract, keep a copy of the ZIP, and write the parquet.

    The raw ZIP is saved under ``local_tmp_dir() / "gias"`` and the transformed
    table is written to the ``--output`` path with zstd compression.
    """
    cli = argparse.ArgumentParser(
        description="Download the GIAS full establishments extract → parquet"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = cli.parse_args()

    cache_path = local_tmp_dir() / "gias" / "edubase.zip"
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    print("Fetching GIAS extract...")
    zip_bytes = fetch_extract_zip()
    cache_path.write_bytes(zip_bytes)
    print(f"Downloaded {len(zip_bytes) / (1024 * 1024):.1f} MB to {cache_path}")

    print("Transforming...")
    df = transform(zip_bytes)

    destination = args.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(destination, compression="zstd")
    print(f"Wrote {args.output} ({len(df):,} open establishments)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,53 @@
"""Download Historic England listed-building point data.
Source: Historic England National Heritage List for England (NHLE)
License: Open Government Licence v3.0
"""
import argparse
from pathlib import Path
import httpx
import pyogrio
# ArcGIS Hub download endpoint for the listed-buildings item; the path and
# query string request layer 0 exported as a GeoPackage.
URL = (
    "https://opendata-historicengland.hub.arcgis.com/api/download/v1/items/"
    "767f279327a24845bf47dfe5eae9862b/geoPackage?layers=0"
)
def main() -> None:
    """Download the listed-building GeoPackage and validate it before saving.

    The response is streamed to a sibling ``*.tmp`` file, inspected with
    ``pyogrio.read_info`` and only then moved over ``--output``, so a failed
    or invalid download never replaces an existing good file. The temporary
    file is removed if any step fails.

    Raises:
        ValueError: If the downloaded file reports no features or its
            geometry type does not contain "Point".
    """
    parser = argparse.ArgumentParser(
        description="Download Historic England NHLE listed-building points"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoPackage file path"
    )
    args = parser.parse_args()

    args.output.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = args.output.with_name(f"{args.output.stem}.tmp{args.output.suffix}")

    print("Downloading Historic England listed-building points...")
    try:
        with httpx.stream("GET", URL, follow_redirects=True, timeout=300) as response:
            response.raise_for_status()
            with tmp_path.open("wb") as fh:
                for chunk in response.iter_bytes():
                    fh.write(chunk)

        info = pyogrio.read_info(tmp_path)
        features = info.get("features", 0)
        geometry_type = str(info.get("geometry_type") or "")
        if features <= 0:
            raise ValueError("Downloaded listed-buildings file contains no features")
        if "Point" not in geometry_type:
            raise ValueError(f"Expected point geometry, got {geometry_type!r}")

        tmp_path.replace(args.output)
    finally:
        # After a successful replace() the temp path no longer exists, so this
        # only removes leftovers from a failed download or failed validation.
        tmp_path.unlink(missing_ok=True)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(
        f"Saved {features} listed-building points to {args.output} ({size_mb:.1f} MB)"
    )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,85 @@
"""Download the ONS LSOA 2011 → LSOA 2021 lookup.
Source: ONS Open Geography Portal (LSOA11_LSOA21_LAD22_EW_LU_v5)
License: Open Government Licence v3.0
The lookup tells us how 2011 LSOA boundaries map to 2021 ones. We use it to
remap older crime data (police.uk reported in 2011 codes pre-2022) into the
2021 codes the rest of the pipeline keys on, so the crime-over-time chart can
show the full history instead of only post-boundary-change years.
CHGIND values: U (unchanged), S (split into multiple 2021), M (multiple 2011
merged into one 2021), X (irregular reshape).
"""
import argparse
from pathlib import Path
import httpx
import polars as pl
# Query endpoint of the ArcGIS FeatureServer layer that holds the lookup table.
BASE_URL = (
    "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
    "LSOA11_LSOA21_LAD22_EW_LU_v5/FeatureServer/0/query"
)
# Rows requested per page (sent as resultRecordCount).
PAGE_SIZE = 2000
def download(output_path: Path) -> None:
    """Page through the ONS lookup service and write the England rows to parquet.

    Args:
        output_path: Destination parquet file; parent directories are created.

    Raises:
        RuntimeError: If the service reports a query error or returns no rows.
    """
    print("Downloading ONS LSOA 2011 → 2021 lookup...")
    rows: list[dict[str, str]] = []
    offset = 0
    while True:
        params = {
            "where": "1=1",
            "outFields": "LSOA11CD,LSOA21CD,CHGIND",
            "returnGeometry": "false",
            # A stable ordering keeps offset-based paging consistent.
            "orderByFields": "LSOA11CD",
            "f": "json",
            "resultRecordCount": str(PAGE_SIZE),
            "resultOffset": str(offset),
        }
        response = httpx.get(BASE_URL, params=params, timeout=60)
        response.raise_for_status()
        data = response.json()

        # ArcGIS REST services can report a failed query as HTTP 200 with an
        # "error" object and no "features". Without this check such a reply
        # would look like the last page and a truncated lookup would be saved.
        error = data.get("error")
        if error:
            raise RuntimeError(f"ONS lookup query failed at offset {offset}: {error}")

        features = data.get("features", [])
        if not features:
            break
        for feat in features:
            attrs = feat.get("attributes", {})
            if attrs.get("LSOA11CD") and attrs.get("LSOA21CD"):
                rows.append(
                    {
                        "lsoa11": attrs["LSOA11CD"],
                        "lsoa21": attrs["LSOA21CD"],
                        # A missing change indicator is stored as unchanged.
                        "chgind": attrs.get("CHGIND") or "U",
                    }
                )
        print(f" Fetched {len(features)} rows (offset={offset})")
        # Stop on a short page unless the server says more rows are available.
        if not data.get("exceededTransferLimit") and len(features) < PAGE_SIZE:
            break
        offset += len(features)

    if not rows:
        raise RuntimeError("ONS lookup returned no rows")

    df = pl.DataFrame(rows)
    # England-only matches the rest of the pipeline.
    df = df.filter(pl.col("lsoa11").str.starts_with("E"))
    print(f"England LSOA mappings: {df.height}")
    print(f" CHGIND breakdown: {df.group_by('chgind').len().sort('chgind').to_dicts()}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """Read ``--output`` from the command line and run the download."""
    cli = argparse.ArgumentParser(description="Download ONS LSOA 2011 → 2021 lookup")
    cli.add_argument("--output", type=Path, required=True)
    download(cli.parse_args().output)


if __name__ == "__main__":
    main()

View file

@ -148,6 +148,20 @@ def _looks_like_tiff(response: httpx.Response) -> bool:
return "tiff" in content_type or response.content[:4] in (b"II*\x00", b"MM\x00*")
def _validate_geotiff(path: Path) -> None:
    """Decode band 1 of the raster at *path* to detect corrupt downloads.

    A TIFF from the WCS endpoint can have a valid header and IFD, and so open
    without error, while its encoded pixel data is truncated. That damage is
    only detected once rasterio decodes a strip or tile, so the band is read
    here and the result discarded.

    Raises:
        NoGeoTiffError: If rasterio cannot open or decode the file.
    """
    try:
        with rasterio.open(path) as dataset:
            dataset.read(1)
    except (rasterio.errors.RasterioIOError, rasterio.errors.RasterioError) as exc:
        raise NoGeoTiffError(f"Downloaded TIFF failed to decode: {exc}") from exc
def _fetch_tile_bytes(
wcs_base: str,
coverage_id: str,
@ -216,7 +230,17 @@ def _download_tile(
content = _fetch_tile_bytes(
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
)
tile_path.write_bytes(content)
# Write to a sibling temp file and rename atomically so partial
# writes (or truncated bodies that pass the magic-byte sniff but
# fail full decode) never poison the cache.
tmp_path = tile_path.with_suffix(tile_path.suffix + ".part")
tmp_path.write_bytes(content)
try:
_validate_geotiff(tmp_path)
except NoGeoTiffError:
tmp_path.unlink(missing_ok=True)
raise
tmp_path.replace(tile_path)
return [tile_path], []
except (
NoGeoTiffError,