has issues
This commit is contained in:
parent
2e112d7398
commit
c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions
51
pipeline/download/conservation_areas.py
Normal file
51
pipeline/download/conservation_areas.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
"""Download Historic England conservation area polygons.
|
||||
|
||||
Source: Historic England Conservation Areas
|
||||
License: Open Government Licence v3.0
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pyogrio
|
||||
|
||||
URL = (
|
||||
"https://opendata-historicengland.hub.arcgis.com/api/download/v1/items/"
|
||||
"446bc9bf8b5b440386d0c504caa3dac5/geoPackage?layers=0"
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
    """Download the conservation-areas GeoPackage, validate it, and install it.

    The body is streamed to a sibling temp file (same directory, ``.tmp``
    inserted before the original suffix), inspected with ``pyogrio.read_info``,
    and only renamed onto ``--output`` once it is known to contain at least
    one feature with a polygon geometry type.

    Raises:
        httpx.HTTPStatusError: if the download responds with an error status.
        ValueError: if the file has no features or is not polygon geometry.
    """
    parser = argparse.ArgumentParser(
        description="Download Historic England conservation area polygons"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoPackage file path"
    )
    args = parser.parse_args()
    args.output.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = args.output.with_name(f"{args.output.stem}.tmp{args.output.suffix}")

    print("Downloading Historic England conservation areas...")
    try:
        with httpx.stream("GET", URL, follow_redirects=True, timeout=300) as response:
            response.raise_for_status()
            with tmp_path.open("wb") as fh:
                for chunk in response.iter_bytes():
                    fh.write(chunk)

        info = pyogrio.read_info(tmp_path)
        features = info.get("features", 0)
        geometry_type = info.get("geometry_type")
        if features <= 0:
            raise ValueError("Downloaded conservation areas file contains no features")
        if "Polygon" not in str(geometry_type):
            raise ValueError(f"Expected polygon geometry, got {geometry_type!r}")

        tmp_path.replace(args.output)
    finally:
        # On success the temp file has already been renamed away, so this is a
        # no-op; on any failure it removes the partial/invalid download so it
        # does not linger next to the real output.
        tmp_path.unlink(missing_ok=True)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Saved {features} conservation areas to {args.output} ({size_mb:.1f} MB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
349
pipeline/download/gias.py
Normal file
349
pipeline/download/gias.py
Normal file
|
|
@ -0,0 +1,349 @@
|
|||
"""Download the Get Information About Schools (GIAS) full establishments extract.
|
||||
|
||||
GIAS is the DfE register of all educational establishments in England, updated
|
||||
daily. The CSV is generated on-demand via a four-step interaction with the
|
||||
public Downloads page (there is no static URL):
|
||||
|
||||
1. GET /Downloads — extract anti-forgery token, the `all.edubase.data` tag,
|
||||
and the FileGeneratedDate that the server expects for that tag today.
|
||||
2. POST /Downloads/Collate — submit the form to start file generation. The
|
||||
redirect URL contains a generation UUID.
|
||||
3. Poll /Downloads/GenerateAjax/{id} until status:true.
|
||||
4. GET the Azure blob URL with ?id={id} — returns a ZIP containing
|
||||
`edubasealldataYYYYMMDD.csv`.
|
||||
|
||||
The CSV is cp1252-encoded with 135 columns. We keep the fields useful for a
|
||||
schools map (identification, status, phase, age range, religious character,
|
||||
admissions policy, headline figures, contact details) and project Easting/
|
||||
Northing (EPSG:27700) to WGS84 lat/lng.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
from pyproj import Transformer
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
|
||||
BASE_URL = "https://get-information-schools.service.gov.uk"
|
||||
DOWNLOADS_URL = f"{BASE_URL}/Downloads"
|
||||
COLLATE_URL = f"{BASE_URL}/Downloads/Collate"
|
||||
AJAX_URL = f"{BASE_URL}/Downloads/GenerateAjax"
|
||||
AZURE_FILE_URL = (
|
||||
"https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/File.xhtml"
|
||||
)
|
||||
EXTRACT_TAG = "all.edubase.data"
|
||||
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
POLL_INTERVAL_S = 2.0
|
||||
POLL_TIMEOUT_S = 300.0
|
||||
|
||||
_TOKEN_RE = re.compile(
|
||||
r'name="__RequestVerificationToken"[^>]*value="([^"]+)"', re.IGNORECASE
|
||||
)
|
||||
_GEN_DATE_RE = re.compile(
|
||||
r'Downloads_0__FileGeneratedDate"[^>]*value="([^"]+)"', re.IGNORECASE
|
||||
)
|
||||
_GEN_ID_RE = re.compile(
|
||||
r"/Downloads/Generated/([0-9a-f-]{36})", re.IGNORECASE
|
||||
)
|
||||
|
||||
# Columns to read from the CSV (the file has 135; we keep what is useful for a
|
||||
# schools map and contact card). Names must match the CSV header verbatim.
|
||||
_CSV_COLUMNS: list[str] = [
|
||||
"URN",
|
||||
"EstablishmentName",
|
||||
"TypeOfEstablishment (name)",
|
||||
"EstablishmentTypeGroup (name)",
|
||||
"EstablishmentStatus (name)",
|
||||
"PhaseOfEducation (name)",
|
||||
"StatutoryLowAge",
|
||||
"StatutoryHighAge",
|
||||
"NurseryProvision (name)",
|
||||
"OfficialSixthForm (name)",
|
||||
"Gender (name)",
|
||||
"ReligiousCharacter (name)",
|
||||
"AdmissionsPolicy (name)",
|
||||
"SchoolCapacity",
|
||||
"NumberOfPupils",
|
||||
"PercentageFSM",
|
||||
"Trusts (name)",
|
||||
"Street",
|
||||
"Locality",
|
||||
"Town",
|
||||
"County (name)",
|
||||
"Postcode",
|
||||
"SchoolWebsite",
|
||||
"TelephoneNum",
|
||||
"HeadTitle (name)",
|
||||
"HeadFirstName",
|
||||
"HeadLastName",
|
||||
"Easting",
|
||||
"Northing",
|
||||
"LA (name)",
|
||||
]
|
||||
|
||||
_NULL_VALUES = ["", "NULL", "Not applicable", "Does not apply"]
|
||||
|
||||
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||||
|
||||
|
||||
def _extract_token(html: str) -> str:
    """Return the ``__RequestVerificationToken`` value found in *html*.

    Raises:
        RuntimeError: if ``_TOKEN_RE`` does not match anywhere in the page.
    """
    found = _TOKEN_RE.search(html)
    if found is not None:
        return found.group(1)
    raise RuntimeError("Could not find __RequestVerificationToken on GIAS page")
|
||||
|
||||
|
||||
def _extract_file_generated_date(html: str) -> str:
    """Return the ``FileGeneratedDate`` form value found in *html*.

    The lookup uses ``_GEN_DATE_RE``, which targets the element whose id ends
    in ``Downloads_0__FileGeneratedDate``.

    Raises:
        RuntimeError: if the pattern does not match anywhere in the page.
    """
    found = _GEN_DATE_RE.search(html)
    if found is not None:
        return found.group(1)
    raise RuntimeError(
        "Could not find FileGeneratedDate for the establishments extract"
    )
|
||||
|
||||
|
||||
def _start_generation(client: httpx.Client) -> str:
    """Submit the Downloads form and return the generation UUID.

    Fetches the Downloads page to obtain the anti-forgery token and the
    FileGeneratedDate, posts them to the Collate endpoint (following
    redirects), then looks for the UUID first in the final URL and, failing
    that, in the response body.

    Raises:
        httpx.HTTPStatusError: if either request returns an error status.
        RuntimeError: if the token, date, or generation UUID cannot be found.
    """
    landing = client.get(DOWNLOADS_URL)
    landing.raise_for_status()
    page_html = landing.text

    form = {
        "__RequestVerificationToken": _extract_token(page_html),
        "Downloads[0].Tag": EXTRACT_TAG,
        "Downloads[0].FileGeneratedDate": _extract_file_generated_date(page_html),
        "Downloads[0].Selected": "true",
    }
    collated = client.post(COLLATE_URL, data=form, follow_redirects=True)
    collated.raise_for_status()

    # Prefer the redirect target; only scan the body when the URL has no UUID.
    found = _GEN_ID_RE.search(str(collated.url))
    if found is None:
        found = _GEN_ID_RE.search(collated.text)
    if found is None:
        raise RuntimeError("GIAS Collate did not yield a generation UUID")
    return found.group(1)
|
||||
|
||||
|
||||
def _wait_for_generation(client: httpx.Client, generation_id: str) -> None:
    """Poll the GenerateAjax endpoint until the extract reports ``status: true``.

    Polls every ``POLL_INTERVAL_S`` seconds for at most ``POLL_TIMEOUT_S``
    seconds (measured with the monotonic clock).

    Raises:
        httpx.HTTPStatusError: if a poll request returns an error status.
        RuntimeError: if the extract is not ready before the timeout elapses.
    """
    deadline = time.monotonic() + POLL_TIMEOUT_S
    while time.monotonic() < deadline:
        response = client.get(
            f"{AJAX_URL}/{generation_id}",
            headers={"X-Requested-With": "XMLHttpRequest"},
        )
        response.raise_for_status()
        # The endpoint returns JSON whose payload is itself a JSON-encoded
        # string, e.g. response.json() yields the string `{"status":true,...}`
        # which needs decoding a second time. Accept an already-decoded object
        # too, so a server-side change to plain JSON does not crash with a
        # TypeError from json.loads.
        body = response.json()
        payload = json.loads(body) if isinstance(body, str) else body
        if isinstance(payload, dict) and payload.get("status") is True:
            return
        time.sleep(POLL_INTERVAL_S)
    raise RuntimeError(
        f"GIAS extract generation timed out after {POLL_TIMEOUT_S:.0f}s"
    )
|
||||
|
||||
|
||||
def _download_zip(client: httpx.Client, generation_id: str) -> bytes:
    """Fetch the generated extract and return its raw bytes.

    Raises:
        httpx.HTTPStatusError: if the request returns an error status.
        RuntimeError: if the body does not begin with the ``PK`` ZIP signature.
    """
    reply = client.get(AZURE_FILE_URL, params={"id": generation_id})
    reply.raise_for_status()
    body = reply.content
    if body[:2] != b"PK":
        raise RuntimeError("GIAS Azure response was not a ZIP archive")
    return body
|
||||
|
||||
|
||||
def fetch_extract_zip() -> bytes:
    """Run the full GIAS download flow and return the raw ZIP bytes.

    One ``httpx.Client`` (browser-like User-Agent, 30 s default timeout with a
    120 s read timeout) is shared by the start, poll, and download steps.
    """
    timeout = httpx.Timeout(30.0, read=120.0)
    with httpx.Client(headers={"User-Agent": USER_AGENT}, timeout=timeout) as session:
        gen_id = _start_generation(session)
        _wait_for_generation(session, gen_id)
        return _download_zip(session, gen_id)
|
||||
|
||||
|
||||
def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
    """Load the first ``.csv`` member of the archive into a DataFrame.

    The member is decoded as cp1252 and only the columns named in
    ``_CSV_COLUMNS`` are read; values listed in ``_NULL_VALUES`` become null.

    Raises:
        RuntimeError: if the archive contains no ``.csv`` member.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        first_csv = next(
            (member for member in archive.namelist() if member.lower().endswith(".csv")),
            None,
        )
        if first_csv is None:
            raise RuntimeError("GIAS ZIP did not contain a CSV file")
        payload = archive.read(first_csv)

    return pl.read_csv(
        io.StringIO(payload.decode("cp1252")),
        columns=_CSV_COLUMNS,
        infer_schema_length=20000,
        null_values=_NULL_VALUES,
        truncate_ragged_lines=True,
    )
|
||||
|
||||
|
||||
def _project_easting_northing(
    easting: pl.Series, northing: pl.Series
) -> tuple[list[float | None], list[float | None]]:
    """Project easting/northing pairs through ``_to_wgs84`` to lat/lng lists.

    Args:
        easting: Easting values, one per row.
        northing: Northing values, aligned with *easting*.

    Returns:
        ``(lat, lng)`` lists in row order. An entry is ``None`` in both lists
        when the easting is missing or zero, or when the projected coordinate
        is not a finite number.
    """
    import math  # local import: the module's import block does not include math

    e = easting.to_numpy()
    n = northing.to_numpy()
    lng, lat = _to_wgs84.transform(e, n)

    lat_out: list[float | None] = []
    lng_out: list[float | None] = []
    for east_val, lat_val, lng_val in zip(e, lat, lng):
        usable = (
            east_val is not None
            and float(east_val) != 0.0
            # Null inputs arrive as NaN after to_numpy(), and pyproj signals a
            # failed transform with inf rather than raising; neither is a real
            # coordinate, so emit None that callers can filter with is_not_null.
            and math.isfinite(float(lat_val))
            and math.isfinite(float(lng_val))
        )
        if usable:
            lat_out.append(float(lat_val))
            lng_out.append(float(lng_val))
        else:
            lat_out.append(None)
            lng_out.append(None)
    return lat_out, lng_out
|
||||
|
||||
|
||||
def _format_age_range(low: int | None, high: int | None) -> str | None:
|
||||
if low is None and high is None:
|
||||
return None
|
||||
if low is None:
|
||||
return f"up to {high}"
|
||||
if high is None:
|
||||
return f"{low}+"
|
||||
return f"{low}–{high}"
|
||||
|
||||
|
||||
def _format_address(street: str | None, locality: str | None, town: str | None) -> str | None:
|
||||
parts = [part.strip() for part in (street, locality, town) if part]
|
||||
parts = [part for part in parts if part]
|
||||
return ", ".join(parts) if parts else None
|
||||
|
||||
|
||||
def _format_head_name(title: str | None, first: str | None, last: str | None) -> str | None:
|
||||
parts = [part.strip() for part in (title, first, last) if part]
|
||||
parts = [part for part in parts if part]
|
||||
return " ".join(parts) if parts else None
|
||||
|
||||
|
||||
def transform(zip_bytes: bytes) -> pl.DataFrame:
    """Convert the GIAS extract ZIP into a clean schools DataFrame.

    Steps: read the CSV from the archive, keep rows whose status is "Open",
    cast numeric columns (non-strictly, except URN), drop rows lacking positive
    Easting/Northing, project to lat/lng, build the derived text columns
    (age range, address, head name), and return the renamed output columns.

    Args:
        zip_bytes: Raw bytes of the GIAS extract ZIP.

    Returns:
        One row per open establishment with usable (non-null, finite)
        coordinates.
    """
    raw = _read_csv_from_zip(zip_bytes)

    # Filter to currently-open establishments; the CSV also includes closed,
    # proposed-to-open, and proposed-to-close rows we do not want on a map.
    df = raw.filter(pl.col("EstablishmentStatus (name)") == "Open")

    df = df.with_columns(
        pl.col("URN").cast(pl.Int64),
        pl.col("StatutoryLowAge").cast(pl.Int32, strict=False),
        pl.col("StatutoryHighAge").cast(pl.Int32, strict=False),
        pl.col("SchoolCapacity").cast(pl.Int32, strict=False),
        pl.col("NumberOfPupils").cast(pl.Int32, strict=False),
        pl.col("Easting").cast(pl.Float64, strict=False),
        pl.col("Northing").cast(pl.Float64, strict=False),
        pl.col("PercentageFSM")
        .cast(pl.String)
        .str.replace_all("%", "", literal=True)
        .str.strip_chars()
        .cast(pl.Float32, strict=False),
    )

    # Drop rows without coordinates — a small number of historic/dummy entries
    # have Easting=0 which would map to the Atlantic.
    df = df.filter(
        pl.col("Easting").is_not_null()
        & pl.col("Northing").is_not_null()
        & (pl.col("Easting") > 0)
        & (pl.col("Northing") > 0)
    )

    lat, lng = _project_easting_northing(df["Easting"], df["Northing"])

    age_range = [
        _format_age_range(low, high)
        for low, high in zip(df["StatutoryLowAge"].to_list(), df["StatutoryHighAge"].to_list())
    ]
    address = [
        _format_address(street, locality, town)
        for street, locality, town in zip(
            df["Street"].to_list(),
            df["Locality"].to_list(),
            df["Town"].to_list(),
        )
    ]
    head_name = [
        _format_head_name(title, first, last)
        for title, first, last in zip(
            df["HeadTitle (name)"].to_list(),
            df["HeadFirstName"].to_list(),
            df["HeadLastName"].to_list(),
        )
    ]

    out = pl.DataFrame(
        {
            "urn": df["URN"],
            "name": df["EstablishmentName"],
            "lat": pl.Series(lat, dtype=pl.Float64),
            "lng": pl.Series(lng, dtype=pl.Float64),
            "phase": df["PhaseOfEducation (name)"],
            "type": df["TypeOfEstablishment (name)"],
            "type_group": df["EstablishmentTypeGroup (name)"],
            "age_range": pl.Series(age_range, dtype=pl.String),
            "gender": df["Gender (name)"],
            "religious_character": df["ReligiousCharacter (name)"],
            "admissions_policy": df["AdmissionsPolicy (name)"],
            "nursery_provision": df["NurseryProvision (name)"],
            "sixth_form": df["OfficialSixthForm (name)"],
            "capacity": df["SchoolCapacity"],
            "pupils": df["NumberOfPupils"],
            "fsm_percent": df["PercentageFSM"],
            "trust": df["Trusts (name)"],
            "address": pl.Series(address, dtype=pl.String),
            "postcode": df["Postcode"],
            "local_authority": df["LA (name)"],
            "website": df["SchoolWebsite"],
            "telephone": df["TelephoneNum"],
            "head_name": pl.Series(head_name, dtype=pl.String),
        }
    )

    # Drop any remaining rows where projection failed (extremely rare). A
    # failed projection can surface as NaN/inf rather than null, and those are
    # not caught by is_not_null alone, so require finite values as well.
    return out.filter(
        pl.col("lat").is_not_null()
        & pl.col("lng").is_not_null()
        & pl.col("lat").is_finite()
        & pl.col("lng").is_finite()
    )
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: fetch the GIAS extract and write it out as parquet.

    A copy of the downloaded ZIP is saved under the local temp directory
    (``gias/edubase.zip``) before the transformed table is written to
    ``--output`` with zstd compression.
    """
    cli = argparse.ArgumentParser(
        description="Download the GIAS full establishments extract → parquet"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = cli.parse_args()

    cache_dir = local_tmp_dir() / "gias"
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / "edubase.zip"

    print("Fetching GIAS extract...")
    zip_bytes = fetch_extract_zip()
    cache_path.write_bytes(zip_bytes)
    downloaded_mb = len(zip_bytes) / (1024 * 1024)
    print(f"Downloaded {downloaded_mb:.1f} MB to {cache_path}")

    print("Transforming...")
    schools = transform(zip_bytes)
    destination = args.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    schools.write_parquet(destination, compression="zstd")
    print(f"Wrote {destination} ({len(schools):,} open establishments)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
53
pipeline/download/listed_buildings.py
Normal file
53
pipeline/download/listed_buildings.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
"""Download Historic England listed-building point data.
|
||||
|
||||
Source: Historic England National Heritage List for England (NHLE)
|
||||
License: Open Government Licence v3.0
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import pyogrio
|
||||
|
||||
URL = (
|
||||
"https://opendata-historicengland.hub.arcgis.com/api/download/v1/items/"
|
||||
"767f279327a24845bf47dfe5eae9862b/geoPackage?layers=0"
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
    """Download the NHLE listed-building GeoPackage, validate it, and install it.

    The body is streamed to a sibling temp file (same directory, ``.tmp``
    inserted before the original suffix), inspected with ``pyogrio.read_info``,
    and only renamed onto ``--output`` once it is known to contain at least
    one feature with a point geometry type.

    Raises:
        httpx.HTTPStatusError: if the download responds with an error status.
        ValueError: if the file has no features or is not point geometry.
    """
    parser = argparse.ArgumentParser(
        description="Download Historic England NHLE listed-building points"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoPackage file path"
    )
    args = parser.parse_args()
    args.output.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = args.output.with_name(f"{args.output.stem}.tmp{args.output.suffix}")

    print("Downloading Historic England listed-building points...")
    try:
        with httpx.stream("GET", URL, follow_redirects=True, timeout=300) as response:
            response.raise_for_status()
            with tmp_path.open("wb") as fh:
                for chunk in response.iter_bytes():
                    fh.write(chunk)

        info = pyogrio.read_info(tmp_path)
        features = info.get("features", 0)
        geometry_type = str(info.get("geometry_type") or "")
        if features <= 0:
            raise ValueError("Downloaded listed-buildings file contains no features")
        if "Point" not in geometry_type:
            raise ValueError(f"Expected point geometry, got {geometry_type!r}")

        tmp_path.replace(args.output)
    finally:
        # On success the temp file has already been renamed away, so this is a
        # no-op; on any failure it removes the partial/invalid download so it
        # does not linger next to the real output.
        tmp_path.unlink(missing_ok=True)

    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(
        f"Saved {features} listed-building points to {args.output} ({size_mb:.1f} MB)"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
85
pipeline/download/lsoa_2011_to_2021.py
Normal file
85
pipeline/download/lsoa_2011_to_2021.py
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
"""Download the ONS LSOA 2011 → LSOA 2021 lookup.
|
||||
|
||||
Source: ONS Open Geography Portal (LSOA11_LSOA21_LAD22_EW_LU_v5)
|
||||
License: Open Government Licence v3.0
|
||||
|
||||
The lookup tells us how 2011 LSOA boundaries map to 2021 ones. We use it to
|
||||
remap older crime data (police.uk reported in 2011 codes pre-2022) into the
|
||||
2021 codes the rest of the pipeline keys on, so the crime-over-time chart can
|
||||
show the full history instead of only post-boundary-change years.
|
||||
|
||||
CHGIND values: U (unchanged), S (split into multiple 2021), M (multiple 2011
|
||||
merged into one 2021), X (irregular reshape).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
|
||||
BASE_URL = (
|
||||
"https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
|
||||
"LSOA11_LSOA21_LAD22_EW_LU_v5/FeatureServer/0/query"
|
||||
)
|
||||
PAGE_SIZE = 2000
|
||||
|
||||
|
||||
def download(output_path: Path) -> None:
    """Page through the ONS lookup service and write England rows to parquet.

    Requests ``PAGE_SIZE`` records at a time ordered by ``LSOA11CD``, keeps
    rows that have both an LSOA11 and an LSOA21 code (a missing CHGIND is
    stored as ``"U"``), filters to 2011 codes starting with ``"E"``, and
    writes the result with zstd compression.

    Args:
        output_path: Destination parquet file; parent directories are created.

    Raises:
        httpx.HTTPStatusError: if a page request returns an error status.
        RuntimeError: if the service reports an error in the response body, or
            no rows were collected at all.
    """
    print("Downloading ONS LSOA 2011 → 2021 lookup...")
    rows: list[dict[str, str]] = []
    offset = 0
    # One client for all pages so the connection is reused between requests.
    with httpx.Client(timeout=60) as client:
        while True:
            params = {
                "where": "1=1",
                "outFields": "LSOA11CD,LSOA21CD,CHGIND",
                "returnGeometry": "false",
                "orderByFields": "LSOA11CD",
                "f": "json",
                "resultRecordCount": str(PAGE_SIZE),
                "resultOffset": str(offset),
            }
            response = client.get(BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()
            # ArcGIS REST services can report failures as an "error" object in
            # the body of an HTTP 200 response. Without this check such a
            # reply looks like "no more features" and would silently truncate
            # the lookup.
            if "error" in data:
                raise RuntimeError(
                    f"ONS lookup query failed at offset {offset}: {data['error']}"
                )
            features = data.get("features", [])
            if not features:
                break
            for feat in features:
                attrs = feat.get("attributes", {})
                if attrs.get("LSOA11CD") and attrs.get("LSOA21CD"):
                    rows.append(
                        {
                            "lsoa11": attrs["LSOA11CD"],
                            "lsoa21": attrs["LSOA21CD"],
                            "chgind": attrs.get("CHGIND") or "U",
                        }
                    )
            print(f" Fetched {len(features)} rows (offset={offset})")
            # A short page with no transfer-limit flag means this was the last.
            if not data.get("exceededTransferLimit") and len(features) < PAGE_SIZE:
                break
            offset += len(features)

    if not rows:
        raise RuntimeError("ONS lookup returned no rows")

    df = pl.DataFrame(rows)
    # England-only matches the rest of the pipeline.
    df = df.filter(pl.col("lsoa11").str.starts_with("E"))
    print(f"England LSOA mappings: {df.height}")
    print(f" CHGIND breakdown: {df.group_by('chgind').len().sort('chgind').to_dicts()}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse ``--output`` and run the lookup download."""
    cli = argparse.ArgumentParser(description="Download ONS LSOA 2011 → 2021 lookup")
    cli.add_argument("--output", type=Path, required=True)
    destination = cli.parse_args().output
    download(destination)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -148,6 +148,20 @@ def _looks_like_tiff(response: httpx.Response) -> bool:
|
|||
return "tiff" in content_type or response.content[:4] in (b"II*\x00", b"MM\x00*")
|
||||
|
||||
|
||||
def _validate_geotiff(path: Path) -> None:
    """Open the raster and decode band 1 to catch truncated/corrupt downloads.

    The WCS endpoint occasionally returns a TIFF that opens cleanly (valid
    header + IFD) but whose encoded pixel data is truncated. The corruption
    only surfaces when rasterio actually decodes a strip/tile, so a header
    check alone is not enough.

    Raises:
        NoGeoTiffError: if opening or reading the file raises a rasterio error.
    """
    try:
        with rasterio.open(path) as dataset:
            # The pixel values are discarded; the read exists only to force a decode.
            dataset.read(1)
    except (rasterio.errors.RasterioIOError, rasterio.errors.RasterioError) as exc:
        raise NoGeoTiffError(f"Downloaded TIFF failed to decode: {exc}") from exc
|
||||
|
||||
|
||||
def _fetch_tile_bytes(
|
||||
wcs_base: str,
|
||||
coverage_id: str,
|
||||
|
|
@ -216,7 +230,17 @@ def _download_tile(
|
|||
content = _fetch_tile_bytes(
|
||||
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
|
||||
)
|
||||
tile_path.write_bytes(content)
|
||||
# Write to a sibling temp file and rename atomically so partial
|
||||
# writes (or truncated bodies that pass the magic-byte sniff but
|
||||
# fail full decode) never poison the cache.
|
||||
tmp_path = tile_path.with_suffix(tile_path.suffix + ".part")
|
||||
tmp_path.write_bytes(content)
|
||||
try:
|
||||
_validate_geotiff(tmp_path)
|
||||
except NoGeoTiffError:
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
raise
|
||||
tmp_path.replace(tile_path)
|
||||
return [tile_path], []
|
||||
except (
|
||||
NoGeoTiffError,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue