perfect-postcode/pipeline/download/lsoa_2011_to_2021.py
2026-05-25 13:20:17 +01:00

85 lines
2.8 KiB
Python

"""Download the ONS LSOA 2011 → LSOA 2021 lookup.
Source: ONS Open Geography Portal (LSOA11_LSOA21_LAD22_EW_LU_v5)
License: Open Government Licence v3.0
The lookup tells us how 2011 LSOA boundaries map to 2021 ones. We use it to
remap older crime data (police.uk reported in 2011 codes pre-2022) into the
2021 codes the rest of the pipeline keys on, so the crime-over-time chart can
show the full history instead of only post-boundary-change years.
CHGIND values: U (unchanged), S (split into multiple 2021), M (multiple 2011
merged into one 2021), X (irregular reshape).
"""
import argparse
from pathlib import Path
import httpx
import polars as pl
# Query endpoint (layer 0) of the ArcGIS feature service hosting the
# LSOA11_LSOA21_LAD22_EW_LU_v5 lookup named in the module docstring.
BASE_URL = (
"https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
"LSOA11_LSOA21_LAD22_EW_LU_v5/FeatureServer/0/query"
)
# Number of records requested per page (sent as ``resultRecordCount``). A page
# shorter than this, with no ``exceededTransferLimit`` flag, ends the paging.
PAGE_SIZE = 2000
def download(output_path: Path) -> None:
    """Download the ONS LSOA 2011 → 2021 lookup and save it as Parquet.

    Fetches every lookup row from the feature service at ``BASE_URL``, keeps
    only rows whose 2011 code starts with "E", prints a short summary and
    writes the columns ``lsoa11``, ``lsoa21`` and ``chgind`` to
    ``output_path`` with zstd compression. Missing parent directories are
    created.

    Args:
        output_path: Destination Parquet file.

    Raises:
        httpx.HTTPStatusError: If a page request returns an error status.
        RuntimeError: If the service reports an error in its JSON body, or if
            no usable rows were returned at all.
    """
    print("Downloading ONS LSOA 2011 → 2021 lookup...")
    rows = _fetch_lookup_rows()
    if not rows:
        raise RuntimeError("ONS lookup returned no rows")

    df = pl.DataFrame(rows)
    # England-only matches the rest of the pipeline.
    df = df.filter(pl.col("lsoa11").str.starts_with("E"))
    print(f"England LSOA mappings: {df.height}")
    print(f" CHGIND breakdown: {df.group_by('chgind').len().sort('chgind').to_dicts()}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def _fetch_lookup_rows() -> list[dict[str, str]]:
    """Page through the feature service and return the usable lookup rows.

    Each returned dict has the keys ``lsoa11``, ``lsoa21`` and ``chgind``.
    Features missing either LSOA code are dropped; a missing or empty CHGIND
    is recorded as "U".

    Raises:
        httpx.HTTPStatusError: If a page request returns an error status.
        RuntimeError: If a response body carries an ``error`` member.
    """
    rows: list[dict[str, str]] = []
    offset = 0
    # One client for the whole run so successive pages reuse the connection.
    with httpx.Client(timeout=60) as client:
        while True:
            params = {
                "where": "1=1",
                "outFields": "LSOA11CD,LSOA21CD,CHGIND",
                "returnGeometry": "false",
                # LSOA11CD alone is not unique (a split 2011 LSOA has one row
                # per 2021 LSOA), so LSOA21CD is added as a tiebreaker to keep
                # the ordering -- and therefore offset paging -- deterministic.
                "orderByFields": "LSOA11CD,LSOA21CD",
                "f": "json",
                "resultRecordCount": str(PAGE_SIZE),
                "resultOffset": str(offset),
            }
            response = client.get(BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()

            # ArcGIS services can report failures as HTTP 200 with an "error"
            # object in the body. Without this check such a response looks
            # like an empty final page and the lookup is silently truncated.
            error = data.get("error")
            if error:
                raise RuntimeError(
                    f"ONS lookup query failed at offset {offset}: {error}"
                )

            features = data.get("features", [])
            if not features:
                break

            for feat in features:
                attrs = feat.get("attributes", {})
                if attrs.get("LSOA11CD") and attrs.get("LSOA21CD"):
                    rows.append(
                        {
                            "lsoa11": attrs["LSOA11CD"],
                            "lsoa21": attrs["LSOA21CD"],
                            "chgind": attrs.get("CHGIND") or "U",
                        }
                    )
            print(f" Fetched {len(features)} rows (offset={offset})")

            # Stop on a short page unless the server says it truncated the
            # result; a full page is followed by one more request.
            if not data.get("exceededTransferLimit") and len(features) < PAGE_SIZE:
                break
            # Advance by what was actually returned, in case the server caps
            # pages below PAGE_SIZE.
            offset += len(features)
    return rows
def main() -> None:
    """Command-line entry point.

    Reads the single required ``--output`` option, converted to a ``Path``,
    and passes it to ``download``.
    """
    cli = argparse.ArgumentParser(description="Download ONS LSOA 2011 → 2021 lookup")
    cli.add_argument("--output", type=Path, required=True)
    download(cli.parse_args().output)


if __name__ == "__main__":
    main()