"""Download the ONS LSOA 2011 → LSOA 2021 lookup. Source: ONS Open Geography Portal (LSOA11_LSOA21_LAD22_EW_LU_v5) License: Open Government Licence v3.0 The lookup tells us how 2011 LSOA boundaries map to 2021 ones. We use it to remap older crime data (police.uk reported in 2011 codes pre-2022) into the 2021 codes the rest of the pipeline keys on, so the crime-over-time chart can show the full history instead of only post-boundary-change years. CHGIND values: U (unchanged), S (split into multiple 2021), M (multiple 2011 merged into one 2021), X (irregular reshape). """ import argparse from pathlib import Path import httpx import polars as pl BASE_URL = ( "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/" "LSOA11_LSOA21_LAD22_EW_LU_v5/FeatureServer/0/query" ) PAGE_SIZE = 2000 def download(output_path: Path) -> None: print("Downloading ONS LSOA 2011 → 2021 lookup...") rows: list[dict[str, str]] = [] offset = 0 while True: params = { "where": "1=1", "outFields": "LSOA11CD,LSOA21CD,CHGIND", "returnGeometry": "false", "orderByFields": "LSOA11CD", "f": "json", "resultRecordCount": str(PAGE_SIZE), "resultOffset": str(offset), } response = httpx.get(BASE_URL, params=params, timeout=60) response.raise_for_status() data = response.json() features = data.get("features", []) if not features: break for feat in features: attrs = feat.get("attributes", {}) if attrs.get("LSOA11CD") and attrs.get("LSOA21CD"): rows.append( { "lsoa11": attrs["LSOA11CD"], "lsoa21": attrs["LSOA21CD"], "chgind": attrs.get("CHGIND") or "U", } ) print(f" Fetched {len(features)} rows (offset={offset})") if not data.get("exceededTransferLimit") and len(features) < PAGE_SIZE: break offset += len(features) if not rows: raise RuntimeError("ONS lookup returned no rows") df = pl.DataFrame(rows) # England-only matches the rest of the pipeline. df = df.filter(pl.col("lsoa11").str.starts_with("E")) print(f"England LSOA mappings: {df.height}") print(f" CHGIND breakdown: {df.group_by('chgind').len().sort('chgind').to_dicts()}") output_path.parent.mkdir(parents=True, exist_ok=True) df.write_parquet(output_path, compression="zstd") print(f"Saved to {output_path}") def main() -> None: parser = argparse.ArgumentParser(description="Download ONS LSOA 2011 → 2021 lookup") parser.add_argument("--output", type=Path, required=True) args = parser.parse_args() download(args.output) if __name__ == "__main__": main()