# perfect-postcode/pipeline/download/lsoa_2011_to_2021.py

"""Download the ONS LSOA 2011 → LSOA 2021 lookup.

Source: ONS Open Geography Portal (LSOA11_LSOA21_LAD22_EW_LU_v5)
License: Open Government Licence v3.0

The lookup tells us how 2011 LSOA boundaries map to 2021 ones. We use it to
remap older crime data (police.uk reported in 2011 codes pre-2022) into the
2021 codes the rest of the pipeline keys on, so the crime-over-time chart can
show the full history instead of only post-boundary-change years.

CHGIND values: U (unchanged), S (split into multiple 2021), M (multiple 2011
merged into one 2021), X (irregular reshape).
"""

import argparse
from pathlib import Path

import httpx
import polars as pl

# Query endpoint of layer 0 of the ArcGIS FeatureServer that hosts the
# LSOA11_LSOA21_LAD22_EW_LU_v5 lookup table.
BASE_URL = (
    "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
    "LSOA11_LSOA21_LAD22_EW_LU_v5/FeatureServer/0/query"
)
# Records requested per page (sent as resultRecordCount); download() also
# compares each page's length against it to detect a short final page.
PAGE_SIZE = 2000


def download(output_path: Path) -> None:
    """Fetch the LSOA 2011 → 2021 lookup and write England rows to Parquet.

    Pages through the FeatureServer query endpoint at ``BASE_URL``,
    ``PAGE_SIZE`` records at a time, keeping only features that carry both an
    ``LSOA11CD`` and an ``LSOA21CD``. A missing or empty ``CHGIND`` is stored
    as ``"U"``. Rows whose 2011 code does not start with ``"E"`` are dropped
    before the table is written with zstd compression.

    Args:
        output_path: Destination Parquet file; parent directories are created
            if they do not exist.

    Raises:
        httpx.HTTPStatusError: If a page request returns a non-success status.
        RuntimeError: If the service reports an error in the JSON body, or if
            no usable rows were returned at all.
    """
    print("Downloading ONS LSOA 2011 → 2021 lookup...")
    rows: list[dict[str, str]] = []
    offset = 0
    # One client for the whole pagination run so the connection is reused
    # across pages instead of being re-established for every request.
    with httpx.Client(timeout=60) as client:
        while True:
            params = {
                "where": "1=1",
                "outFields": "LSOA11CD,LSOA21CD,CHGIND",
                "returnGeometry": "false",
                # Stable ordering so offset-based paging neither skips nor
                # repeats records between requests.
                "orderByFields": "LSOA11CD",
                "f": "json",
                "resultRecordCount": str(PAGE_SIZE),
                "resultOffset": str(offset),
            }
            response = client.get(BASE_URL, params=params)
            response.raise_for_status()
            data = response.json()
            # ArcGIS REST services can report a failed query as HTTP 200 with
            # an "error" object and no "features". Treating that as "no more
            # pages" would silently save a truncated lookup, so fail loudly.
            error = data.get("error")
            if error:
                raise RuntimeError(
                    f"ONS lookup query failed at offset={offset}: {error}"
                )
            features = data.get("features", [])
            if not features:
                break
            for feat in features:
                attrs = feat.get("attributes", {})
                if attrs.get("LSOA11CD") and attrs.get("LSOA21CD"):
                    rows.append(
                        {
                            "lsoa11": attrs["LSOA11CD"],
                            "lsoa21": attrs["LSOA21CD"],
                            "chgind": attrs.get("CHGIND") or "U",
                        }
                    )
            print(f"  Fetched {len(features)} rows (offset={offset})")
            # Stop on a short page unless the server says it truncated the
            # response (exceededTransferLimit), in which case more remain.
            if not data.get("exceededTransferLimit") and len(features) < PAGE_SIZE:
                break
            offset += len(features)

    if not rows:
        raise RuntimeError("ONS lookup returned no rows")

    df = pl.DataFrame(rows)
    # England-only matches the rest of the pipeline.
    df = df.filter(pl.col("lsoa11").str.starts_with("E"))
    print(f"England LSOA mappings: {df.height}")
    print(f"  CHGIND breakdown: {df.group_by('chgind').len().sort('chgind').to_dicts()}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Command-line entry point: read ``--output`` and run the download."""
    cli = argparse.ArgumentParser(
        description="Download ONS LSOA 2011 → 2021 lookup",
    )
    # The destination is mandatory and converted to a Path by argparse.
    cli.add_argument("--output", required=True, type=Path)
    download(cli.parse_args().output)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()