# perfect-postcode/pipeline/download/uprn_lookup.py

"""Download National Statistics UPRN Lookup (December 2025, Epoch 123).

Maps Unique Property Reference Numbers (UPRNs) to administrative and
statistical geographies (wards, output areas, LSOAs, etc.) across GB.

Source: https://geoportal.statistics.gov.uk/datasets/ons::national-statistics-uprn-lookup-december-2025-epoch-123
License: Contains Royal Mail, Ordnance Survey, and ONS data (OGL v3.0)
"""

import argparse
import tempfile
from pathlib import Path

import polars as pl

from pipeline.local_temp import local_tmp_dir
from pipeline.utils import code_col_overrides, download, extract_zip

# Download link for the NSUL archive: main() saves the response as a zip file
# and unpacks it before looking for CSVs.
# NOTE(review): the ArcGIS item id presumably pins the December 2025
# (Epoch 123) release named in the module docstring — confirm, and update both
# together when moving to a newer release.
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"


def find_csvs(extract_path: Path) -> list[Path]:
    """Locate the CSV paths under an extracted NSUL archive.

    Searches ``extract_path`` recursively for entries named ``NSUL_*.csv``.
    If that pattern matches nothing, every ``*.csv`` beneath the directory is
    used instead. The matches are listed on stdout by file name.

    Args:
        extract_path: Directory the archive was extracted into.

    Returns:
        The matching paths in sorted order.

    Raises:
        FileNotFoundError: If neither pattern matches anything.
    """
    # Try the specific NSUL naming first, then fall back to any CSV; the
    # ``else`` branch only runs when no pattern produced a match.
    for pattern in ("NSUL_*.csv", "*.csv"):
        matches = sorted(extract_path.rglob(pattern))
        if matches:
            break
    else:
        raise FileNotFoundError(f"No CSV files found in {extract_path}")

    print(f"Found {len(matches)} CSV(s):")
    for csv_file in matches:
        print(f"  {csv_file.name}")
    return matches


def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
    """Concatenate the regional CSVs and write them as one zstd parquet file.

    Some regional files infer different types for the same column (e.g.
    ``ruc21ind`` is String in most but Int64 in YH), and string codes like
    "UN1" appear deep in the data, so the classification-index code columns
    are read as String to avoid schema mismatches. NSUL renames the year
    suffixes each release and polars silently ignores overrides for missing
    columns, so the overrides are derived from the header of the first CSV
    (matching on the suffix-free stem) rather than hard-coded.

    Args:
        csv_paths: CSV files to combine; the first one supplies the header
            used to build the schema overrides.
        parquet_path: Destination file; its parent directory is created if
            it does not exist.
    """
    header = pl.scan_csv(csv_paths[0]).collect_schema().names()
    overrides = code_col_overrides(header)

    # Every file is scanned lazily with the same overrides so the frames
    # share one schema when concatenated.
    regional_frames = [
        pl.scan_csv(csv_file, try_parse_dates=True, schema_overrides=overrides)
        for csv_file in csv_paths
    ]
    combined = pl.concat(regional_frames)
    print(f"Columns: {combined.collect_schema().names()}")

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.sink_parquet(parquet_path, compression="zstd")

    # Count rows from the written file rather than from the lazy frame.
    written = pl.scan_parquet(parquet_path).select(pl.len()).collect()
    n = written.item()
    print(f"Saved {n:,} rows to {parquet_path}")


def main() -> None:
    """Command-line entry point: fetch the NSUL archive and save it as parquet.

    Parses the required ``--output`` path, downloads the archive from ``URL``
    into a temporary directory created under ``local_tmp_dir()``, extracts
    it, and converts the CSVs found inside into the output parquet file. The
    temporary directory is removed when the ``with`` block exits.
    """
    cli = argparse.ArgumentParser(
        description="Download National Statistics UPRN Lookup"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as cache_dir:
        work_dir = Path(cache_dir)
        archive = work_dir / "uprn_lookup.zip"
        unpacked = work_dir / "uprn_extracted"

        download(URL, archive, timeout=600)
        extract_zip(archive, unpacked)

        convert_to_parquet(find_csvs(unpacked), output_path)


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()