"""Download National Statistics UPRN Lookup (December 2025, Epoch 123). Maps Unique Property Reference Numbers (UPRNs) to administrative and statistical geographies (wards, output areas, LSOAs, etc.) across GB. Source: https://geoportal.statistics.gov.uk/datasets/ons::national-statistics-uprn-lookup-december-2025-epoch-123 License: Contains Royal Mail, Ordnance Survey, and ONS data (OGL v3.0) """ import argparse import tempfile from pathlib import Path import polars as pl from pipeline.utils import download, extract_zip URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data" def find_csvs(extract_path: Path) -> list[Path]: """Find all NSUL regional CSVs inside the extracted archive.""" csvs = sorted(extract_path.rglob("NSUL_*.csv")) if not csvs: csvs = sorted(extract_path.rglob("*.csv")) if not csvs: raise FileNotFoundError(f"No CSV files found in {extract_path}") print(f"Found {len(csvs)} CSV(s):") for f in csvs: print(f" {f.name}") return csvs def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None: # Some regional files infer different types for the same column (e.g. # ruc21ind is String in most but Int64 in YH). Read all code columns as # String to avoid schema mismatches. CODE_COLS = { "ruc21ind": pl.String, "oac21ind": pl.String, "imd19ind": pl.String, } df = pl.concat( [ pl.scan_csv(p, try_parse_dates=True, schema_overrides=CODE_COLS) for p in csv_paths ] ) print(f"Columns: {df.collect_schema().names()}") parquet_path.parent.mkdir(parents=True, exist_ok=True) df.sink_parquet(parquet_path, compression="zstd") n = pl.scan_parquet(parquet_path).select(pl.len()).collect().item() print(f"Saved {n:,} rows to {parquet_path}") def main() -> None: parser = argparse.ArgumentParser( description="Download National Statistics UPRN Lookup" ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) args = parser.parse_args() with tempfile.TemporaryDirectory() as cache_dir: zip_path = Path(cache_dir) / "uprn_lookup.zip" extract_path = Path(cache_dir) / "uprn_extracted" download(URL, zip_path, timeout=600) extract_zip(zip_path, extract_path) csv_paths = find_csvs(extract_path) convert_to_parquet(csv_paths, args.output) if __name__ == "__main__": main()