# perfect-postcode/pipeline/download/uprn_lookup.py

"""Download National Statistics UPRN Lookup (December 2025, Epoch 123).
Maps Unique Property Reference Numbers (UPRNs) to administrative and
statistical geographies (wards, output areas, LSOAs, etc.) across GB.
Source: https://geoportal.statistics.gov.uk/datasets/ons::national-statistics-uprn-lookup-december-2025-epoch-123
License: Contains Royal Mail, Ordnance Survey, and ONS data (OGL v3.0)
"""
import argparse
import tempfile
from pathlib import Path
import polars as pl
from pipeline.utils import download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
def find_csvs(extract_path: Path) -> list[Path]:
    """Find all NSUL regional CSVs inside the extracted archive.

    Searches ``extract_path`` recursively for files named ``NSUL_*.csv``.
    If none are found, falls back to every ``*.csv`` file under the
    directory. Only regular files are returned; directories whose names
    happen to match a pattern are skipped. The names of the files found
    are printed to stdout.

    Args:
        extract_path: Directory the archive was extracted into.

    Returns:
        The matching CSV paths, in sorted order.

    Raises:
        FileNotFoundError: If no CSV file exists under ``extract_path``.
    """
    # Prefer the specific NSUL naming pattern; fall back to any CSV.
    for pattern in ("NSUL_*.csv", "*.csv"):
        # rglob yields directories too, so keep regular files only.
        csvs = sorted(p for p in extract_path.rglob(pattern) if p.is_file())
        if csvs:
            break
    else:
        raise FileNotFoundError(f"No CSV files found in {extract_path}")

    print(f"Found {len(csvs)} CSV(s):")
    for f in csvs:
        print(f" {f.name}")
    return csvs
def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
    """Concatenate the given CSVs and write them as one parquet file.

    Each CSV is opened with ``pl.scan_csv``, the scans are concatenated,
    and the result is written to ``parquet_path`` with zstd compression.
    The parent directory of ``parquet_path`` is created if it is missing.
    The column names and the number of rows saved are printed to stdout.

    Args:
        csv_paths: CSV files to combine.
        parquet_path: Destination parquet file.
    """
    # Regional files can infer different types for the same column (e.g.
    # ruc21ind is String in most but Int64 in YH). Pinning these columns
    # to String keeps the schemas compatible when concatenating.
    string_overrides = dict.fromkeys(
        ("ruc21ind", "oac21ind", "imd19ind"), pl.String
    )

    scans = []
    for csv_path in csv_paths:
        scans.append(
            pl.scan_csv(
                csv_path,
                try_parse_dates=True,
                schema_overrides=string_overrides,
            )
        )
    combined = pl.concat(scans)

    column_names = combined.collect_schema().names()
    print(f"Columns: {column_names}")

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.sink_parquet(parquet_path, compression="zstd")

    # Re-read the written file to report how many rows were saved.
    row_count = pl.scan_parquet(parquet_path).select(pl.len()).collect().item()
    print(f"Saved {row_count:,} rows to {parquet_path}")
def main() -> None:
    """Command-line entry point: download the lookup and save it as parquet.

    Parses the required ``--output`` argument, downloads the archive at
    ``URL`` into a temporary directory, extracts it, locates the CSVs and
    converts them to a parquet file at the requested output path. The
    temporary directory is cleaned up when the ``with`` block exits.
    """
    cli = argparse.ArgumentParser(
        description="Download National Statistics UPRN Lookup"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        archive = workdir / "uprn_lookup.zip"
        unpacked = workdir / "uprn_extracted"

        download(URL, archive, timeout=600)
        extract_zip(archive, unpacked)
        convert_to_parquet(find_csvs(unpacked), output_path)


# Run the downloader only when executed as a script, not on import.
if __name__ == "__main__":
    main()