# perfect-postcode/pipeline/download/uprn_lookup.py
# 2026-06-02 20:14:32 +01:00
#
# 78 lines
# 2.8 KiB
# Python

"""Download National Statistics UPRN Lookup (December 2025, Epoch 123).
Maps Unique Property Reference Numbers (UPRNs) to administrative and
statistical geographies (wards, output areas, LSOAs, etc.) across GB.
Source: https://geoportal.statistics.gov.uk/datasets/ons::national-statistics-uprn-lookup-december-2025-epoch-123
License: Contains Royal Mail, Ordnance Survey, and ONS data (OGL v3.0)
"""
import argparse
import tempfile
from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import code_col_overrides, download, extract_zip
# ArcGIS "content item data" endpoint; main() saves the response as a zip
# archive and extracts it. NOTE(review): the item id presumably pins the
# December 2025 (Epoch 123) release named in the module docstring -- confirm
# whenever the release is bumped.
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
def find_csvs(extract_path: Path) -> list[Path]:
    """Locate the CSV files to ingest beneath ``extract_path``.

    Files named ``NSUL_*.csv`` anywhere under the directory are preferred;
    when there are none, every ``*.csv`` found is used instead. The sorted
    result is printed (a count line, then one file name per line) and
    returned.

    Raises:
        FileNotFoundError: if no CSV file exists under ``extract_path``.
    """
    # An empty list is falsy, so ``or`` only falls through to the broader
    # pattern when the NSUL-specific search found nothing.
    matches = sorted(extract_path.rglob("NSUL_*.csv")) or sorted(
        extract_path.rglob("*.csv")
    )
    if not matches:
        raise FileNotFoundError(f"No CSV files found in {extract_path}")

    print(f"Found {len(matches)} CSV(s):")
    for csv_file in matches:
        print(f" {csv_file.name}")
    return matches
def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
    """Lazily concatenate the given CSVs and write them to one parquet file.

    Prints the combined column names before writing and the row count of the
    written file afterwards.

    Args:
        csv_paths: CSV files to combine, in order. The header of the first
            file decides which columns receive a schema override.
        parquet_path: Destination file; missing parent directories are
            created. Written with zstd compression.

    Raises:
        ValueError: if ``csv_paths`` is empty.
    """
    # Fail with a clear message instead of an IndexError on ``csv_paths[0]``,
    # and before any directory is created on disk.
    if not csv_paths:
        raise ValueError("csv_paths must contain at least one CSV file")

    # Some regional files infer different types for the same column (e.g.
    # ruc21ind is String in most but Int64 in YH), and string codes like "UN1"
    # appear deep in the data. Read all classification-index code columns as
    # String to avoid schema mismatches. NSUL renames the year suffixes each
    # release and polars silently ignores overrides for missing columns, so
    # match on the suffix-free stem (from the header) rather than hard-coding.
    header_names = pl.scan_csv(csv_paths[0]).collect_schema().names()
    code_cols = code_col_overrides(header_names)

    # Everything stays lazy until sink_parquet streams the result to disk.
    lazy_frame = pl.concat(
        [
            pl.scan_csv(p, try_parse_dates=True, schema_overrides=code_cols)
            for p in csv_paths
        ]
    )
    print(f"Columns: {lazy_frame.collect_schema().names()}")

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    lazy_frame.sink_parquet(parquet_path, compression="zstd")

    # Count rows from the file just written rather than re-reading the CSVs.
    n = pl.scan_parquet(parquet_path).select(pl.len()).collect().item()
    print(f"Saved {n:,} rows to {parquet_path}")
def main() -> None:
    """Command-line entry point: fetch the archive and write the parquet file.

    Parses the required ``--output`` path, then, inside a temporary directory
    created under ``local_tmp_dir()``, downloads ``URL``, extracts the zip,
    finds the CSVs and converts them to parquet at the requested path. The
    temporary directory is removed when the ``with`` block exits.
    """
    cli = argparse.ArgumentParser(
        description="Download National Statistics UPRN Lookup"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
        work_dir = Path(scratch)
        archive = work_dir / "uprn_lookup.zip"
        unpacked = work_dir / "uprn_extracted"

        download(URL, archive, timeout=600)
        extract_zip(archive, unpacked)
        convert_to_parquet(find_csvs(unpacked), output_path)
# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()