77 lines
2.5 KiB
Python
77 lines
2.5 KiB
Python
"""Download National Statistics UPRN Lookup (December 2025, Epoch 123).
|
|
|
|
Maps Unique Property Reference Numbers (UPRNs) to administrative and
|
|
statistical geographies (wards, output areas, LSOAs, etc.) across GB.
|
|
|
|
Source: https://geoportal.statistics.gov.uk/datasets/ons::national-statistics-uprn-lookup-december-2025-epoch-123
|
|
License: Contains Royal Mail, Ordnance Survey, and ONS data (OGL v3.0)
|
|
"""
|
|
|
|
import argparse
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.utils import download, extract_zip
|
|
|
|
# ArcGIS item data endpoint for the NSUL release named in the module docstring.
# main() saves the response as a zip file and extracts it before conversion.
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
|
|
|
|
|
|
def find_csvs(extract_path: Path) -> list[Path]:
    """Locate the CSV files to load from the extracted archive.

    Searches ``extract_path`` recursively for files matching ``NSUL_*.csv``.
    If none match, every ``*.csv`` found under the directory is used instead.
    The selected file names are listed on stdout.

    Args:
        extract_path: Directory the archive was extracted into.

    Returns:
        The selected paths, sorted.

    Raises:
        FileNotFoundError: If no CSV file exists anywhere under
            ``extract_path``.
    """
    # Patterns are tried from most to least specific; the first one that
    # yields any file wins.
    for pattern in ("NSUL_*.csv", "*.csv"):
        matches = sorted(extract_path.rglob(pattern))
        if matches:
            break
    else:
        raise FileNotFoundError(f"No CSV files found in {extract_path}")

    print(f"Found {len(matches)} CSV(s):")
    for csv_file in matches:
        print(f" {csv_file.name}")
    return matches
|
|
|
|
|
|
def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
    """Combine the given CSV files into a single zstd-compressed parquet file.

    Each CSV is scanned lazily, the scans are concatenated, and the result is
    streamed to ``parquet_path``. The parent directory of ``parquet_path`` is
    created if missing. The column names and the final row count are printed.

    Args:
        csv_paths: CSV files to concatenate, in the order given.
        parquet_path: Destination parquet file.
    """
    # Type inference runs per file, so the same column can come out with
    # different dtypes in different regional files (e.g. ruc21ind is String
    # in most but Int64 in YH). Pinning these columns to String keeps the
    # schemas compatible for concatenation.
    string_overrides = dict.fromkeys(("ruc21ind", "oac21ind", "imd19ind"), pl.String)

    lazy_frames = []
    for csv_path in csv_paths:
        lazy_frames.append(
            pl.scan_csv(
                csv_path,
                try_parse_dates=True,
                schema_overrides=string_overrides,
            )
        )
    combined = pl.concat(lazy_frames)

    column_names = combined.collect_schema().names()
    print(f"Columns: {column_names}")

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.sink_parquet(parquet_path, compression="zstd")

    # Count rows from the file just written rather than from the lazy query.
    row_count_query = pl.scan_parquet(parquet_path).select(pl.len())
    n = row_count_query.collect().item()
    print(f"Saved {n:,} rows to {parquet_path}")
|
|
|
|
|
|
def main() -> None:
    """Download the archive at ``URL`` and convert its CSVs to parquet.

    Reads the required ``--output`` command-line argument, downloads and
    extracts the archive inside a temporary directory, and writes the combined
    parquet file to the requested output path. The temporary directory is
    discarded when the ``with`` block exits.
    """
    cli = argparse.ArgumentParser(
        description="Download National Statistics UPRN Lookup"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory() as cache_dir:
        workspace = Path(cache_dir)
        archive = workspace / "uprn_lookup.zip"
        unpacked = workspace / "uprn_extracted"

        download(URL, archive, timeout=600)
        extract_zip(archive, unpacked)

        convert_to_parquet(find_csvs(unpacked), output_path)


if __name__ == "__main__":
    main()
|