perfect-postcode/pipeline/transform/postcode_boundaries/uprn.py
Andras Schmelczer 6cc7288126
Some checks failed
CI / Check (push) Has been cancelled
Build and publish Docker image / build-and-push (push) Has been cancelled
All good
2026-05-18 21:20:10 +01:00

88 lines
2.7 KiB
Python

from pathlib import Path
import numpy as np
import polars as pl
from pipeline.local_temp import local_tmp_dir
from .memory import release_memory
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
    """Load UPRNs as an OA-sorted polars DataFrame plus an OA offset lookup.

    The parquet file at ``uprn_path`` is scanned lazily, reduced to the
    columns GRIDGB1E, GRIDGB1N, PCDS and OA21CD, filtered, sorted by OA21CD
    and streamed to a temporary parquet file, which is then read back and
    deleted (also when writing or reading fails). Rows are dropped when
    OA21CD starts with "S", when either coordinate is null, or when PCDS is
    null or empty after stripping.

    Returns (df, offsets) where df holds GRIDGB1E, GRIDGB1N and PCDS in
    OA21CD order, and offsets[oa_code] = (start_row, end_row) is the
    half-open row range of that OA within df.

    Memory (original author's estimate): peak ~5GB during sort, steady state
    ~1.5GB (Arrow columnar with compact strings).
    """
    import tempfile

    print("Loading UPRN lookup...")

    # Reserve a unique path only; the handle is closed straight away so that
    # polars is the sole writer of the file.
    with tempfile.NamedTemporaryFile(
        suffix=".parquet", delete=False, dir=local_tmp_dir()
    ) as tmp:
        tmp_path = Path(tmp.name)

    try:
        # Sort via streaming sink to avoid polars doubling memory during an
        # in-memory sort.
        (
            pl.scan_parquet(uprn_path)
            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
            .filter(~pl.col("OA21CD").str.starts_with("S"))
            .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
            .with_columns(pl.col("PCDS").str.strip_chars())
            .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
            .sort("OA21CD")
            .sink_parquet(tmp_path)
        )
        release_memory()
        # Read the sorted data — only one copy in memory (~2GB)
        df = pl.read_parquet(tmp_path)
    finally:
        # The file was created with delete=False, so it must be removed here
        # on both the success and the failure path.
        tmp_path.unlink(missing_ok=True)

    n = len(df)
    print(f" Loaded {n:,} UPRNs (England & Wales)")

    # Compute OA group offsets using polars (avoids 37M Python string creation):
    # keep only the rows where OA21CD differs from the previous row, i.e. the
    # first row of every OA run in the sorted frame.
    boundary_df = (
        df.lazy()
        .with_row_index("_i")
        .filter(
            pl.col("OA21CD").shift(1).is_null()
            | (pl.col("OA21CD") != pl.col("OA21CD").shift(1))
        )
        .select("_i", "OA21CD")
        .collect()
    )
    starts_list = boundary_df["_i"].to_list()
    oa_list = boundary_df["OA21CD"].to_list()
    del boundary_df

    # Each run ends where the next one starts; the last run ends at row n.
    ends_list = [*starts_list[1:], n]
    offsets: dict[str, tuple[int, int]] = {
        oa: (start, end) for oa, start, end in zip(oa_list, starts_list, ends_list)
    }
    del starts_list, ends_list, oa_list

    # Drop OA column (no longer needed) to save ~400MB
    df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
    release_memory()
    print(f" Grouped into {len(offsets)} OAs")
    return df, offsets
def get_oa_uprns(
    df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
) -> tuple[np.ndarray, list[str]]:
    """Return the coordinates and postcodes of the rows belonging to one OA.

    ``offsets[oa_code]`` supplies the (start, end) row range to slice out of
    ``df``; an ``oa_code`` missing from ``offsets`` raises ``KeyError``.

    Returns (points_nx2, postcodes_list): an array with the GRIDGB1E values
    in column 0 and the GRIDGB1N values in column 1, and the PCDS values as a
    Python list in the same row order.
    """
    first_row, stop_row = offsets[oa_code]
    oa_rows = df[first_row:stop_row]

    col_e = oa_rows["GRIDGB1E"].to_numpy()
    col_n = oa_rows["GRIDGB1N"].to_numpy()
    coords = np.column_stack((col_e, col_n))

    return coords, oa_rows["PCDS"].to_list()