from pathlib import Path

import numpy as np
import polars as pl

from .memory import release_memory


def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
    """Load UPRNs as an OA-sorted polars DataFrame with an OA offset lookup.

    Rows are kept only when the OA code does not start with "S", both grid
    coordinates are present, and the stripped postcode is non-empty. The
    result is sorted by ``OA21CD`` so every OA occupies one contiguous run
    of rows; the ``OA21CD`` column itself is dropped before returning.

    Args:
        uprn_path: Parquet file containing at least the columns
            ``GRIDGB1E``, ``GRIDGB1N``, ``PCDS`` and ``OA21CD``.

    Returns:
        ``(df, offsets)`` where ``df`` has the columns ``GRIDGB1E``,
        ``GRIDGB1N`` and ``PCDS``, and ``offsets[oa_code] = (start_row,
        end_row)`` is the half-open row range of that OA within ``df``.

    Peak ~5GB during sort, steady state ~1.5GB (Arrow columnar with compact
    strings).
    """
    import tempfile

    print("Loading UPRN lookup...")

    # Sort via a streaming sink to avoid polars doubling memory during an
    # in-memory sort. Only the *name* of the temp file is needed: the handle
    # is closed straight away so polars can open the path itself on every
    # platform.
    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
        tmp_path = Path(tmp.name)

    try:
        (
            pl.scan_parquet(uprn_path)
            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
            .filter(~pl.col("OA21CD").str.starts_with("S"))
            .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
            .with_columns(pl.col("PCDS").str.strip_chars())
            .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
            .sort("OA21CD")
            .sink_parquet(tmp_path)
        )
        release_memory()

        # Read the sorted data — only one copy in memory (~2GB)
        df = pl.read_parquet(tmp_path)
    finally:
        # Always remove the temp file, including when the sink or the read
        # raises; otherwise a failed run leaves a large parquet file behind.
        tmp_path.unlink(missing_ok=True)

    n = len(df)
    print(f" Loaded {n:,} UPRNs (England & Wales)")

    # Compute OA group offsets using polars (avoids 37M Python string
    # creation): keep only the first row of each run of equal OA codes.
    oa_col = pl.col("OA21CD")
    prev_oa = oa_col.shift(1)
    boundary_df = (
        df.lazy()
        .with_row_index("_i")
        .filter(prev_oa.is_null() | (oa_col != prev_oa))
        .select("_i", "OA21CD")
        .collect()
    )
    starts_list = boundary_df["_i"].to_list()
    oa_list = boundary_df["OA21CD"].to_list()
    del boundary_df

    # Each OA's run ends where the next one starts; the last run ends at n.
    ends_list = starts_list[1:] + [n]
    offsets: dict[str, tuple[int, int]] = {
        oa: (start, end)
        for oa, start, end in zip(oa_list, starts_list, ends_list)
    }
    del starts_list, ends_list, oa_list

    # Drop OA column (no longer needed) to save ~400MB
    df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
    release_memory()

    print(f" Grouped into {len(offsets)} OAs")
    return df, offsets


def get_oa_uprns(
    df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
) -> tuple[np.ndarray, list[str]]:
    """Get UPRN coordinates and postcodes for a single OA.

    Args:
        df: DataFrame with ``GRIDGB1E``, ``GRIDGB1N`` and ``PCDS`` columns,
            as returned by ``load_uprns``.
        offsets: Mapping of OA code to its ``(start_row, end_row)`` half-open
            row range in ``df``.
        oa_code: OA code to look up.

    Returns:
        ``(points_nx2, postcodes_list)``: an array with one row per UPRN and
        the columns (easting, northing), plus the matching postcodes in the
        same row order.

    Raises:
        KeyError: If ``oa_code`` is not present in ``offsets``.
    """
    s, e = offsets[oa_code]
    sub = df[s:e]
    points = np.column_stack(
        [
            sub["GRIDGB1E"].to_numpy(),
            sub["GRIDGB1N"].to_numpy(),
        ]
    )
    postcodes = sub["PCDS"].to_list()
    return points, postcodes