# perfect-postcode/pipeline/transform/postcode_boundaries/uprn.py

from pathlib import Path

import numpy as np
import polars as pl

from pipeline.local_temp import local_tmp_dir

from .memory import release_memory


def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
    """Load UPRNs as an OA-sorted polars DataFrame with an OA offset lookup.

    Rows are dropped when their OA21CD starts with "S" (the log message
    describes what remains as England & Wales), when either grid coordinate
    is null, or when the whitespace-stripped postcode is null or empty.
    The remaining rows are sorted by OA21CD so each OA occupies one
    contiguous row range.

    Memory figures noted by the original author: peak ~5GB during the sort,
    steady state ~1.5GB (Arrow columnar with compact strings).

    Args:
        uprn_path: Parquet file providing at least the columns GRIDGB1E,
            GRIDGB1N, PCDS and OA21CD.

    Returns:
        (df, offsets) where df holds the columns GRIDGB1E, GRIDGB1N and PCDS
        in OA-sorted order, and offsets[oa_code] = (start_row, end_row) is
        the half-open row range of that OA within df.
    """
    import tempfile

    print("Loading UPRN lookup...")

    # Sort via streaming sink to avoid polars doubling memory during an
    # in-memory sort. The handle is closed immediately (delete=False) so that
    # polars can write to the path itself; we own the cleanup below.
    with tempfile.NamedTemporaryFile(
        suffix=".parquet", delete=False, dir=local_tmp_dir()
    ) as tmp:
        tmp_path = Path(tmp.name)

    try:
        (
            pl.scan_parquet(uprn_path)
            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
            .filter(~pl.col("OA21CD").str.starts_with("S"))
            .filter(
                pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null()
            )
            .with_columns(pl.col("PCDS").str.strip_chars())
            .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
            .sort("OA21CD")
            .sink_parquet(tmp_path)
        )
        release_memory()

        # Read the sorted data back so only one copy is held in memory.
        df = pl.read_parquet(tmp_path)
    finally:
        # Always remove the temp file, including when the sink or the read
        # fails; otherwise a large parquet file is leaked on every error.
        tmp_path.unlink(missing_ok=True)

    n = len(df)
    print(f"  Loaded {n:,} UPRNs (England & Wales)")

    # Compute OA group offsets inside polars so that only one Python string
    # per OA (not one per row) is ever created. A row opens a new group when
    # it is the first row or its OA differs from the previous row's OA.
    boundary_df = (
        df.lazy()
        .with_row_index("_i")
        .filter(
            pl.col("OA21CD").shift(1).is_null()
            | (pl.col("OA21CD") != pl.col("OA21CD").shift(1))
        )
        .select("_i", "OA21CD")
        .collect()
    )
    starts_list = boundary_df["_i"].to_list()
    oa_list = boundary_df["OA21CD"].to_list()
    del boundary_df

    # Each group ends where the next one starts; the last group ends at n.
    ends_list = starts_list[1:] + [n]
    offsets: dict[str, tuple[int, int]] = dict(
        zip(oa_list, zip(starts_list, ends_list))
    )
    del starts_list, ends_list, oa_list

    # Drop the OA column (no longer needed now that offsets exist) to save
    # memory.
    df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
    release_memory()

    print(f"  Grouped into {len(offsets)} OAs")
    return df, offsets


def get_oa_uprns(
    df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
) -> tuple[np.ndarray, list[str]]:
    """Return the UPRN coordinates and postcodes belonging to one OA.

    The row range for the OA is looked up in ``offsets`` and used to slice
    ``df``; a missing ``oa_code`` raises ``KeyError`` from that lookup.

    Returns (points_nx2, postcodes_list): the GRIDGB1E and GRIDGB1N columns
    stacked side by side (easting first), and the PCDS values as a list.
    """
    start, stop = offsets[oa_code]
    oa_rows = df[start:stop]

    # Column order matters to callers: easting first, then northing.
    eastings = oa_rows["GRIDGB1E"].to_numpy()
    northings = oa_rows["GRIDGB1N"].to_numpy()
    coords = np.column_stack((eastings, northings))

    return coords, oa_rows["PCDS"].to_list()