Add postcode boundary calculation

2026-02-07 21:22:53 +00:00 · 2026-02-07 21:22:53 +00:00 · f5e6894c0f
commit f5e6894c0f
parent f9bd218a3e
14 changed files with 1384 additions and 717 deletions
--- a/pipeline/transform/postcode_boundaries/uprn.py
+++ b/pipeline/transform/postcode_boundaries/uprn.py
@ -0,0 +1,84 @@
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+
+from .memory import release_memory
+
+
+def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
+    """Load UPRNs as a sorted polars DataFrame with OA offset lookup.
+
+    Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
+    Peak ~5GB during sort, steady state ~1.5GB (Arrow columnar with compact strings).
+    """
+    import tempfile
+
+    print("Loading UPRN lookup...")
+
+    # Sort via streaming sink to avoid polars doubling memory during in-memory sort
+    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+        tmp_path = Path(tmp.name)
+    (
+        pl.scan_parquet(uprn_path)
+        .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
+        .filter(~pl.col("OA21CD").str.starts_with("S"))
+        .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
+        .with_columns(pl.col("PCDS").str.strip_chars())
+        .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
+        .sort("OA21CD")
+        .sink_parquet(tmp_path)
+    )
+    release_memory()
+
+    # Read the sorted data — only one copy in memory (~2GB)
+    df = pl.read_parquet(tmp_path)
+    tmp_path.unlink()
+    n = len(df)
+    print(f"  Loaded {n:,} UPRNs (England & Wales)")
+
+    # Compute OA group offsets using polars (avoids 37M Python string creation)
+    boundary_df = (
+        df.lazy()
+        .with_row_index("_i")
+        .filter(
+            pl.col("OA21CD").shift(1).is_null()
+            | (pl.col("OA21CD") != pl.col("OA21CD").shift(1))
+        )
+        .select("_i", "OA21CD")
+        .collect()
+    )
+    starts_list = boundary_df["_i"].to_list()
+    oa_list = boundary_df["OA21CD"].to_list()
+    del boundary_df
+    offsets: dict[str, tuple[int, int]] = {}
+    for j in range(len(starts_list)):
+        end = starts_list[j + 1] if j + 1 < len(starts_list) else n
+        offsets[oa_list[j]] = (starts_list[j], end)
+    del starts_list, oa_list
+
+    # Drop OA column (no longer needed) to save ~400MB
+    df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
+    release_memory()
+
+    print(f"  Grouped into {len(offsets)} OAs")
+    return df, offsets
+
+
+def get_oa_uprns(
+    df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
+) -> tuple[np.ndarray, list[str]]:
+    """Get UPRN coordinates and postcodes for a single OA.
+
+    Returns (points_nx2, postcodes_list).
+    """
+    s, e = offsets[oa_code]
+    sub = df[s:e]
+    points = np.column_stack(
+        [
+            sub["GRIDGB1E"].to_numpy(),
+            sub["GRIDGB1N"].to_numpy(),
+        ]
+    )
+    postcodes = sub["PCDS"].to_list()
+    return points, postcodes