scraping and data

2026-05-31 15:36:33 +01:00 · 2026-05-31 15:36:33 +01:00 · 8688b7475e
commit 8688b7475e
parent d98819b569
43 changed files with 4920 additions and 531 deletions
--- a/pipeline/transform/postcode_boundaries/uprn.py
+++ b/pipeline/transform/postcode_boundaries/uprn.py
@ -4,11 +4,18 @@ import numpy as np
 import polars as pl

 from pipeline.local_temp import local_tmp_dir
+from pipeline.utils.postcode_mapping import build_postcode_mapping

 from .memory import release_memory


-def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
+def _canonical_postcode_expr(name: str) -> pl.Expr:  # expression that normalises the postcode column `name`
+    return pl.col(name).str.strip_chars().str.to_uppercase()  # trim surrounding whitespace (strip_chars default), then upper-case; applied to PCDS and both mapping columns so the join keys compare equal
+
+
+def load_uprns(
+    uprn_path: Path, arcgis_path: Path | None = None
+) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
    """Load UPRNs as a sorted polars DataFrame with OA offset lookup.

    Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
@ -17,29 +24,46 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
    import tempfile

    print("Loading UPRN lookup...")
+    mapping = None
+    if arcgis_path is not None:
+        mapping = (
+            build_postcode_mapping(arcgis_path)
+            .with_columns(
+                _canonical_postcode_expr("old_postcode").alias("old_postcode"),
+                _canonical_postcode_expr("new_postcode").alias("new_postcode"),
+            )
+            .unique("old_postcode")
+        )

    # Sort via streaming sink to avoid polars doubling memory during in-memory sort
    with tempfile.NamedTemporaryFile(
        suffix=".parquet", delete=False, dir=local_tmp_dir()
    ) as tmp:
        tmp_path = Path(tmp.name)
-    (
+    uprns = (
        pl.scan_parquet(uprn_path)
        .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
-        .filter(~pl.col("OA21CD").str.starts_with("S"))
+        .filter(pl.col("OA21CD").str.starts_with("E"))
        .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
-        .with_columns(pl.col("PCDS").str.strip_chars())
+        .with_columns(_canonical_postcode_expr("PCDS").alias("PCDS"))
        .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
-        .sort("OA21CD")
-        .sink_parquet(tmp_path)
    )
+
+    if mapping is not None and mapping.height > 0:
+        uprns = (
+            uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
+            .with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
+            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
+        )
+
+    uprns.sort("OA21CD").sink_parquet(tmp_path)
    release_memory()

    # Read the sorted data — only one copy in memory (~2GB)
    df = pl.read_parquet(tmp_path)
    tmp_path.unlink()
    n = len(df)
-    print(f"  Loaded {n:,} UPRNs (England & Wales)")
+    print(f"  Loaded {n:,} UPRNs (England)")

    # Compute OA group offsets using polars (avoids 37M Python string creation)
    boundary_df = (