idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/pipeline/transform/postcode_boundaries/uprn.py
+++ b/pipeline/transform/postcode_boundaries/uprn.py
@@ -79,13 +79,42 @@ def load_uprns(
    )

    if mapping is not None and mapping.height > 0:
-        uprns = (
-            uprns.join(
-                mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
+        # Remap terminated postcodes to their nearest active successor. The
+        # successor generally lives in a DIFFERENT OA (and at different grid
+        # coordinates), so the remapped point must adopt the successor's
+        # authoritative OA/coords — keeping the terminated postcode's original
+        # OA would seed the successor into an OA it doesn't belong to, splitting
+        # its boundary across OAs. Genuine (non-remapped) UPRN rows keep their
+        # own OA, since a live postcode can legitimately span several OAs.
+        uprns = uprns.join(
+            mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
+        ).with_columns(pl.col("new_postcode").is_not_null().alias("_remapped"))
+        if active_postcode_points is not None:
+            successor_oa = active_postcode_points.rename(
+                {
+                    "PCDS": "new_postcode",
+                    "GRIDGB1E": "_succ_e",
+                    "GRIDGB1N": "_succ_n",
+                    "OA21CD": "_succ_oa",
+                }
            )
-            .with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
-            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
-        )
+            uprns = uprns.join(successor_oa, on="new_postcode", how="left").with_columns(
+                pl.when("_remapped")
+                .then(pl.col("_succ_e"))
+                .otherwise(pl.col("GRIDGB1E"))
+                .alias("GRIDGB1E"),
+                pl.when("_remapped")
+                .then(pl.col("_succ_n"))
+                .otherwise(pl.col("GRIDGB1N"))
+                .alias("GRIDGB1N"),
+                pl.when("_remapped")
+                .then(pl.col("_succ_oa"))
+                .otherwise(pl.col("OA21CD"))
+                .alias("OA21CD"),
+            )
+        uprns = uprns.with_columns(
+            pl.coalesce("new_postcode", "PCDS").alias("PCDS")
+        ).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")

    if active_postcode_points is not None:
        active_postcodes = active_postcode_points.select("PCDS").unique()
@@ -149,3 +178,37 @@ def get_oa_uprns(
    )
    postcodes = sub["PCDS"].to_list()
    return points, postcodes
+
+
+def extract_uprn_arrays(df: pl.DataFrame):
+    """Convert the UPRN DataFrame to fork-shareable numpy/Arrow arrays.
+
+    Returns ``(east, north, postcodes)``: the ``GRIDGB1E`` and ``GRIDGB1N``
+    columns as contiguous float64 ndarrays and ``PCDS`` as one unchunked
+    pyarrow Array, all in ``df`` row order, so a single row range picks the
+    same rows from each (see :func:`get_oa_uprns_arrays`) without calling
+    polars. Stated intent: avoid a fork-after-threads deadlock in polars'
+    rayon pool and share the buffers copy-on-write across forked workers.
+    """
+    import pyarrow as pa  # function-scope import; only used for the check below
+
+    east = np.ascontiguousarray(df["GRIDGB1E"].to_numpy(), dtype=np.float64)
+    north = np.ascontiguousarray(df["GRIDGB1N"].to_numpy(), dtype=np.float64)
+    postcodes = df["PCDS"].to_arrow()
+    if isinstance(postcodes, pa.ChunkedArray):  # normalise to a single Array
+        postcodes = postcodes.combine_chunks()
+    return east, north, postcodes
+
+
+def get_oa_uprns_arrays(
+    east: np.ndarray,  # eastings, e.g. from extract_uprn_arrays
+    north: np.ndarray,  # northings, same length and row order as ``east``
+    postcodes,  # array exposing .slice(offset, length) and .to_pylist()
+    offsets: dict[str, tuple[int, int]],  # OA code -> (start, stop) row range
+    oa_code: str,  # looked up directly in ``offsets``; a missing code raises KeyError
+) -> tuple[np.ndarray, list[str]]:
+    """Return ``(points, postcodes)`` for one OA, like :func:`get_oa_uprns`, by
+    slicing the :func:`extract_uprn_arrays` output without polars (for workers)."""
+    s, e = offsets[oa_code]  # half-open row range [s, e)
+    points = np.column_stack([east[s:e], north[s:e]])  # one (east, north) row per point
+    return points, postcodes.slice(s, e - s).to_pylist()  # slice() takes a length, not a stop