This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -79,13 +79,42 @@ def load_uprns(
)
if mapping is not None and mapping.height > 0:
uprns = (
uprns.join(
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
# Remap terminated postcodes to their nearest active successor. The
# successor generally lives in a DIFFERENT OA (and at different grid
# coordinates), so the remapped point must adopt the successor's
# authoritative OA/coords — keeping the terminated postcode's original
# OA would seed the successor into an OA it doesn't belong to, splitting
# its boundary across OAs. Genuine (non-remapped) UPRN rows keep their
# own OA, since a live postcode can legitimately span several OAs.
uprns = uprns.join(
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
).with_columns(pl.col("new_postcode").is_not_null().alias("_remapped"))
if active_postcode_points is not None:
successor_oa = active_postcode_points.rename(
{
"PCDS": "new_postcode",
"GRIDGB1E": "_succ_e",
"GRIDGB1N": "_succ_n",
"OA21CD": "_succ_oa",
}
)
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
)
uprns = uprns.join(successor_oa, on="new_postcode", how="left").with_columns(
pl.when("_remapped")
.then(pl.col("_succ_e"))
.otherwise(pl.col("GRIDGB1E"))
.alias("GRIDGB1E"),
pl.when("_remapped")
.then(pl.col("_succ_n"))
.otherwise(pl.col("GRIDGB1N"))
.alias("GRIDGB1N"),
pl.when("_remapped")
.then(pl.col("_succ_oa"))
.otherwise(pl.col("OA21CD"))
.alias("OA21CD"),
)
uprns = uprns.with_columns(
pl.coalesce("new_postcode", "PCDS").alias("PCDS")
).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
if active_postcode_points is not None:
active_postcodes = active_postcode_points.select("PCDS").unique()
@ -149,3 +178,37 @@ def get_oa_uprns(
)
postcodes = sub["PCDS"].to_list()
return points, postcodes
def extract_uprn_arrays(df: pl.DataFrame):
    """Split the UPRN DataFrame into plain numpy/Arrow buffers.

    Returns a ``(east, north, postcodes)`` triple:

    * ``east`` / ``north`` -- C-contiguous float64 ndarrays built from the
      ``GRIDGB1E`` and ``GRIDGB1N`` columns;
    * ``postcodes`` -- the ``PCDS`` column as a single (non-chunked) pyarrow
      Array.

    The intent is that multiprocessing workers slice these per OA through
    :func:`get_oa_uprns_arrays` and never call into polars themselves, which
    sidesteps the fork-after-threads deadlock hazard of polars' rayon pool.
    Because the data lives in a handful of numpy/Arrow buffers instead of
    millions of Python objects, ``fork`` can share it copy-on-write rather
    than duplicating ~1GB per worker.
    """
    import pyarrow as pa

    # Same conversion for both coordinate columns, easting first.
    easting, northing = (
        np.ascontiguousarray(df[column].to_numpy(), dtype=np.float64)
        for column in ("GRIDGB1E", "GRIDGB1N")
    )

    pcds = df["PCDS"].to_arrow()
    # Collapse a chunked result into one Array so callers can slice it with
    # plain row offsets.
    postcodes = (
        pcds.combine_chunks() if isinstance(pcds, pa.ChunkedArray) else pcds
    )
    return easting, northing, postcodes
def get_oa_uprns_arrays(
    east: np.ndarray,
    north: np.ndarray,
    postcodes,
    offsets: dict[str, tuple[int, int]],
    oa_code: str,
) -> tuple[np.ndarray, list[str]]:
    """Polars-free counterpart of :func:`get_oa_uprns`.

    Works on the arrays produced by :func:`extract_uprn_arrays` and touches
    no polars objects, so it is suitable for calling inside worker processes.

    ``offsets`` maps an OA code to its half-open ``(start, stop)`` row range
    within ``east`` / ``north`` / ``postcodes``. A missing ``oa_code`` raises
    ``KeyError``.

    Returns ``(points, postcodes)``: an ``(n, 2)`` array of easting/northing
    pairs and the matching postcodes as a Python list.
    """
    start, stop = offsets[oa_code]
    rows = slice(start, stop)
    coords = np.column_stack((east[rows], north[rows]))
    # Arrow's ``slice`` takes (offset, length), not (start, stop).
    oa_postcodes = postcodes.slice(start, stop - start).to_pylist()
    return coords, oa_postcodes