idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -79,13 +79,42 @@ def load_uprns(
|
|||
)
|
||||
|
||||
if mapping is not None and mapping.height > 0:
|
||||
uprns = (
|
||||
uprns.join(
|
||||
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
|
||||
# Remap terminated postcodes to their nearest active successor. The
|
||||
# successor generally lives in a DIFFERENT OA (and at different grid
|
||||
# coordinates), so the remapped point must adopt the successor's
|
||||
# authoritative OA/coords — keeping the terminated postcode's original
|
||||
# OA would seed the successor into an OA it doesn't belong to, splitting
|
||||
# its boundary across OAs. Genuine (non-remapped) UPRN rows keep their
|
||||
# own OA, since a live postcode can legitimately span several OAs.
|
||||
uprns = uprns.join(
|
||||
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
|
||||
).with_columns(pl.col("new_postcode").is_not_null().alias("_remapped"))
|
||||
if active_postcode_points is not None:
|
||||
successor_oa = active_postcode_points.rename(
|
||||
{
|
||||
"PCDS": "new_postcode",
|
||||
"GRIDGB1E": "_succ_e",
|
||||
"GRIDGB1N": "_succ_n",
|
||||
"OA21CD": "_succ_oa",
|
||||
}
|
||||
)
|
||||
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
|
||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
)
|
||||
uprns = uprns.join(successor_oa, on="new_postcode", how="left").with_columns(
|
||||
pl.when("_remapped")
|
||||
.then(pl.col("_succ_e"))
|
||||
.otherwise(pl.col("GRIDGB1E"))
|
||||
.alias("GRIDGB1E"),
|
||||
pl.when("_remapped")
|
||||
.then(pl.col("_succ_n"))
|
||||
.otherwise(pl.col("GRIDGB1N"))
|
||||
.alias("GRIDGB1N"),
|
||||
pl.when("_remapped")
|
||||
.then(pl.col("_succ_oa"))
|
||||
.otherwise(pl.col("OA21CD"))
|
||||
.alias("OA21CD"),
|
||||
)
|
||||
uprns = uprns.with_columns(
|
||||
pl.coalesce("new_postcode", "PCDS").alias("PCDS")
|
||||
).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
|
||||
if active_postcode_points is not None:
|
||||
active_postcodes = active_postcode_points.select("PCDS").unique()
|
||||
|
|
@ -149,3 +178,37 @@ def get_oa_uprns(
|
|||
)
|
||||
postcodes = sub["PCDS"].to_list()
|
||||
return points, postcodes
|
||||
|
||||
|
||||
def extract_uprn_arrays(df: pl.DataFrame):
    """Split the UPRN DataFrame into plain numpy/Arrow buffers.

    The result is the triple ``(east, north, postcodes)``:

    * ``east`` / ``north`` -- contiguous float64 ndarrays built from the
      ``GRIDGB1E`` and ``GRIDGB1N`` columns;
    * ``postcodes`` -- the ``PCDS`` column as a single contiguous pyarrow
      Array.

    Multiprocessing workers are meant to slice these per OA through
    :func:`get_oa_uprns_arrays` **without touching polars**, which sidesteps
    the fork-after-threads deadlock hazard of polars' rayon pool. Because the
    data lives in plain numpy/Arrow buffers (not millions of Python objects),
    ``fork`` shares it copy-on-write instead of duplicating ~1GB per worker.
    """
    import pyarrow as pa

    def _float64_column(name: str) -> np.ndarray:
        # Contiguous float64 ndarray for one coordinate column.
        return np.ascontiguousarray(df[name].to_numpy(), dtype=np.float64)

    eastings = _float64_column("GRIDGB1E")
    northings = _float64_column("GRIDGB1N")

    pcds = df["PCDS"].to_arrow()
    # Collapse a chunked result into a single Array so it can be sliced as
    # one contiguous buffer downstream.
    if isinstance(pcds, pa.ChunkedArray):
        pcds = pcds.combine_chunks()

    return eastings, northings, pcds
|
||||
|
||||
|
||||
def get_oa_uprns_arrays(
    east: np.ndarray,
    north: np.ndarray,
    postcodes,
    offsets: dict[str, tuple[int, int]],
    oa_code: str,
) -> tuple[np.ndarray, list[str]]:
    """Polars-free counterpart of :func:`get_oa_uprns`, safe to call in workers.

    Slices the fork-shareable arrays produced by :func:`extract_uprn_arrays`
    for a single OA.

    Args:
        east: Easting values, indexed consistently with ``north``/``postcodes``.
        north: Northing values.
        postcodes: Object exposing ``slice(offset, length)`` whose result has
            ``to_pylist()`` (e.g. the Arrow array from
            :func:`extract_uprn_arrays`).
        offsets: Maps each OA code to its half-open ``(start, end)`` row range.
        oa_code: OA whose rows are wanted.

    Returns:
        ``(points, postcodes)``: an ``(n, 2)`` array of ``[east, north]`` rows
        and the matching postcodes as a Python list.

    Raises:
        KeyError: If ``oa_code`` is not a key of ``offsets``.
    """
    start, stop = offsets[oa_code]
    rows = slice(start, stop)
    coords = np.column_stack((east[rows], north[rows]))
    # ``slice`` here takes (offset, length), not (start, stop).
    oa_postcodes = postcodes.slice(start, stop - start).to_pylist()
    return coords, oa_postcodes
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue