Add postcode boundary calculation
This commit is contained in:
parent
f9bd218a3e
commit
f5e6894c0f
14 changed files with 1384 additions and 717 deletions
84
pipeline/transform/postcode_boundaries/uprn.py
Normal file
84
pipeline/transform/postcode_boundaries/uprn.py
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from .memory import release_memory
|
||||
|
||||
|
||||
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
|
||||
"""Load UPRNs as a sorted polars DataFrame with OA offset lookup.
|
||||
|
||||
Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
|
||||
Peak ~5GB during sort, steady state ~1.5GB (Arrow columnar with compact strings).
|
||||
"""
|
||||
import tempfile
|
||||
|
||||
print("Loading UPRN lookup...")
|
||||
|
||||
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
|
||||
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
|
||||
tmp_path = Path(tmp.name)
|
||||
(
|
||||
pl.scan_parquet(uprn_path)
|
||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
.filter(~pl.col("OA21CD").str.starts_with("S"))
|
||||
.filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
|
||||
.with_columns(pl.col("PCDS").str.strip_chars())
|
||||
.filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
|
||||
.sort("OA21CD")
|
||||
.sink_parquet(tmp_path)
|
||||
)
|
||||
release_memory()
|
||||
|
||||
# Read the sorted data — only one copy in memory (~2GB)
|
||||
df = pl.read_parquet(tmp_path)
|
||||
tmp_path.unlink()
|
||||
n = len(df)
|
||||
print(f" Loaded {n:,} UPRNs (England & Wales)")
|
||||
|
||||
# Compute OA group offsets using polars (avoids 37M Python string creation)
|
||||
boundary_df = (
|
||||
df.lazy()
|
||||
.with_row_index("_i")
|
||||
.filter(
|
||||
pl.col("OA21CD").shift(1).is_null()
|
||||
| (pl.col("OA21CD") != pl.col("OA21CD").shift(1))
|
||||
)
|
||||
.select("_i", "OA21CD")
|
||||
.collect()
|
||||
)
|
||||
starts_list = boundary_df["_i"].to_list()
|
||||
oa_list = boundary_df["OA21CD"].to_list()
|
||||
del boundary_df
|
||||
offsets: dict[str, tuple[int, int]] = {}
|
||||
for j in range(len(starts_list)):
|
||||
end = starts_list[j + 1] if j + 1 < len(starts_list) else n
|
||||
offsets[oa_list[j]] = (starts_list[j], end)
|
||||
del starts_list, oa_list
|
||||
|
||||
# Drop OA column (no longer needed) to save ~400MB
|
||||
df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
|
||||
release_memory()
|
||||
|
||||
print(f" Grouped into {len(offsets)} OAs")
|
||||
return df, offsets
|
||||
|
||||
|
||||
def get_oa_uprns(
|
||||
df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
|
||||
) -> tuple[np.ndarray, list[str]]:
|
||||
"""Get UPRN coordinates and postcodes for a single OA.
|
||||
|
||||
Returns (points_nx2, postcodes_list).
|
||||
"""
|
||||
s, e = offsets[oa_code]
|
||||
sub = df[s:e]
|
||||
points = np.column_stack(
|
||||
[
|
||||
sub["GRIDGB1E"].to_numpy(),
|
||||
sub["GRIDGB1N"].to_numpy(),
|
||||
]
|
||||
)
|
||||
postcodes = sub["PCDS"].to_list()
|
||||
return points, postcodes
|
||||
Loading…
Add table
Add a link
Reference in a new issue