84 lines
2.6 KiB
Python
84 lines
2.6 KiB
Python
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
|
|
from .memory import release_memory
|
|
|
|
|
|
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
    """Load UPRNs as a sorted polars DataFrame with OA offset lookup.

    Reads the parquet file at ``uprn_path`` and keeps only rows that:

    * have an ``OA21CD`` that does not start with ``"S"`` (the progress
      message labels the remainder "England & Wales"),
    * have non-null ``GRIDGB1E`` and ``GRIDGB1N``,
    * have a non-null, non-empty ``PCDS`` after stripping whitespace.

    The surviving rows are sorted by ``OA21CD`` so every OA occupies one
    contiguous row range.

    Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
    ``end_row`` is exclusive. The returned ``df`` contains only the columns
    ``GRIDGB1E``, ``GRIDGB1N`` and ``PCDS``; the OA code lives in ``offsets``.

    Peak ~5GB during sort, steady state ~1.5GB (Arrow columnar with compact strings).
    """
    import tempfile

    print("Loading UPRN lookup...")

    # Reserve a temp path and close the handle straight away: polars writes to
    # the path itself, and an open handle can block that on some platforms.
    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
        tmp_path = Path(tmp.name)

    try:
        # Sort via streaming sink to avoid polars doubling memory during in-memory sort
        (
            pl.scan_parquet(uprn_path)
            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
            .filter(~pl.col("OA21CD").str.starts_with("S"))
            .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
            .with_columns(pl.col("PCDS").str.strip_chars())
            .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
            .sort("OA21CD")
            .sink_parquet(tmp_path)
        )
        release_memory()

        # Read the sorted data — only one copy in memory (~2GB)
        df = pl.read_parquet(tmp_path)
    finally:
        # Always remove the (potentially multi-GB) temp file, even when the
        # sink or the read fails part-way through.
        tmp_path.unlink(missing_ok=True)

    n = len(df)
    print(f"  Loaded {n:,} UPRNs (England & Wales)")

    # Compute OA group offsets using polars (avoids 37M Python string creation).
    # A row starts a new group when it is the first row or its OA code differs
    # from the previous row's.
    boundary_df = (
        df.lazy()
        .with_row_index("_i")
        .filter(
            pl.col("OA21CD").shift(1).is_null()
            | (pl.col("OA21CD") != pl.col("OA21CD").shift(1))
        )
        .select("_i", "OA21CD")
        .collect()
    )
    starts = boundary_df["_i"].to_list()
    oa_codes = boundary_df["OA21CD"].to_list()
    del boundary_df

    # Each group ends where the next one starts; the last group ends at n.
    ends = starts[1:] + [n]
    offsets: dict[str, tuple[int, int]] = {
        oa: (start, end) for oa, start, end in zip(oa_codes, starts, ends)
    }
    del starts, ends, oa_codes

    # Drop OA column (no longer needed) to save ~400MB
    df = df.select("GRIDGB1E", "GRIDGB1N", "PCDS")
    release_memory()

    print(f"  Grouped into {len(offsets)} OAs")
    return df, offsets
|
|
|
|
|
|
def get_oa_uprns(
    df: pl.DataFrame, offsets: dict[str, tuple[int, int]], oa_code: str
) -> tuple[np.ndarray, list[str]]:
    """Fetch the UPRN coordinates and postcodes belonging to one OA.

    Looks up the OA's ``(start_row, end_row)`` pair in ``offsets`` and takes
    that row range out of ``df``. An ``oa_code`` missing from ``offsets``
    raises ``KeyError``.

    Returns (points_nx2, postcodes_list): column 0 of the points array is
    ``GRIDGB1E``, column 1 is ``GRIDGB1N``, and the postcodes come from
    ``PCDS`` in the same row order.
    """
    first_row, stop_row = offsets[oa_code]
    oa_rows = df[first_row:stop_row]

    eastings = oa_rows["GRIDGB1E"].to_numpy()
    northings = oa_rows["GRIDGB1N"].to_numpy()
    coords = np.column_stack((eastings, northings))

    return coords, oa_rows["PCDS"].to_list()
|