scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
|
|
@ -4,11 +4,18 @@ import numpy as np
|
|||
import polars as pl
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.utils.postcode_mapping import build_postcode_mapping
|
||||
|
||||
from .memory import release_memory
|
||||
|
||||
|
||||
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
|
||||
def _canonical_postcode_expr(name: str) -> pl.Expr:
    """Return an expression that puts the postcode column *name* in canonical form.

    The expression applies ``str.strip_chars()`` (default arguments) to the
    column and then upper-cases the result. The caller decides the output
    column name, e.g. via ``.alias(...)``.
    """
    trimmed = pl.col(name).str.strip_chars()
    return trimmed.str.to_uppercase()
|
||||
|
||||
|
||||
def load_uprns(
|
||||
uprn_path: Path, arcgis_path: Path | None = None
|
||||
) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
|
||||
"""Load UPRNs as a sorted polars DataFrame with OA offset lookup.
|
||||
|
||||
Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
|
||||
|
|
@ -17,29 +24,46 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
|
|||
import tempfile
|
||||
|
||||
print("Loading UPRN lookup...")
|
||||
mapping = None
|
||||
if arcgis_path is not None:
|
||||
mapping = (
|
||||
build_postcode_mapping(arcgis_path)
|
||||
.with_columns(
|
||||
_canonical_postcode_expr("old_postcode").alias("old_postcode"),
|
||||
_canonical_postcode_expr("new_postcode").alias("new_postcode"),
|
||||
)
|
||||
.unique("old_postcode")
|
||||
)
|
||||
|
||||
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".parquet", delete=False, dir=local_tmp_dir()
|
||||
) as tmp:
|
||||
tmp_path = Path(tmp.name)
|
||||
(
|
||||
uprns = (
|
||||
pl.scan_parquet(uprn_path)
|
||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
.filter(~pl.col("OA21CD").str.starts_with("S"))
|
||||
.filter(pl.col("OA21CD").str.starts_with("E"))
|
||||
.filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
|
||||
.with_columns(pl.col("PCDS").str.strip_chars())
|
||||
.with_columns(_canonical_postcode_expr("PCDS").alias("PCDS"))
|
||||
.filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
|
||||
.sort("OA21CD")
|
||||
.sink_parquet(tmp_path)
|
||||
)
|
||||
|
||||
if mapping is not None and mapping.height > 0:
|
||||
uprns = (
|
||||
uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
|
||||
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
|
||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
)
|
||||
|
||||
uprns.sort("OA21CD").sink_parquet(tmp_path)
|
||||
release_memory()
|
||||
|
||||
# Read the sorted data — only one copy in memory (~2GB)
|
||||
df = pl.read_parquet(tmp_path)
|
||||
tmp_path.unlink()
|
||||
n = len(df)
|
||||
print(f" Loaded {n:,} UPRNs (England & Wales)")
|
||||
print(f" Loaded {n:,} UPRNs (England)")
|
||||
|
||||
# Compute OA group offsets using polars (avoids 37M Python string creation)
|
||||
boundary_df = (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue