improve
This commit is contained in:
parent
8688b7475e
commit
e8345cbdc1
40 changed files with 1980 additions and 904 deletions
|
|
@ -13,6 +13,33 @@ def _canonical_postcode_expr(name: str) -> pl.Expr:
|
|||
return pl.col(name).str.strip_chars().str.to_uppercase()
|
||||
|
||||
|
||||
def _active_english_arcgis_postcodes(arcgis_path: Path) -> pl.LazyFrame:
    """Build a lazy table of usable postcode points from the ArcGIS parquet file.

    Rows are kept only when ``ctry25cd`` equals ``"E92000001"`` and ``doterm``
    is null (the "active English" subset the function name refers to). The
    surviving rows are projected to the columns ``PCDS`` (canonicalised via
    ``_canonical_postcode_expr``), ``GRIDGB1E``/``GRIDGB1N`` (``east1m`` and
    ``north1m`` cast to ``Float64``) and ``OA21CD``, then rows with a missing
    or empty postcode, missing coordinates, or an ``OA21CD`` that is null or
    does not start with ``"E"`` are dropped.

    Args:
        arcgis_path: Location of the parquet file to read.

    Returns:
        A ``LazyFrame`` with one row per distinct ``PCDS``. NOTE(review):
        ``unique("PCDS")`` is called without a ``keep`` argument, so which of
        several duplicate rows survives is left to polars' default.
    """
    source_columns = ["pcds", "east1m", "north1m", "oa21cd", "ctry25cd", "doterm"]
    # The file is read eagerly (only the needed columns) and then made lazy.
    raw = pl.read_parquet(arcgis_path, columns=source_columns).lazy()

    in_england = pl.col("ctry25cd") == "E92000001"
    not_terminated = pl.col("doterm").cast(pl.Utf8).is_null()
    live_rows = raw.filter(in_england).filter(not_terminated)

    # Rename to the upper-case column names used by the returned frame.
    projected = live_rows.select(
        _canonical_postcode_expr("pcds").alias("PCDS"),
        pl.col("east1m").cast(pl.Float64).alias("GRIDGB1E"),
        pl.col("north1m").cast(pl.Float64).alias("GRIDGB1N"),
        pl.col("oa21cd").alias("OA21CD"),
    )

    has_postcode = pl.col("PCDS").is_not_null() & (pl.col("PCDS") != "")
    has_coordinates = (
        pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null()
    )
    has_english_oa = (
        pl.col("OA21CD").is_not_null() & pl.col("OA21CD").str.starts_with("E")
    )

    usable = projected.filter(has_postcode & has_coordinates & has_english_oa)
    return usable.unique("PCDS")
|
||||
|
||||
|
||||
def load_uprns(
|
||||
uprn_path: Path, arcgis_path: Path | None = None
|
||||
) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
|
||||
|
|
@ -25,6 +52,7 @@ def load_uprns(
|
|||
|
||||
print("Loading UPRN lookup...")
|
||||
mapping = None
|
||||
active_postcode_points = None
|
||||
if arcgis_path is not None:
|
||||
mapping = (
|
||||
build_postcode_mapping(arcgis_path)
|
||||
|
|
@ -34,6 +62,7 @@ def load_uprns(
|
|||
)
|
||||
.unique("old_postcode")
|
||||
)
|
||||
active_postcode_points = _active_english_arcgis_postcodes(arcgis_path)
|
||||
|
||||
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
|
||||
with tempfile.NamedTemporaryFile(
|
||||
|
|
@ -51,11 +80,21 @@ def load_uprns(
|
|||
|
||||
if mapping is not None and mapping.height > 0:
|
||||
uprns = (
|
||||
uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
|
||||
uprns.join(
|
||||
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
|
||||
)
|
||||
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
|
||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
)
|
||||
|
||||
if active_postcode_points is not None:
|
||||
active_postcodes = active_postcode_points.select("PCDS").unique()
|
||||
uprns = uprns.join(active_postcodes, on="PCDS", how="semi")
|
||||
missing_active = active_postcode_points.join(
|
||||
uprns.select("PCDS").unique(), on="PCDS", how="anti"
|
||||
).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
uprns = pl.concat([uprns, missing_active], how="vertical_relaxed")
|
||||
|
||||
uprns.sort("OA21CD").sink_parquet(tmp_path)
|
||||
release_memory()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue