Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -109,6 +109,27 @@ def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
return _clean_string(column).cast(dtype, strict=False)
def _join_address_parts(*columns: str) -> pl.Expr:
"""Join address components into one display address, single-spaced.
Price-paid SAON/PAON/STREET are EMPTY STRINGS (not null) when absent
saon is "" on ~88% of rows and ``concat_str(..., ignore_nulls=True)``
skips only nulls, so empty components still contributed their separator
(``' 10 PALACE GREEN'``, doubled spaces when a middle part was empty).
Convert ``''``null per component so ignore_nulls works as intended, then
defensively collapse residual whitespace runs and strip the result. A
fully-empty address becomes null (dropped by the downstream
``pp_address.is_not_null()`` filter) instead of whitespace junk.
"""
joined = pl.concat_str(
[_clean_string(column) for column in columns],
separator=" ",
ignore_nulls=True,
)
cleaned = joined.str.replace_all(r"\s+", " ").str.strip_chars()
return pl.when(cleaned == "").then(None).otherwise(cleaned)
def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
return (
raw.select(
@ -436,11 +457,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
)
.filter(pl.col("pp_property_type") != "Other")
.with_columns(
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
separator=" ",
ignore_nulls=True,
).alias("pp_address"),
_join_address_parts("saon", "paon", "street").alias("pp_address"),
)
.with_columns(
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),