Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -109,6 +109,27 @@ def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
|
|||
return _clean_string(column).cast(dtype, strict=False)
|
||||
|
||||
|
||||
def _join_address_parts(*columns: str) -> pl.Expr:
    """Build one single-spaced display address from address component columns.

    In the price-paid data an absent SAON/PAON/STREET arrives as an EMPTY
    STRING rather than a null. ``concat_str(..., ignore_nulls=True)`` skips
    nulls only, so an empty component would still emit its separator and
    leave leading or doubled spaces in the result.

    Each component is therefore passed through ``_clean_string`` before the
    join (expected to turn ``''`` into null so that ``ignore_nulls`` can do
    its job). As a safety net, any remaining whitespace runs are collapsed to
    a single space and the result is stripped.

    An address with nothing left after cleaning is returned as null, so a
    downstream ``is_not_null()`` filter on the address drops it instead of
    keeping whitespace junk.
    """
    parts = [_clean_string(name) for name in columns]
    raw_address = pl.concat_str(parts, separator=" ", ignore_nulls=True)

    # Defensive tidy-up: squeeze whitespace runs, then trim both ends.
    tidy = raw_address.str.replace_all(r"\s+", " ").str.strip_chars()

    # An all-empty address becomes null rather than an empty string.
    return pl.when(tidy == "").then(None).otherwise(tidy)
|
||||
|
||||
|
||||
def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
||||
return (
|
||||
raw.select(
|
||||
|
|
@ -436,11 +457,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
)
|
||||
.filter(pl.col("pp_property_type") != "Other")
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||
separator=" ",
|
||||
ignore_nulls=True,
|
||||
).alias("pp_address"),
|
||||
_join_address_parts("saon", "paon", "street").alias("pp_address"),
|
||||
)
|
||||
.with_columns(
|
||||
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue