This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -106,7 +106,14 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
.alias("potential_energy_rating"),
_clean_string("property_type").alias("epc_property_type"),
_clean_string("built_form").alias("built_form"),
_clean_string("inspection_date").alias("inspection_date"),
# Parse to a real Date once (unparseable/blank -> null) so dedup can
# sort newest-first with nulls_last and _event_year can use dt.year();
# a lexicographic string sort would let a null/garbled date win under
# Polars' default nulls-first descending order. EPC inspection dates
# are ISO (YYYY-MM-DD).
_clean_string("inspection_date")
.str.to_date(format="%Y-%m-%d", strict=False)
.alias("inspection_date"),
_clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
_clean_number("number_habitable_rooms", pl.Int16).alias(
"number_habitable_rooms"
@ -247,9 +254,11 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
)
# Dedup fork: keep latest certificate per property (existing logic)
# Dedup fork: keep latest certificate per property. inspection_date is a typed
# Date (see _select_epc_columns); nulls_last keeps a real-dated cert ahead of a
# null/unparseable-dated one so the genuinely newest certificate is chosen.
epc = (
epc_base.sort("inspection_date", descending=True)
epc_base.sort("inspection_date", descending=True, nulls_last=True)
.group_by("_epc_match_address", "_epc_match_postcode")
.first()
.drop("tenure")
@ -303,11 +312,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
)
.filter(pl.col("_event").is_not_null())
.with_columns(
pl.col("inspection_date")
.cast(pl.String)
.str.slice(0, 4)
.cast(pl.Int32)
.alias("_event_year"),
pl.col("inspection_date").dt.year().cast(pl.Int32).alias("_event_year"),
)
.group_by("_epc_match_address", "_epc_match_postcode")
.agg(