idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -106,7 +106,14 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
|||
.alias("potential_energy_rating"),
|
||||
_clean_string("property_type").alias("epc_property_type"),
|
||||
_clean_string("built_form").alias("built_form"),
|
||||
_clean_string("inspection_date").alias("inspection_date"),
|
||||
# Parse to a real Date once (unparseable/blank -> null) so dedup can
|
||||
# sort newest-first with nulls_last and _event_year can use dt.year();
|
||||
# a lexicographic string sort would let a null/garbled date win under
|
||||
# Polars' default nulls-first descending order. EPC inspection dates
|
||||
# are ISO (YYYY-MM-DD).
|
||||
_clean_string("inspection_date")
|
||||
.str.to_date(format="%Y-%m-%d", strict=False)
|
||||
.alias("inspection_date"),
|
||||
_clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
|
||||
_clean_number("number_habitable_rooms", pl.Int16).alias(
|
||||
"number_habitable_rooms"
|
||||
|
|
@ -247,9 +254,11 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
|
||||
)
|
||||
|
||||
# Dedup fork: keep latest certificate per property (existing logic)
|
||||
# Dedup fork: keep latest certificate per property. inspection_date is a typed
|
||||
# Date (see _select_epc_columns); nulls_last keeps a real-dated cert ahead of a
|
||||
# null/unparseable-dated one so the genuinely newest certificate is chosen.
|
||||
epc = (
|
||||
epc_base.sort("inspection_date", descending=True)
|
||||
epc_base.sort("inspection_date", descending=True, nulls_last=True)
|
||||
.group_by("_epc_match_address", "_epc_match_postcode")
|
||||
.first()
|
||||
.drop("tenure")
|
||||
|
|
@ -303,11 +312,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
)
|
||||
.filter(pl.col("_event").is_not_null())
|
||||
.with_columns(
|
||||
pl.col("inspection_date")
|
||||
.cast(pl.String)
|
||||
.str.slice(0, 4)
|
||||
.cast(pl.Int32)
|
||||
.alias("_event_year"),
|
||||
pl.col("inspection_date").dt.year().cast(pl.Int32).alias("_event_year"),
|
||||
)
|
||||
.group_by("_epc_match_address", "_epc_match_postcode")
|
||||
.agg(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue