This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -58,7 +58,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
"potential_energy_rating": "B",
"epc_property_type": "House",
"built_form": "Mid-Terrace",
"inspection_date": "2024-01-02",
"inspection_date": date(2024, 1, 2),
"total_floor_area": 84.5,
"number_habitable_rooms": None,
"floor_height": 2.4,
@ -179,6 +179,65 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
assert df.get_column("historical_prices").list.len().to_list() == [2]
def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
    """Check that a garbled inspection date does not win certificate dedup.

    Two certificate rows are built with ``_row`` and differ only in the
    overridden fields: rating, inspection date and floor area. One carries
    the ISO date "2024-01-01"; the other carries "not-a-date". After the
    pipeline runs against a single price-paid record, the output must hold
    exactly one row whose rating is "C" and whose floor area is 85.0, i.e.
    the facts of the validly dated certificate.

    NOTE(review): both rows are presumed to describe the same property via
    ``_row``'s defaults, and the pipeline is presumed to upper-case the
    rating -- confirm against ``_row`` and the pipeline code.
    """
    # Certificate with a well-formed ISO inspection date: the expected winner.
    dated_cert = _row(
        current_energy_rating="c",
        inspection_date="2024-01-01",
        total_floor_area="85",
    )
    # Certificate with an unparseable (OCR/garbled) date. Compared as raw
    # strings, "not-a-date" sorts after "2024-01-01" ('n' > '2'), so a
    # descending string sort would pick it as "latest" and attach its stale
    # rating/floor area. Presumably the pipeline parses the column to a Date,
    # making this value null so that it loses the dedup instead.
    garbled_cert = _row(
        current_energy_rating="g",
        inspection_date="not-a-date",
        total_floor_area="40",
    )

    # Render both certificates as CSV text using the EPC source header.
    certificates_csv = io.StringIO()
    cert_writer = csv.DictWriter(certificates_csv, fieldnames=EPC_SOURCE_COLUMNS)
    cert_writer.writeheader()
    cert_writer.writerows([dated_cert, garbled_cert])

    # Package the CSV into the domestic certificates zip.
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.writestr("certificates-2024.csv", certificates_csv.getvalue())

    # A single price-paid transaction, written as the parquet input.
    price_paid_columns = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [None],
        "street": ["Example Street"],
        "locality": [None],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
        "ppd_category": ["A"],
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(price_paid_columns).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)

    # The two certificates must collapse into one output row.
    assert result.height == 1

    # That row carries the valid-dated certificate's facts, not the garbled
    # one's ("g" / 40).
    kept_facts = result.select("current_energy_rating", "total_floor_area").to_dicts()
    expected_facts = [{"current_energy_rating": "C", "total_floor_area": 85.0}]
    assert kept_facts == expected_facts
def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
zip_path = tmp_path / "domestic-csv.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive: