idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -58,7 +58,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
|
|||
"potential_energy_rating": "B",
|
||||
"epc_property_type": "House",
|
||||
"built_form": "Mid-Terrace",
|
||||
"inspection_date": "2024-01-02",
|
||||
"inspection_date": date(2024, 1, 2),
|
||||
"total_floor_area": 84.5,
|
||||
"number_habitable_rooms": None,
|
||||
"floor_height": 2.4,
|
||||
|
|
@ -179,6 +179,65 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
assert df.get_column("historical_prices").list.len().to_list() == [2]
|
||||
|
||||
|
||||
def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
    """Check that dedup keeps the certificate whose inspection date parses.

    Two certificates are written for the same property: one with a valid ISO
    inspection date and one with garbled text in that field. Under a raw
    descending string sort the garbled value "not-a-date" would outrank the
    ISO date and be picked as the "latest" certificate, attaching a stale
    rating and floor area. Treated as a null Date it must lose instead, so
    the output is expected to carry the valid cert's rating ("C") and floor
    area (85.0).
    """
    # The cert that should survive the dedup.
    valid_cert = _row(
        current_energy_rating="c",
        inspection_date="2024-01-01",
        total_floor_area="85",
    )
    # Same property; an unparseable date (OCR/garbled) that must not win.
    garbled_cert = _row(
        current_energy_rating="g",
        inspection_date="not-a-date",
        total_floor_area="40",
    )

    # Render both certificates to CSV text before packing them into the zip.
    csv_text = io.StringIO()
    cert_writer = csv.DictWriter(csv_text, fieldnames=EPC_SOURCE_COLUMNS)
    cert_writer.writeheader()
    cert_writer.writerows([valid_cert, garbled_cert])

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.writestr("certificates-2024.csv", csv_text.getvalue())

    # A single price-paid transaction to join the certificates against.
    sale_columns = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [None],
        "street": ["Example Street"],
        "locality": [None],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
        "ppd_category": ["A"],
    }
    sales_parquet = tmp_path / "price-paid.parquet"
    pl.DataFrame(sale_columns).write_parquet(sales_parquet)

    joined_parquet = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)

    result = pl.read_parquet(joined_parquet)

    assert result.height == 1
    # The valid-dated cert's facts are kept; the garbled-date cert is NOT chosen.
    kept_facts = result.select("current_energy_rating", "total_floor_area").to_dicts()
    assert kept_facts == [{"current_energy_rating": "C", "total_floor_area": 85.0}]
|
||||
|
||||
|
||||
def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
|
||||
zip_path = tmp_path / "domestic-csv.zip"
|
||||
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue