This commit is contained in:
Andras Schmelczer 2026-05-14 08:09:19 +01:00
parent a8165249a4
commit a4103b0896
64 changed files with 5376 additions and 3832 deletions

View file

@ -136,17 +136,17 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
price_paid_path = tmp_path / "price-paid.parquet"
pl.DataFrame(
{
"price": [250_000],
"date_of_transfer": [date(2024, 2, 3)],
"property_type": ["T"],
"postcode": ["AA1 1AA"],
"paon": ["1"],
"saon": [None],
"street": ["Example Street"],
"locality": [None],
"town_city": ["Exampletown"],
"duration": ["F"],
"old_new": ["N"],
"price": [200_000, 250_000],
"date_of_transfer": [date(2020, 2, 3), date(2024, 2, 3)],
"property_type": ["T", "T"],
"postcode": ["AA1 1AA", "AA1 1AA"],
"paon": ["1", "1"],
"saon": [None, None],
"street": ["Example-Street", "Example Street"],
"locality": [None, None],
"town_city": ["Exampletown", "Exampletown"],
"duration": ["F", "F"],
"old_new": ["N", "N"],
}
).write_parquet(price_paid_path)
@ -172,3 +172,85 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
}
]
assert df.get_column("renovation_history").list.len().to_list() == [1]
assert df.get_column("historical_prices").list.len().to_list() == [2]
def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
    """Check that a price-paid row with an empty postcode is left out of the output.

    Two price-paid rows are written: one with postcode "AA1 1AA" and one with
    an empty postcode string. After ``_run`` the output parquet is expected to
    contain only the "AA1 1AA" row.
    """
    # EPC input: a header plus one default certificate row, rendered to CSV text.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    # Price-paid input: the second row has an empty postcode.
    price_paid_columns = {
        "price": [250_000, 300_000],
        "date_of_transfer": [date(2024, 2, 3), date(2024, 2, 4)],
        "property_type": ["T", "T"],
        "postcode": ["AA1 1AA", ""],
        "paon": ["1", "2"],
        "saon": [None, None],
        "street": ["Example Street", "Example Street"],
        "locality": [None, None],
        "town_city": ["Exampletown", "Exampletown"],
        "duration": ["F", "F"],
        "old_new": ["N", "N"],
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(price_paid_columns).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)
    surviving_postcodes = result.get_column("postcode").to_list()
    assert surviving_postcodes == ["AA1 1AA"]
def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path):
    """Check that a mismatched EPC address contributes no EPC columns.

    The EPC certificate is written with address "1 Totally Different Road",
    while the price-paid row is for paon "1" on "Example Street". The output
    is expected to hold exactly one row, with ``pp_address`` set to
    "1 Example Street" and the EPC-derived columns all null.
    """
    # EPC input: a single certificate whose address differs from the sale's.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row(address="1 Totally Different Road"))

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    # Price-paid input: one sale in the same postcode.
    price_paid_columns = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [None],
        "street": ["Example Street"],
        "locality": [None],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(price_paid_columns).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)
    assert result.height == 1

    # The sale row survives, but nothing from the EPC record is attached to it.
    expected_row = {
        "pp_address": "1 Example Street",
        "epc_address": None,
        "total_floor_area": None,
        "current_energy_rating": None,
    }
    observed_rows = result.select(
        "pp_address",
        "epc_address",
        "total_floor_area",
        "current_energy_rating",
    ).to_dicts()
    assert observed_rows == [expected_row]