This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -149,6 +149,7 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
"town_city": ["Exampletown", "Exampletown"],
"duration": ["F", "F"],
"old_new": ["N", "N"],
"ppd_category": ["A", "A"],
}
).write_parquet(price_paid_path)
@ -201,6 +202,7 @@ def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
"town_city": ["Exampletown", "Exampletown"],
"duration": ["F", "F"],
"old_new": ["N", "N"],
"ppd_category": ["A", "A"],
}
).write_parquet(price_paid_path)
@ -235,6 +237,7 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
"town_city": ["Exampletown"],
"duration": ["F"],
"old_new": ["N"],
"ppd_category": ["A"],
}
).write_parquet(price_paid_path)
@ -259,6 +262,93 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
]
def test_run_excludes_category_b_sales_from_price_aggregations(tmp_path: Path):
    """Category B price-paid rows must not feed the price aggregations.

    Category B entries (repossessions, bulk/portfolio, power-of-sale) are
    expected to stay out of ``latest_price`` / ``historical_prices``, while the
    property itself is still emitted thanks to its standard Category A sales.
    """
    # Render a one-certificate EPC CSV in memory before packing it into the zip.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.writestr("certificates-2024.csv", epc_csv.getvalue())

    # Three sales of the same address; only the category differs on the last one.
    n_sales = 3
    sales = pl.DataFrame(
        {
            "price": [200_000, 250_000, 5_000_000],
            "date_of_transfer": [
                date(2020, 2, 3),
                date(2022, 2, 3),
                date(2024, 2, 3),
            ],
            "property_type": ["T"] * n_sales,
            "postcode": ["AA1 1AA"] * n_sales,
            "paon": ["1"] * n_sales,
            "saon": [None] * n_sales,
            "street": ["Example Street"] * n_sales,
            "locality": [None] * n_sales,
            "town_city": ["Exampletown"] * n_sales,
            "duration": ["F"] * n_sales,
            "old_new": ["N"] * n_sales,
            # The latest (5M) sale is a Category B bulk/portfolio transfer.
            "ppd_category": ["A", "A", "B"],
        }
    )
    price_paid_path = tmp_path / "price-paid.parquet"
    sales.write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)
    assert result.height == 1

    # Only the two Category A sales survive; the 5M Category B transfer is dropped.
    latest_prices = result.get_column("latest_price").to_list()
    assert latest_prices == [250_000]
    history_lengths = result.get_column("historical_prices").list.len().to_list()
    assert history_lengths == [2]
def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: Path):
    """A sub-MIN_PRICE first sale still dates a new-build's construction.

    A new-build whose earliest sale is below MIN_PRICE must keep that early
    year as its EXACT construction date, while ``latest_price`` is derived only
    from the quality-passing (>=MIN_PRICE) sale.
    """
    # Render a one-certificate EPC CSV in memory before packing it into the zip.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.writestr("certificates-2024.csv", epc_csv.getvalue())

    # Two sales of the same address: an early cheap one and a later regular one.
    n_sales = 2
    sales = pl.DataFrame(
        {
            "price": [30_000, 300_000],
            "date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
            "property_type": ["T"] * n_sales,
            "postcode": ["AA1 1AA"] * n_sales,
            "paon": ["1"] * n_sales,
            "saon": [None] * n_sales,
            "street": ["Example Street"] * n_sales,
            "locality": [None] * n_sales,
            "town_city": ["Exampletown"] * n_sales,
            "duration": ["F"] * n_sales,
            "old_new": ["Y"] * n_sales,
            "ppd_category": ["A"] * n_sales,
        }
    )
    price_paid_path = tmp_path / "price-paid.parquet"
    sales.write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)
    assert result.height == 1

    # Construction year is the genuine earliest transfer (2015), flagged EXACT,
    # even though that sale is below MIN_PRICE.
    construction_years = result.get_column("construction_age_band").to_list()
    assert construction_years == [2015]
    approximate_flags = result.get_column("is_construction_date_approximate").to_list()
    assert approximate_flags == [0]

    # latest_price uses only the >=MIN_PRICE sale; the sub-MIN sale is excluded.
    latest_prices = result.get_column("latest_price").to_list()
    assert latest_prices == [300_000]
    history_lengths = result.get_column("historical_prices").list.len().to_list()
    assert history_lengths == [1]
def test_epc_band_to_year_uses_midpoint_and_clamps():
import polars as pl