try
Some checks failed
CI / Check (push) Failing after 3m22s
Build and publish Docker image / build-and-push (push) Successful in 7m25s

This commit is contained in:
Andras Schmelczer 2026-06-04 22:34:26 +01:00
parent 843d14b7ba
commit c938b71904
13 changed files with 698 additions and 109 deletions

View file

@ -378,7 +378,10 @@ def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: P
price_paid_path = tmp_path / "price-paid.parquet"
pl.DataFrame(
{
"price": [30_000, 300_000],
# 5_000 is below MIN_PRICE (10_000) — a nominal/junk transfer that
# must still anchor the construction year but stay out of the price
# aggregations.
"price": [5_000, 300_000],
"date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
"property_type": ["T", "T"],
"postcode": ["AA1 1AA", "AA1 1AA"],
@ -408,6 +411,48 @@ def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: P
assert df.get_column("historical_prices").list.len().to_list() == [1]
def test_run_keeps_sale_above_lowered_min_price(tmp_path: Path):
    """Pin the 50k -> 10k price-floor change with a genuine 30_000 sale.

    30_000 sits between the OLD floor (50k) and the NEW floor (10k), so it
    must now be RETAINED in the price aggregations. On the pre-fix 50k floor
    the sale was excluded, which gave ``historical_prices`` a length of 1 and
    a ``latest_price`` of 250_000 -- so this test fails there.
    """
    # EPC side: a one-certificate CSV (header + a single row), zipped.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as epc_zip:
        epc_zip.writestr("certificates-2024.csv", epc_csv.getvalue())

    # Price-paid side: two transfers of the same address; the later (2022)
    # one is the cheap 30_000 sale under test.
    sales = {
        "price": [250_000, 30_000],
        "date_of_transfer": [date(2018, 2, 3), date(2022, 2, 3)],
        "property_type": ["T", "T"],
        "postcode": ["AA1 1AA", "AA1 1AA"],
        "paon": ["1", "1"],
        "saon": [None, None],
        "street": ["Example Street", "Example Street"],
        "locality": [None, None],
        "town_city": ["Exampletown", "Exampletown"],
        "duration": ["F", "F"],
        "old_new": ["N", "N"],
        "ppd_category": ["A", "A"],
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)
    assert result.height == 1

    # Both sales now survive the 10k floor; the 30_000 (2022) is the most recent.
    price_counts = result.get_column("historical_prices").list.len().to_list()
    assert price_counts == [2]
    latest_prices = result.get_column("latest_price").to_list()
    assert latest_prices == [30_000]
def test_epc_band_to_year_uses_midpoint_and_clamps():
import polars as pl