787 lines
29 KiB
Python
787 lines
29 KiB
Python
import csv
|
|
import io
|
|
import zipfile
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.transform.join_epc_pp import (
|
|
EPC_SOURCE_COLUMNS,
|
|
_join_address_parts,
|
|
_run,
|
|
_scan_epc_certificates,
|
|
flag_price_outliers,
|
|
)
|
|
|
|
|
|
def _write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write ``rows`` to ``path`` as a CSV file with a header row.

    Args:
        path: Destination file; created or truncated.
        fieldnames: Column names, written as the header and used to order
            each row's values.
        rows: One mapping per CSV record, keyed by ``fieldnames``.
    """
    # newline="" leaves line endings to the csv module. The explicit encoding
    # keeps the fixture bytes the same on every platform instead of depending
    # on the locale's default encoding.
    with path.open("w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
|
|
|
|
|
|
def _row(**overrides: str) -> dict[str, str]:
    """Return one EPC certificate record as raw (string) CSV values.

    Args:
        **overrides: Column values that replace the defaults below; keys not
            present in the defaults are appended to the record.

    Returns:
        A fresh dict on every call, so callers may mutate it freely.
    """
    defaults = {
        "address": "1 Example Street",
        "address1": "1 Example Street",
        "address2": "Hale",
        "postcode": " aa1 1aa ",
        "uprn": "100012345678",
        "current_energy_rating": "c",
        "potential_energy_rating": "b",
        "property_type": "House",
        "built_form": "Mid-Terrace",
        "inspection_date": "2024-01-02",
        "total_floor_area": "84.5",
        "number_habitable_rooms": "5",
        "floor_height": "2.4",
        "construction_age_band": "England and Wales: 1950-1966",
        "tenure": "owner-occupied",
    }
    return {**defaults, **overrides}
|
|
|
|
|
|
def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
    # A plain (non-zip) CSV whose header and row keys are upper-cased must be
    # scanned into the same lower-case, renamed output columns.
    header = [name.upper() for name in EPC_SOURCE_COLUMNS]
    legacy_row = {}
    for name, value in _row().items():
        legacy_row[name.upper()] = value
    # A habitable-room count of "0" is expected to come back as null.
    legacy_row["NUMBER_HABITABLE_ROOMS"] = "0"

    source_csv = tmp_path / "certificates.csv"
    _write_csv(source_csv, header, [legacy_row])

    scanned = _scan_epc_certificates(source_csv, tmp_path).collect()

    expected_record = {
        "epc_address": "1 Example Street",
        "epc_address_a1": "1 Example Street",
        "epc_address_a12": "1 Example Street Hale",
        "epc_postcode": "AA1 1AA",
        "uprn": "100012345678",
        "current_energy_rating": "C",
        "potential_energy_rating": "B",
        "epc_property_type": "House",
        "built_form": "Mid-Terrace",
        "inspection_date": date(2024, 1, 2),
        "total_floor_area": 84.5,
        "number_habitable_rooms": None,
        "floor_height": 2.4,
        "construction_age_band": "England and Wales: 1950-1966",
        "tenure": "owner-occupied",
    }
    assert scanned.to_dicts() == [expected_record]
|
|
|
|
|
|
def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
    # The archive holds two certificates CSVs (one in a nested folder) plus a
    # recommendations CSV; only the two certificate rows are expected back.
    certificate_members = {
        "certificates-2023.csv": [
            _row(address="2 Example Street", inspection_date="2023-03-04"),
        ],
        "nested/certificates-2024.csv": [
            _row(
                address="3 Example Street",
                postcode="BB2 2BB",
                inspection_date="2024-05-06",
                total_floor_area="",
                tenure="Rented (social)",
            ),
        ],
    }
    header_line = ",".join(EPC_SOURCE_COLUMNS)

    archive_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for member_name, member_rows in certificate_members.items():
            lines = [header_line]
            for record in member_rows:
                lines.append(",".join(record[column] for column in EPC_SOURCE_COLUMNS))
            archive.writestr(member_name, "\n".join(lines) + "\n")
        archive.writestr("recommendations-2024.csv", "address,postcode\nignored,X\n")

    scanned = (
        _scan_epc_certificates(archive_path, tmp_path).sort("inspection_date").collect()
    )

    expected_core = [
        {
            "epc_address": "2 Example Street",
            "epc_postcode": "AA1 1AA",
            "total_floor_area": 84.5,
        },
        {
            "epc_address": "3 Example Street",
            "epc_postcode": "BB2 2BB",
            "total_floor_area": None,
        },
    ]
    core = scanned.select("epc_address", "epc_postcode", "total_floor_area")
    assert core.to_dicts() == expected_core
    assert scanned.get_column("tenure").to_list() == ["owner-occupied", "Rented (social)"]
    assert scanned.schema["number_habitable_rooms"] == pl.Int16
|
|
|
|
|
|
def test_join_address_parts_empty_string_components():
    # Price-paid SAON/PAON/STREET are empty strings (not null) when absent;
    # concat_str(ignore_nulls=True) alone leaked the separator into the
    # display address (' 10 PALACE GREEN') and doubled it for empty middle
    # components. Empty/whitespace-only parts must contribute nothing.
    df = pl.DataFrame(
        {
            "saon": ["", "FLAT 1", "FLAT 1", "FLAT 21", "", None, " ", " FLAT 2"],
            "paon": ["10", "10", "", "82", "", None, "10", "11 "],
            "street": [
                "PALACE GREEN",
                "HIGH STREET",
                "HIGH STREET",
                "",
                "",
                None,
                "PALACE GREEN",
                "STATION ROAD",
            ],
        }
    )
    out = df.select(
        _join_address_parts("saon", "paon", "street").alias("address")
    ).get_column("address")

    assert out.to_list() == [
        "10 PALACE GREEN",  # empty saon -> no leading space
        "FLAT 1 10 HIGH STREET",  # normal three-part address is unchanged
        "FLAT 1 HIGH STREET",  # empty middle component -> no double space
        "FLAT 21 82",  # empty street -> no trailing space
        None,  # all-empty -> null, not whitespace junk
        None,  # all-null -> null
        "10 PALACE GREEN",  # whitespace-only component treated as empty
        "FLAT 2 11 STATION ROAD",  # per-component padding is stripped
    ]
    # Invariant: every produced address is trimmed and single-spaced.
    produced = out.drop_nulls()
    assert produced.str.starts_with(" ").sum() == 0
    assert produced.str.ends_with(" ").sum() == 0
    # "Single-spaced" means no run of two spaces. Searching for one space
    # would match every expected address above and could never pass.
    assert produced.str.contains("  ", literal=True).sum() == 0
|
|
|
|
|
|
def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
    # Real price-paid rows carry saon == "" (not null) on ~88% of rows; the
    # published pp_address must not inherit a leading separator from it.
    zip_path = tmp_path / "domestic-csv.zip"
    # The default single-certificate archive is exactly what the shared helper
    # writes, so use it instead of repeating the zip/CSV boilerplate.
    _write_epc_zip(zip_path)

    price_paid_path = tmp_path / "price-paid.parquet"
    # Built inline rather than via _price_paid_frame: this test needs
    # empty-string (not null) saon and locality values.
    pl.DataFrame(
        {
            "price": [250_000],
            "date_of_transfer": [date(2024, 2, 3)],
            "property_type": ["T"],
            "postcode": ["AA1 1AA"],
            "paon": ["1"],
            "saon": [""],
            "street": ["Example Street"],
            "locality": [""],
            "town_city": ["Exampletown"],
            "duration": ["F"],
            "old_new": ["N"],
            "ppd_category": ["A"],
        }
    ).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)

    assert df.height == 1
    # No leading space, and the clean address still matches its EPC record.
    assert df.select("pp_address", "epc_address").to_dicts() == [
        {"pp_address": "1 Example Street", "epc_address": "1 Example Street"}
    ]
|
|
|
|
|
|
def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    # Two certificates (2023 and 2024) and two sales for the same property;
    # the output is expected to hold a single row carrying the 2024
    # certificate's rating and floor area.
    certificates = [
        _row(
            current_energy_rating="d",
            inspection_date="2023-01-01",
            total_floor_area="80",
            tenure="Rented (social)",
        ),
        _row(
            current_energy_rating="c",
            inspection_date="2024-01-01",
            total_floor_area="85",
            tenure="owner-occupied",
        ),
    ]
    certificate_csv = io.StringIO()
    certificate_writer = csv.DictWriter(certificate_csv, fieldnames=EPC_SOURCE_COLUMNS)
    certificate_writer.writeheader()
    for certificate in certificates:
        certificate_writer.writerow(certificate)

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", certificate_csv.getvalue())

    sales = pl.DataFrame(
        {
            "price": [200_000, 250_000],
            "date_of_transfer": [date(2020, 2, 3), date(2024, 2, 3)],
            "property_type": ["T"] * 2,
            "postcode": ["AA1 1AA"] * 2,
            "paon": ["1"] * 2,
            "saon": [None] * 2,
            # The first sale spells the street with a hyphen.
            "street": ["Example-Street", "Example Street"],
            "locality": [None] * 2,
            "town_city": ["Exampletown"] * 2,
            "duration": ["F"] * 2,
            "old_new": ["N"] * 2,
            "ppd_category": ["A"] * 2,
        }
    )
    sales_parquet = tmp_path / "price-paid.parquet"
    sales.write_parquet(sales_parquet)

    joined_parquet = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)

    joined = pl.read_parquet(joined_parquet)

    assert joined.height == 1

    expected_facts = {
        "epc_address": "1 Example Street",
        "current_energy_rating": "C",
        "total_floor_area": 85.0,
        # Band midpoint of 1950-1966, not the lower bound.
        "construction_age_band": 1958,
        "was_council_house": "Yes",
    }
    facts = joined.select(
        "epc_address",
        "current_energy_rating",
        "total_floor_area",
        "construction_age_band",
        "was_council_house",
    )
    assert facts.to_dicts() == [expected_facts]

    renovation_counts = joined.get_column("renovation_history").list.len().to_list()
    assert renovation_counts == [1]
    sale_counts = joined.get_column("historical_prices").list.len().to_list()
    assert sale_counts == [2]
    # Audit trail: the accepted fuzzy match's score is published (100 = exact
    # post-normalisation address match).
    assert joined.get_column("epc_match_score").to_list() == [100]
|
|
|
|
|
|
def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
    # Two certificates for the same property. The cert with the garbled,
    # unparseable inspection_date must NOT be chosen as "latest": a string sort
    # nulls-first would have picked it, attaching a stale rating/floor area. The
    # valid-dated cert wins, so its rating ("C") and floor area (85) survive.
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        csv_buffer = io.StringIO()
        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
        writer.writeheader()
        writer.writerows(
            [
                _row(
                    current_energy_rating="c",
                    inspection_date="2024-01-01",
                    total_floor_area="85",
                ),
                # Same property; an unparseable date (OCR/garbled). Under a raw
                # string descending sort "not-a-date" outranks the ISO date and
                # wins the dedup, but as a null Date it loses.
                _row(
                    current_energy_rating="g",
                    inspection_date="not-a-date",
                    total_floor_area="40",
                ),
            ]
        )
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())

    price_paid_path = tmp_path / "price-paid.parquet"
    # A single standard sale of the default property: exactly the frame the
    # shared helper builds, so reuse it rather than re-listing every column.
    _price_paid_frame(
        prices=[250_000],
        dates=[date(2024, 2, 3)],
    ).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)

    assert df.height == 1
    # The valid-dated cert's facts are kept; the garbled-date cert is NOT chosen.
    assert df.select("current_energy_rating", "total_floor_area").to_dicts() == [
        {"current_energy_rating": "C", "total_floor_area": 85.0}
    ]
|
|
|
|
|
|
def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
    # Two sales: one with a full postcode and one whose postcode is the empty
    # string. Only the former is expected in the output.
    zip_path = tmp_path / "domestic-csv.zip"
    # Default single-certificate archive: use the shared helper instead of
    # repeating the zip/CSV boilerplate.
    _write_epc_zip(zip_path)

    price_paid_path = tmp_path / "price-paid.parquet"
    # Built inline: the second row needs a different postcode and paon, which
    # _price_paid_frame does not parameterise.
    pl.DataFrame(
        {
            "price": [250_000, 300_000],
            "date_of_transfer": [date(2024, 2, 3), date(2024, 2, 4)],
            "property_type": ["T", "T"],
            "postcode": ["AA1 1AA", ""],
            "paon": ["1", "2"],
            "saon": [None, None],
            "street": ["Example Street", "Example Street"],
            "locality": [None, None],
            "town_city": ["Exampletown", "Exampletown"],
            "duration": ["F", "F"],
            "old_new": ["N", "N"],
            "ppd_category": ["A", "A"],
        }
    ).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)

    assert df["postcode"].to_list() == ["AA1 1AA"]
|
|
|
|
|
|
def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path):
    # The only certificate in the postcode is for "1 Totally Different Road";
    # the sale at "1 Example Street" is expected to keep its row but with every
    # EPC-derived column (and the match score) null.
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        csv_buffer = io.StringIO()
        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
        writer.writeheader()
        writer.writerow(_row(address="1 Totally Different Road"))
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())

    price_paid_path = tmp_path / "price-paid.parquet"
    # A single standard sale of the default property: exactly the frame the
    # shared helper builds, so reuse it rather than re-listing every column.
    _price_paid_frame(
        prices=[250_000],
        dates=[date(2024, 2, 3)],
    ).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)

    assert df.height == 1
    assert df.select(
        "pp_address",
        "epc_address",
        "total_floor_area",
        "current_energy_rating",
        "epc_match_score",
    ).to_dicts() == [
        {
            "pp_address": "1 Example Street",
            "epc_address": None,
            "total_floor_area": None,
            "current_energy_rating": None,
            # No accepted match -> no score.
            "epc_match_score": None,
        }
    ]
|
|
|
|
|
|
def test_run_excludes_category_b_sales_from_price_aggregations(tmp_path: Path):
    # Category B entries (repossessions, bulk/portfolio, power-of-sale) must not
    # pollute latest_price / historical_prices, but the property still survives
    # via its standard Category A sales.
    zip_path = tmp_path / "domestic-csv.zip"
    # Both fixtures match the shared helpers exactly, so use them instead of
    # repeating the zip and frame boilerplate.
    _write_epc_zip(zip_path)

    price_paid_path = tmp_path / "price-paid.parquet"
    _price_paid_frame(
        prices=[200_000, 250_000, 5_000_000],
        dates=[date(2020, 2, 3), date(2022, 2, 3), date(2024, 2, 3)],
        # The latest (5M) sale is a Category B bulk/portfolio transfer.
        ppd_categories=["A", "A", "B"],
    ).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)

    assert df.height == 1
    # Only the two Category A sales survive; the 5M Category B transfer is dropped.
    assert df.get_column("latest_price").to_list() == [250_000]
    assert df.get_column("historical_prices").list.len().to_list() == [2]
|
|
|
|
|
|
def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: Path):
    # A new-build whose earliest sale is below MIN_PRICE must still take that early
    # year as its EXACT construction date, while latest_price uses only the
    # quality-passing (>=MIN_PRICE) sale.
    zip_path = tmp_path / "domestic-csv.zip"
    # Default single-certificate archive: use the shared helper instead of
    # repeating the zip/CSV boilerplate.
    _write_epc_zip(zip_path)

    price_paid_path = tmp_path / "price-paid.parquet"
    # Built inline: _price_paid_frame always sets old_new to "N", whereas this
    # test needs new-build ("Y") sales.
    pl.DataFrame(
        {
            # 5_000 is below MIN_PRICE (10_000) — a nominal/junk transfer that
            # must still anchor the construction year but stay out of the price
            # aggregations.
            "price": [5_000, 300_000],
            "date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
            "property_type": ["T", "T"],
            "postcode": ["AA1 1AA", "AA1 1AA"],
            "paon": ["1", "1"],
            "saon": [None, None],
            "street": ["Example Street", "Example Street"],
            "locality": [None, None],
            "town_city": ["Exampletown", "Exampletown"],
            "duration": ["F", "F"],
            "old_new": ["Y", "Y"],
            "ppd_category": ["A", "A"],
        }
    ).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)

    assert df.height == 1
    # Construction year is the genuine earliest transfer (2015), flagged EXACT,
    # even though that sale is below MIN_PRICE.
    assert df.get_column("construction_age_band").to_list() == [2015]
    assert df.get_column("is_construction_date_approximate").to_list() == [0]
    # latest_price uses only the >=MIN_PRICE sale; the sub-MIN sale is excluded.
    assert df.get_column("latest_price").to_list() == [300_000]
    assert df.get_column("historical_prices").list.len().to_list() == [1]
|
|
|
|
|
|
def test_run_keeps_sale_above_lowered_min_price(tmp_path: Path):
    # A genuine cheap sale of 30_000 sits between the OLD floor (50k) and the
    # NEW floor (10k): it must now be RETAINED in the price aggregations. This
    # pins the 50k->10k change — it fails on the pre-fix 50k floor (where 30k was
    # excluded, giving historical_prices length 1 / latest_price 250_000).
    zip_path = tmp_path / "domestic-csv.zip"
    # Both fixtures match the shared helpers exactly, so use them instead of
    # repeating the zip and frame boilerplate.
    _write_epc_zip(zip_path)

    price_paid_path = tmp_path / "price-paid.parquet"
    _price_paid_frame(
        prices=[250_000, 30_000],
        dates=[date(2018, 2, 3), date(2022, 2, 3)],
    ).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)

    assert df.height == 1
    # Both sales now survive the 10k floor; the 30_000 (2022) is the most recent.
    assert df.get_column("historical_prices").list.len().to_list() == [2]
    assert df.get_column("latest_price").to_list() == [30_000]
|
|
|
|
|
|
def _write_epc_zip(zip_path: Path) -> None:
    """Create ``zip_path`` holding one certificates CSV with the default row.

    The archive has a single member, ``certificates-2024.csv``, made of the
    ``EPC_SOURCE_COLUMNS`` header followed by the record from ``_row()``.
    """
    buffer = io.StringIO()
    certificate_writer = csv.DictWriter(buffer, fieldnames=EPC_SOURCE_COLUMNS)
    certificate_writer.writeheader()
    certificate_writer.writerow(_row())
    csv_payload = buffer.getvalue()

    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", csv_payload)
|
|
|
|
|
|
def _price_paid_frame(
    prices: list[int],
    dates: list[date],
    ppd_categories: list[str] | None = None,
) -> pl.DataFrame:
    """One property ("1 Example Street, AA1 1AA") with the given sales.

    Args:
        prices: Sale prices, one per transaction.
        dates: Transfer dates, aligned with ``prices``.
        ppd_categories: Price-paid category per transaction; every sale is
            category "A" when omitted.

    Returns:
        A frame with one row per sale; all other columns hold the same fixed
        value on every row.
    """
    count = len(prices)
    # Only a missing argument selects the default. An explicitly passed list
    # (even an empty one) is used as given, so a caller mistake is not masked
    # by silently turning every sale into category "A".
    if ppd_categories is None:
        ppd_categories = ["A"] * count
    return pl.DataFrame(
        {
            "price": prices,
            "date_of_transfer": dates,
            "property_type": ["T"] * count,
            "postcode": ["AA1 1AA"] * count,
            "paon": ["1"] * count,
            "saon": [None] * count,
            "street": ["Example Street"] * count,
            "locality": [None] * count,
            "town_city": ["Exampletown"] * count,
            "duration": ["F"] * count,
            "old_new": ["N"] * count,
            "ppd_category": ppd_categories,
        }
    )
|
|
|
|
|
|
def test_run_collapses_duplicate_transactions(tmp_path: Path):
    # Price-paid lodges the same completed sale twice under distinct
    # transaction ids; the duplicate must appear ONCE in historical_prices
    # rather than double-counting the sale.
    epc_zip = tmp_path / "domestic-csv.zip"
    sales_parquet = tmp_path / "price-paid.parquet"
    joined_parquet = tmp_path / "epc-pp.parquet"

    _write_epc_zip(epc_zip)
    sales = _price_paid_frame(
        prices=[200_000, 250_000, 250_000],
        dates=[date(2020, 2, 3), date(2024, 2, 3), date(2024, 2, 3)],
    )
    sales.write_parquet(sales_parquet)

    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)
    joined = pl.read_parquet(joined_parquet)

    assert joined.height == 1
    # The duplicated 250_000 sale collapses to one entry; two distinct sales.
    expected_history = [
        {"year": 2020, "month": 2, "price": 200_000},
        {"year": 2024, "month": 2, "price": 250_000},
    ]
    assert joined.get_column("historical_prices").to_list() == [expected_history]
    assert joined.get_column("latest_price").to_list() == [250_000]
|
|
|
|
|
|
def test_run_excludes_implausible_price_jump_but_keeps_property(tmp_path: Path):
    # The 13 QUICKSETTS HR2 7PP case: £140,000 in 2016 then "£207,500,000" in
    # 2026 (clearly £207,500 with extra digits, lodged as category A). The
    # garbage sale must vanish from latest_price / historical_prices while the
    # property row itself survives on its genuine sale.
    epc_zip = tmp_path / "domestic-csv.zip"
    sales_parquet = tmp_path / "price-paid.parquet"
    joined_parquet = tmp_path / "epc-pp.parquet"

    _write_epc_zip(epc_zip)
    sales = _price_paid_frame(
        prices=[140_000, 207_500_000],
        dates=[date(2016, 6, 1), date(2026, 6, 1)],
    )
    sales.write_parquet(sales_parquet)

    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)
    joined = pl.read_parquet(joined_parquet)

    assert joined.height == 1
    assert joined.get_column("latest_price").to_list() == [140_000]
    genuine_sale = {"year": 2016, "month": 6, "price": 140_000}
    assert joined.get_column("historical_prices").to_list() == [[genuine_sale]]
|
|
|
|
|
|
def test_run_keeps_genuine_long_horizon_appreciation(tmp_path: Path):
    # x30 over 31 years is extreme but genuine (prime-London territory); the
    # time-aware threshold (12 * 1.10**31 ≈ 230) must leave it untouched.
    epc_zip = tmp_path / "domestic-csv.zip"
    sales_parquet = tmp_path / "price-paid.parquet"
    joined_parquet = tmp_path / "epc-pp.parquet"

    _write_epc_zip(epc_zip)
    sales = _price_paid_frame(
        prices=[20_000, 600_000],
        dates=[date(1995, 3, 1), date(2026, 3, 1)],
    )
    sales.write_parquet(sales_parquet)

    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)
    joined = pl.read_parquet(joined_parquet)

    assert joined.height == 1
    sale_counts = joined.get_column("historical_prices").list.len().to_list()
    assert sale_counts == [2]
    assert joined.get_column("latest_price").to_list() == [600_000]
|
|
|
|
|
|
def test_run_keeps_right_to_buy_style_jump(tmp_path: Path):
    # A x12 jump on a cheap property (discounted right-to-buy purchase then an
    # open-market resale) is legitimate; the JUMP_MIN_PRICE floor keeps such
    # sales safe from the jump guard.
    epc_zip = tmp_path / "domestic-csv.zip"
    sales_parquet = tmp_path / "price-paid.parquet"
    joined_parquet = tmp_path / "epc-pp.parquet"

    _write_epc_zip(epc_zip)
    sales = _price_paid_frame(
        prices=[15_000, 180_000],
        dates=[date(1998, 5, 1), date(2003, 5, 1)],
    )
    sales.write_parquet(sales_parquet)

    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)
    joined = pl.read_parquet(joined_parquet)

    assert joined.height == 1
    sale_counts = joined.get_column("historical_prices").list.len().to_list()
    assert sale_counts == [2]
    assert joined.get_column("latest_price").to_list() == [180_000]
|
|
|
|
|
|
def _slim_sales(rows: list[tuple[str, date, int]]) -> pl.DataFrame:
    """Build the four-column sales frame passed to ``flag_price_outliers`` here.

    Args:
        rows: ``(address, transfer date, price)`` triples, one per sale.

    Returns:
        A frame with ``_pp_group_address``, ``_pp_group_postcode`` (the same
        "AA11AA" on every row), ``date_of_transfer`` and ``price`` columns.
    """
    addresses = []
    transfer_dates = []
    prices = []
    for address, transfer_date, price in rows:
        addresses.append(address)
        transfer_dates.append(transfer_date)
        prices.append(price)

    columns = {
        "_pp_group_address": addresses,
        "_pp_group_postcode": ["AA11AA" for _ in rows],
        "date_of_transfer": transfer_dates,
        "price": prices,
    }
    return pl.DataFrame(columns)
|
|
|
|
|
|
def test_flag_price_outliers_up_rule_flags_spike_after_normal_sale():
    # x1,482 over 10 years against a threshold of 12 * 1.10**10 ≈ 31: the
    # HIGHER sale is flagged, the genuine earlier sale is left alone.
    genuine_sale = ("13 QUICKSETTS", date(2016, 6, 1), 140_000)
    spiked_sale = ("13 QUICKSETTS", date(2026, 6, 1), 207_500_000)

    flagged = flag_price_outliers(_slim_sales([genuine_sale, spiked_sale]))

    expected_flag = {
        "_pp_group_address": "13 QUICKSETTS",
        "_pp_group_postcode": "AA11AA",
        "date_of_transfer": date(2026, 6, 1),
        "price": 207_500_000,
        "_price_outlier": True,
    }
    assert flagged.to_dicts() == [expected_flag]
|
|
|
|
|
|
def test_flag_price_outliers_down_rule_flags_spike_before_normal_sale():
    # The garbage sale comes FIRST, so it has no previous sale to compare
    # against; the down rule (next sale collapses to under 1/threshold of this
    # one) must catch it instead.
    spiked_sale = ("5 EXAMPLE ROAD", date(2016, 6, 1), 250_000_000)
    genuine_sale = ("5 EXAMPLE ROAD", date(2017, 6, 1), 140_000)

    flagged = flag_price_outliers(_slim_sales([spiked_sale, genuine_sale]))

    flagged_prices = flagged.get_column("price").to_list()
    assert flagged_prices == [250_000_000]
|
|
|
|
|
|
def test_flag_price_outliers_min_price_floor_protects_cheap_properties():
    # x40 in under six months exceeds the relative threshold (~12.6 at the
    # half-year floor), but the flagged price (600k) is below JUMP_MIN_PRICE,
    # so nothing is flagged: the absolute floor is load-bearing here.
    cheap_sale = ("9 CHEAP STREET", date(2000, 1, 1), 15_000)
    resale = ("9 CHEAP STREET", date(2000, 6, 1), 600_000)

    flagged = flag_price_outliers(_slim_sales([cheap_sale, resale]))

    assert flagged.height == 0
|
|
|
|
|
|
def test_flag_price_outliers_spares_expensive_long_horizon_growth():
    # x30 over 31 years on a now-£4.5M property clears the £2M floor but stays
    # under the time-aware threshold (12 * 1.10**31 ≈ 230): not flagged.
    early_sale = ("1 PRIME PLACE", date(1995, 1, 1), 150_000)
    recent_sale = ("1 PRIME PLACE", date(2026, 1, 1), 4_500_000)

    flagged = flag_price_outliers(_slim_sales([early_sale, recent_sale]))

    assert flagged.height == 0
|
|
|
|
|
|
def test_epc_band_to_year_uses_midpoint_and_clamps():
    # Each input band is paired with the year the test expects back; see the
    # per-row notes for why a row maps to a year or to null.
    #
    # polars is already imported at module level as ``pl``, so it is not
    # re-imported here; only the function under test is imported locally.
    from pipeline.transform.join_epc_pp import epc_band_to_year

    df = pl.DataFrame(
        {
            "b": [
                "England and Wales: 1950-1966",  # midpoint 1958
                "1900-1929",  # midpoint 1914
                "England and Wales: before 1900",  # too wide -> null
                "2012 onwards",  # single year
                "1012",  # implausible -> null
                "2202",  # implausible -> null
                None,  # null -> null
                "1958",  # already-numeric-as-string -> pass through
            ]
        }
    )
    years = df.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()
    assert years == [1958, 1914, None, 2012, None, None, None, 1958]
|