Fable findings in data
This commit is contained in:
parent
b98bc6d611
commit
6a33b03fdf
20 changed files with 1502 additions and 274 deletions
|
|
@ -11,6 +11,7 @@ from pipeline.transform.join_epc_pp import (
|
|||
_join_address_parts,
|
||||
_run,
|
||||
_scan_epc_certificates,
|
||||
flag_price_outliers,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -261,6 +262,9 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
]
|
||||
assert df.get_column("renovation_history").list.len().to_list() == [1]
|
||||
assert df.get_column("historical_prices").list.len().to_list() == [2]
|
||||
# Audit trail: the accepted fuzzy match's score is published (100 = exact
|
||||
# post-normalisation address match).
|
||||
assert df.get_column("epc_match_score").to_list() == [100]
|
||||
|
||||
|
||||
def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
|
||||
|
|
@ -395,12 +399,15 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
|
|||
"epc_address",
|
||||
"total_floor_area",
|
||||
"current_energy_rating",
|
||||
"epc_match_score",
|
||||
).to_dicts() == [
|
||||
{
|
||||
"pp_address": "1 Example Street",
|
||||
"epc_address": None,
|
||||
"total_floor_area": None,
|
||||
"current_energy_rating": None,
|
||||
# No accepted match -> no score.
|
||||
"epc_match_score": None,
|
||||
}
|
||||
]
|
||||
|
||||
|
|
@ -537,6 +544,222 @@ def test_run_keeps_sale_above_lowered_min_price(tmp_path: Path):
|
|||
assert df.get_column("latest_price").to_list() == [30_000]
|
||||
|
||||
|
||||
def _write_epc_zip(zip_path: Path) -> None:
    """Create a deflated zip at ``zip_path`` holding one certificates CSV.

    The CSV has the ``EPC_SOURCE_COLUMNS`` header followed by the single
    row produced by ``_row()``, stored as ``certificates-2024.csv``.
    """
    with zipfile.ZipFile(
        zip_path, mode="w", compression=zipfile.ZIP_DEFLATED
    ) as bundle:
        # Render the CSV in memory so it can be stored as one archive member.
        text = io.StringIO()
        rows = csv.DictWriter(text, fieldnames=EPC_SOURCE_COLUMNS)
        rows.writeheader()
        rows.writerow(_row())
        bundle.writestr("certificates-2024.csv", text.getvalue())
|
||||
|
||||
|
||||
def _price_paid_frame(
    prices: list[int],
    dates: list[date],
    ppd_categories: list[str] | None = None,
) -> pl.DataFrame:
    """Build a price-paid frame for one property, one row per sale.

    Every row describes "1 Example Street, AA1 1AA"; only ``price``,
    ``date_of_transfer`` and (optionally) ``ppd_category`` vary per sale.
    When ``ppd_categories`` is omitted or empty, every sale gets
    category "A".
    """
    n_sales = len(prices)

    def repeated(value):
        # Same constant for every sale row.
        return [value] * n_sales

    columns = {
        "price": prices,
        "date_of_transfer": dates,
        "property_type": repeated("T"),
        "postcode": repeated("AA1 1AA"),
        "paon": repeated("1"),
        "saon": repeated(None),
        "street": repeated("Example Street"),
        "locality": repeated(None),
        "town_city": repeated("Exampletown"),
        "duration": repeated("F"),
        "old_new": repeated("N"),
        "ppd_category": ppd_categories or repeated("A"),
    }
    return pl.DataFrame(columns)
|
||||
|
||||
|
||||
def test_run_collapses_duplicate_transactions(tmp_path: Path):
    # The price-paid input lodges the same completed sale twice (identical
    # price and date). It must show up ONCE in historical_prices instead of
    # being double-counted.
    epc_zip = tmp_path / "domestic-csv.zip"
    sales_parquet = tmp_path / "price-paid.parquet"
    joined_parquet = tmp_path / "epc-pp.parquet"

    _write_epc_zip(epc_zip)

    repeated_sale_date = date(2024, 2, 3)
    sales = _price_paid_frame(
        prices=[200_000, 250_000, 250_000],
        dates=[date(2020, 2, 3), repeated_sale_date, repeated_sale_date],
    )
    sales.write_parquet(sales_parquet)

    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)
    result = pl.read_parquet(joined_parquet)

    assert result.height == 1

    # Two distinct sales remain: the duplicated 250_000 entry is collapsed.
    expected_history = [
        {"year": 2020, "month": 2, "price": 200_000},
        {"year": 2024, "month": 2, "price": 250_000},
    ]
    assert result.get_column("historical_prices").to_list() == [expected_history]
    assert result.get_column("latest_price").to_list() == [250_000]
|
||||
|
||||
|
||||
def test_run_excludes_implausible_price_jump_but_keeps_property(tmp_path: Path):
    # Modelled on the 13 QUICKSETTS HR2 7PP case: £140,000 in 2016 followed by
    # "£207,500,000" in 2026 (evidently £207,500 with extra digits, lodged as
    # category A). The garbage sale must be dropped from both latest_price and
    # historical_prices, while the property row survives on its genuine sale.
    epc_zip = tmp_path / "domestic-csv.zip"
    sales_parquet = tmp_path / "price-paid.parquet"
    joined_parquet = tmp_path / "epc-pp.parquet"

    _write_epc_zip(epc_zip)

    sales = _price_paid_frame(
        prices=[140_000, 207_500_000],
        dates=[date(2016, 6, 1), date(2026, 6, 1)],
    )
    sales.write_parquet(sales_parquet)

    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)
    result = pl.read_parquet(joined_parquet)

    assert result.height == 1
    assert result.get_column("latest_price").to_list() == [140_000]

    genuine_sale = {"year": 2016, "month": 6, "price": 140_000}
    assert result.get_column("historical_prices").to_list() == [[genuine_sale]]
|
||||
|
||||
|
||||
def test_run_keeps_genuine_long_horizon_appreciation(tmp_path: Path):
    # A x30 rise across 31 years is extreme yet genuine (prime-London
    # territory). It sits well under the time-aware threshold
    # (12 * 1.10**31 ≈ 230), so both sales must be kept.
    epc_zip = tmp_path / "domestic-csv.zip"
    sales_parquet = tmp_path / "price-paid.parquet"
    joined_parquet = tmp_path / "epc-pp.parquet"

    _write_epc_zip(epc_zip)

    sales = _price_paid_frame(
        prices=[20_000, 600_000],
        dates=[date(1995, 3, 1), date(2026, 3, 1)],
    )
    sales.write_parquet(sales_parquet)

    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)
    result = pl.read_parquet(joined_parquet)

    assert result.height == 1

    history_lengths = result.get_column("historical_prices").list.len().to_list()
    assert history_lengths == [2]
    assert result.get_column("latest_price").to_list() == [600_000]
|
||||
|
||||
|
||||
def test_run_keeps_right_to_buy_style_jump(tmp_path: Path):
    # A x12 jump on a cheap property (a discounted right-to-buy purchase
    # followed by an open-market resale) is legitimate. The JUMP_MIN_PRICE
    # floor is what shields such sales from the jump guard.
    epc_zip = tmp_path / "domestic-csv.zip"
    sales_parquet = tmp_path / "price-paid.parquet"
    joined_parquet = tmp_path / "epc-pp.parquet"

    _write_epc_zip(epc_zip)

    sales = _price_paid_frame(
        prices=[15_000, 180_000],
        dates=[date(1998, 5, 1), date(2003, 5, 1)],
    )
    sales.write_parquet(sales_parquet)

    _run(epc_zip, sales_parquet, joined_parquet, tmp_path)
    result = pl.read_parquet(joined_parquet)

    assert result.height == 1

    history_lengths = result.get_column("historical_prices").list.len().to_list()
    assert history_lengths == [2]
    assert result.get_column("latest_price").to_list() == [180_000]
|
||||
|
||||
|
||||
def _slim_sales(rows: list[tuple[str, date, int]]) -> pl.DataFrame:
    """Build a minimal sales frame from ``(address, transfer_date, price)`` rows.

    Each tuple becomes one row; every row shares the postcode "AA11AA".
    """
    addresses = []
    transfer_dates = []
    sale_prices = []
    for address, transfer_date, price in rows:
        addresses.append(address)
        transfer_dates.append(transfer_date)
        sale_prices.append(price)

    columns = {
        "_pp_group_address": addresses,
        "_pp_group_postcode": ["AA11AA"] * len(rows),
        "date_of_transfer": transfer_dates,
        "price": sale_prices,
    }
    return pl.DataFrame(columns)
|
||||
|
||||
|
||||
def test_flag_price_outliers_up_rule_flags_spike_after_normal_sale():
    # The later sale is x1,482 the earlier one after 10 years, against a
    # threshold of 12 * 1.10**10 ≈ 31. Only the HIGHER sale is flagged; the
    # genuine earlier sale is left alone.
    sales = _slim_sales(
        [
            ("13 QUICKSETTS", date(2016, 6, 1), 140_000),
            ("13 QUICKSETTS", date(2026, 6, 1), 207_500_000),
        ]
    )

    flagged = flag_price_outliers(sales)

    expected_row = {
        "_pp_group_address": "13 QUICKSETTS",
        "_pp_group_postcode": "AA11AA",
        "date_of_transfer": date(2026, 6, 1),
        "price": 207_500_000,
        "_price_outlier": True,
    }
    assert flagged.to_dicts() == [expected_row]
|
||||
|
||||
|
||||
def test_flag_price_outliers_down_rule_flags_spike_before_normal_sale():
    # Here the garbage sale comes FIRST, so there is no previous sale to
    # compare it with. The down rule (the next sale collapses to under
    # 1/threshold of this one) has to catch it instead.
    sales = _slim_sales(
        [
            ("5 EXAMPLE ROAD", date(2016, 6, 1), 250_000_000),
            ("5 EXAMPLE ROAD", date(2017, 6, 1), 140_000),
        ]
    )

    flagged = flag_price_outliers(sales)

    assert flagged.get_column("price").to_list() == [250_000_000]
|
||||
|
||||
|
||||
def test_flag_price_outliers_min_price_floor_protects_cheap_properties():
    # x40 in under six months is above the relative threshold (~12.6 at the
    # half-year floor), yet the candidate price (600k) is below
    # JUMP_MIN_PRICE, so nothing is flagged: the absolute floor is
    # load-bearing here.
    sales = _slim_sales(
        [
            ("9 CHEAP STREET", date(2000, 1, 1), 15_000),
            ("9 CHEAP STREET", date(2000, 6, 1), 600_000),
        ]
    )

    flagged = flag_price_outliers(sales)

    assert flagged.height == 0
|
||||
|
||||
|
||||
def test_flag_price_outliers_spares_expensive_long_horizon_growth():
    # x30 across 31 years on a now-£4.5M property clears the £2M floor but
    # stays under the time-aware threshold (12 * 1.10**31 ≈ 230), so it is
    # not flagged.
    sales = _slim_sales(
        [
            ("1 PRIME PLACE", date(1995, 1, 1), 150_000),
            ("1 PRIME PLACE", date(2026, 1, 1), 4_500_000),
        ]
    )

    flagged = flag_price_outliers(sales)

    assert flagged.height == 0
|
||||
|
||||
|
||||
def test_epc_band_to_year_uses_midpoint_and_clamps():
|
||||
import polars as pl
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue