# Source: perfect-postcode/pipeline/transform/price_estimation/test_estimate.py
# (104 lines, 4.2 KiB, Python)

"""Tests for the floor-area-less estimate guard in estimate.py.

The per-sqm plausibility guard cannot fire when floor area is null or zero,
which let commercial blocks misfiled as dwellings keep absurd headline
estimates (e.g. a GBP 175M "Detached" in SW1W).

apply_floorless_estimate_guard nulls a floorless estimate only when it exceeds
max(FLOORLESS_ESTIMATE_P99_MULT x the district's recent p99 sale price,
FLOORLESS_ESTIMATE_MIN_CAP). Rows it cannot judge (no recent district sales)
are left alone.
"""
from datetime import date
import polars as pl
from pipeline.transform.price_estimation.estimate import (
FLOORLESS_P99_LOOKBACK_YEARS,
apply_floorless_estimate_guard,
)
from pipeline.transform.price_estimation.utils import CURRENT_YEAR
RECENT = date(CURRENT_YEAR - 1, 6, 1) # inside the p99 look-back window
STALE = date(CURRENT_YEAR - FLOORLESS_P99_LOOKBACK_YEARS - 5, 6, 1) # outside
def _guard_input(rows):
    """Build the frame handed to the guard from row tuples.

    Each tuple is ordered (id, sector, estimate, floor_area, last_price,
    last_date). Rows with a null estimate act as "pool" rows: they exist only
    to feed the per-district p99 reference price.
    """
    # Explicit dtypes so all-null columns still get the intended type.
    guard_schema = {
        "id": pl.Int64,
        "_sector": pl.String,
        "Estimated current price": pl.Float64,
        "Total floor area (sqm)": pl.Float64,
        "Last known price": pl.Float64,
        "Date of last transaction": pl.Date,
    }
    return pl.DataFrame(rows, schema=guard_schema, orient="row")
def _estimate_for(result: pl.DataFrame, row_id: int):
    """Return the "Estimated current price" of the row whose id is *row_id*.

    The first matching row is used; indexing raises if no row matches.
    """
    matching_rows = result.filter(pl.col("id") == row_id)
    estimates = matching_rows.get_column("Estimated current price")
    return estimates[0]
def test_floorless_guard_nulls_and_keeps_the_right_rows():
    """Push one mixed frame through the guard and check every row's verdict.

    Pool rows (null estimate, recent sale) set each district's p99 reference;
    the numbered rows 1-8 are the estimates under test.
    """
    # SW1W pool: 5 recent sales at 3M -> district p99 = 3M, cap = 6M.
    sw1w_pool = [
        (100 + offset, "SW1W 9", None, None, 3_000_000.0, RECENT)
        for offset in range(5)
    ]
    # ZZ1 pool: cheap district, p99 = 500k -> cap = max(1M, 2M) = 2M.
    zz1_pool = [
        (200 + offset, "ZZ1 4", None, None, 500_000.0, RECENT)
        for offset in range(5)
    ]

    frame = _guard_input(
        [
            *sw1w_pool,
            # 175M floorless estimate, 29x the 6M cap -> nulled.
            (1, "SW1W 9", 175_000_000.0, None, None, None),
            # Zero floor area counts as floorless (psm guard can't fire)
            # -> nulled.
            (2, "SW1W 8", 175_000_000.0, 0.0, None, None),
            # 5M floorless is under the 2 x p99 cap -> kept.
            (3, "SW1W 9", 5_000_000.0, None, None, None),
            # Floor area PRESENT: never touched by this guard, however absurd
            # (the per-sqm guard owns that case).
            (4, "SW1W 9", 175_000_000.0, 93.0, None, None),
            *zz1_pool,
            # Genuine mansion in a cheap district: above 2 x p99 but below
            # the absolute 2M floor -> kept.
            (5, "ZZ1 4", 1_500_000.0, None, None, None),
            # Above both the absolute floor and 2 x p99 -> nulled.
            (6, "ZZ1 4", 2_500_000.0, None, None, None),
            # XX9's only sale is outside the look-back window -> null p99 ->
            # cannot judge -> kept, even at 50M.
            (300, "XX9 1", None, None, 4_000_000.0, STALE),
            (7, "XX9 1", 50_000_000.0, None, None, None),
            # No sector at all -> no district reference -> kept.
            (8, None, 50_000_000.0, None, None, None),
        ]
    )
    guarded = apply_floorless_estimate_guard(frame)

    # None marks an estimate the guard must null; a number must survive as-is.
    expected = {
        1: None,
        2: None,
        3: 5_000_000.0,
        4: 175_000_000.0,
        5: 1_500_000.0,
        6: None,
        7: 50_000_000.0,
        8: 50_000_000.0,
    }
    observed = {row_id: _estimate_for(guarded, row_id) for row_id in expected}
    assert observed == expected
def test_floorless_guard_preserves_schema_and_rows():
    """The guard adds no columns, changes no dtypes, drops or reorders no
    rows, and leaves non-estimate columns untouched (it runs in-pipeline
    before temp-column dropping)."""
    df = _guard_input(
        [
            (1, "SW1W 9", None, None, 3_000_000.0, RECENT),
            (2, "SW1W 9", 175_000_000.0, None, None, None),
        ]
    )
    result = apply_floorless_estimate_guard(df)

    assert result.columns == df.columns
    # Column names alone would miss a dtype change on the estimate column
    # (the one column excluded from the equality check below), so compare the
    # full name -> dtype mapping as well.
    assert result.schema == df.schema
    assert len(result) == len(df)
    # Same ids in the same order: the guard must not reorder rows.
    assert result["id"].to_list() == df["id"].to_list()
    assert result.drop("Estimated current price").equals(
        df.drop("Estimated current price")
    )