"""Tests for the floor-area-less estimate guard in estimate.py.

The per-sqm plausibility guard cannot fire when floor area is null/zero, which
let commercial blocks misfiled as dwellings keep absurd headline estimates
(e.g. a GBP 175M "Detached" in SW1W). apply_floorless_estimate_guard nulls a
floorless estimate only when it exceeds max(FLOORLESS_ESTIMATE_P99_MULT x the
district's recent p99 sale price, FLOORLESS_ESTIMATE_MIN_CAP), and leaves
rows it cannot judge (no recent district sales) alone.
"""
|
|
|
|
from datetime import date
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.transform.price_estimation.estimate import (
|
|
FLOORLESS_P99_LOOKBACK_YEARS,
|
|
apply_floorless_estimate_guard,
|
|
)
|
|
from pipeline.transform.price_estimation.utils import CURRENT_YEAR
|
|
|
|
# Reference transaction dates for pool rows, both pinned to 1 June.
# RECENT is last year, i.e. inside the p99 look-back window.
RECENT = date(year=CURRENT_YEAR - 1, month=6, day=1)
# STALE sits five years beyond the look-back horizon, i.e. outside the window.
STALE = date(
    year=CURRENT_YEAR - (FLOORLESS_P99_LOOKBACK_YEARS + 5),
    month=6,
    day=1,
)
|
|
|
|
|
|
def _guard_input(rows):
    """Build the polars frame the floorless guard is exercised with.

    Each element of ``rows`` is a tuple laid out as
    (id, sector, estimate, floor_area, last_price, last_date), matching the
    column order of the schema below. Rows whose estimate is null act as
    "pool" rows: they only feed the per-district p99 reference.
    """
    # Column order here must match the tuple layout described above.
    guard_schema = {
        "id": pl.Int64,
        "_sector": pl.String,
        "Estimated current price": pl.Float64,
        "Total floor area (sqm)": pl.Float64,
        "Last known price": pl.Float64,
        "Date of last transaction": pl.Date,
    }
    return pl.DataFrame(rows, schema=guard_schema, orient="row")
|
|
|
|
|
|
def _estimate_for(result: pl.DataFrame, row_id: int):
    """Return the "Estimated current price" of the single row with ``id == row_id``.

    Args:
        result: Frame returned by the guard; must carry the ``id`` and
            ``Estimated current price`` columns.
        row_id: Value of the ``id`` column identifying the row to read.

    Returns:
        The estimate as a Python float, or ``None`` when the value is null.

    Raises:
        AssertionError: If zero or several rows carry ``row_id``. Without this
            check a missing id surfaced as a bare ``IndexError`` and a
            duplicated id silently returned only the first match, which could
            hide a fixture mistake.
    """
    matches = result.filter(pl.col("id") == row_id)
    # Fail loudly on a bad fixture instead of reading an arbitrary row.
    assert matches.height == 1, (
        f"expected exactly one row with id={row_id}, found {matches.height}"
    )
    return matches["Estimated current price"][0]
|
|
|
|
|
|
def test_floorless_guard_nulls_and_keeps_the_right_rows():
    """Run the guard once over a mixed frame and check every outcome.

    Rows with a null estimate are pool rows that only supply recent sale
    prices for the district p99 reference; rows with ids 1-8 are the cases
    under test.

    NOTE(review): the expectations hard-code a 2 x p99 multiplier and a 2M
    absolute minimum cap -- presumably the current values of
    FLOORLESS_ESTIMATE_P99_MULT and FLOORLESS_ESTIMATE_MIN_CAP in estimate.py;
    confirm if those constants change.
    """
    # SW1W pool: 5 recent sales at 3M -> district p99 = 3M, cap = 6M.
    sw1w_pool = [
        (100 + i, "SW1W 9", None, None, 3_000_000.0, RECENT) for i in range(5)
    ]
    # ZZ1 pool: cheap district, p99 = 500k -> cap = max(1M, 2M) = 2M.
    zz1_pool = [
        (200 + i, "ZZ1 4", None, None, 500_000.0, RECENT) for i in range(5)
    ]

    rows = [
        *sw1w_pool,
        # 175M with no floor area, roughly 29x the 6M cap -> nulled.
        (1, "SW1W 9", 175_000_000.0, None, None, None),
        # A zero floor area is treated as floorless too (the psm guard
        # cannot fire on it) -> nulled.
        (2, "SW1W 8", 175_000_000.0, 0.0, None, None),
        # 5M with no floor area stays under the 2 x p99 cap -> kept.
        (3, "SW1W 9", 5_000_000.0, None, None, None),
        # Floor area is present, so this guard must leave the row alone no
        # matter how absurd the estimate; the per-sqm guard owns that case.
        (4, "SW1W 9", 175_000_000.0, 93.0, None, None),
        *zz1_pool,
        # Genuine mansion in a cheap district: over 2 x p99 yet under the
        # absolute 2M minimum cap -> kept.
        (5, "ZZ1 4", 1_500_000.0, None, None, None),
        # Over both the absolute minimum cap and 2 x p99 -> nulled.
        (6, "ZZ1 4", 2_500_000.0, None, None, None),
        # The only XX9 sale predates the look-back window -> null p99 ->
        # the guard cannot judge -> kept, even at 50M.
        (300, "XX9 1", None, None, 4_000_000.0, STALE),
        (7, "XX9 1", 50_000_000.0, None, None, None),
        # No sector means no district reference at all -> kept.
        (8, None, 50_000_000.0, None, None, None),
    ]

    guarded = apply_floorless_estimate_guard(_guard_input(rows))

    # id -> estimate expected after the guard; None means the guard nulled it.
    expected = {
        1: None,
        2: None,
        3: 5_000_000.0,
        4: 175_000_000.0,
        5: 1_500_000.0,
        6: None,
        7: 50_000_000.0,
        8: 50_000_000.0,
    }
    actual = {row_id: _estimate_for(guarded, row_id) for row_id in expected}
    assert actual == expected
|
|
|
|
|
|
def test_floorless_guard_preserves_schema_and_rows():
    """The guard must only ever change values in the estimate column.

    It adds no columns, drops no rows, and leaves every non-estimate column
    untouched (it runs in-pipeline before temp-column dropping).
    """
    estimate_col = "Estimated current price"
    before = _guard_input(
        [
            (1, "SW1W 9", None, None, 3_000_000.0, RECENT),
            (2, "SW1W 9", 175_000_000.0, None, None, None),
        ]
    )

    after = apply_floorless_estimate_guard(before)

    # Same columns in the same order, same row count, same row order.
    assert after.columns == before.columns
    assert len(after) == len(before)
    assert after["id"].to_list() == before["id"].to_list()

    # Everything except the estimate column must be identical.
    others_after = after.drop(estimate_col)
    others_before = before.drop(estimate_col)
    assert others_after.equals(others_before)
|