# perfect-postcode/pipeline/transform/price_estimation/test_estimate.py

"""Tests for the floor-area-less estimate guard in estimate.py.

The per-sqm plausibility guard cannot fire when floor area is null/zero, which
let commercial blocks misfiled as dwellings keep absurd headline estimates
(e.g. a GBP 175M "Detached" in SW1W). apply_floorless_estimate_guard nulls a
floorless estimate only when it exceeds max(FLOORLESS_ESTIMATE_P99_MULT x the
district's recent p99 sale price, FLOORLESS_ESTIMATE_MIN_CAP), and leaves
rows it cannot judge (no recent district sales) alone.
"""

from datetime import date

import polars as pl

from pipeline.transform.price_estimation.estimate import (
    FLOORLESS_P99_LOOKBACK_YEARS,
    apply_floorless_estimate_guard,
)
from pipeline.transform.price_estimation.utils import CURRENT_YEAR

# Sale dates used by the pool rows. RECENT is meant to land inside the guard's
# p99 look-back window; STALE sits 5 years beyond FLOORLESS_P99_LOOKBACK_YEARS,
# so it is meant to land outside that window.
RECENT = date(year=CURRENT_YEAR - 1, month=6, day=1)
STALE = date(
    year=CURRENT_YEAR - FLOORLESS_P99_LOOKBACK_YEARS - 5,
    month=6,
    day=1,
)


def _guard_input(rows):
    """Build a guard input frame from row tuples.

    Each tuple is ordered (id, sector, estimate, floor_area, last_price,
    last_date), matching the schema declared below. In these tests, rows with
    a null estimate are "pool" rows: they are there only to supply sale prices
    for the per-district p99 reference.
    """
    guard_schema = {
        "id": pl.Int64,
        "_sector": pl.String,
        "Estimated current price": pl.Float64,
        "Total floor area (sqm)": pl.Float64,
        "Last known price": pl.Float64,
        "Date of last transaction": pl.Date,
    }
    return pl.DataFrame(rows, schema=guard_schema, orient="row")


def _estimate_for(result: pl.DataFrame, row_id: int):
    """Return the "Estimated current price" of the first row whose id is row_id."""
    matching_rows = result.filter(pl.col("id") == row_id)
    estimates = matching_rows["Estimated current price"]
    return estimates[0]


def test_floorless_guard_nulls_and_keeps_the_right_rows():
    """Run one mixed frame through the guard and check every candidate row.

    Rows with ids >= 100 are pool rows (null estimate) that only provide
    recent sale prices; rows 1-8 carry the estimates under test.
    """
    # SW1W pool: 5 recent sales at 3M -> expected district p99 = 3M, cap = 6M.
    sw1w_pool = [
        (100 + n, "SW1W 9", None, None, 3_000_000.0, RECENT) for n in range(5)
    ]
    sw1w_cases = [
        # 175M with no floor area, ~29x the 6M cap -> should be nulled.
        (1, "SW1W 9", 175_000_000.0, None, None, None),
        # A floor area of zero is treated as floorless too (the per-sqm guard
        # cannot fire on it) -> should be nulled.
        (2, "SW1W 8", 175_000_000.0, 0.0, None, None),
        # 5M floorless stays under the 2 x p99 cap -> should be kept.
        (3, "SW1W 9", 5_000_000.0, None, None, None),
        # Floor area is PRESENT, so this guard must not touch the row no
        # matter how absurd the figure is (that case belongs to the per-sqm
        # guard).
        (4, "SW1W 9", 175_000_000.0, 93.0, None, None),
    ]

    # ZZ1 pool: cheap district, p99 = 500k -> cap = max(1M, 2M) = 2M.
    zz1_pool = [
        (200 + n, "ZZ1 4", None, None, 500_000.0, RECENT) for n in range(5)
    ]
    zz1_cases = [
        # Genuine mansion in a cheap district: over 2 x p99 yet under the
        # absolute 2M floor -> should be kept.
        (5, "ZZ1 4", 1_500_000.0, None, None, None),
        # Over both the absolute floor and 2 x p99 -> should be nulled.
        (6, "ZZ1 4", 2_500_000.0, None, None, None),
    ]

    unjudgeable_cases = [
        # The only XX9 sale falls outside the look-back window -> null p99 ->
        # the guard cannot judge -> kept, even at 50M.
        (300, "XX9 1", None, None, 4_000_000.0, STALE),
        (7, "XX9 1", 50_000_000.0, None, None, None),
        # No sector at all -> no district reference -> kept.
        (8, None, 50_000_000.0, None, None, None),
    ]

    rows = sw1w_pool + sw1w_cases + zz1_pool + zz1_cases + unjudgeable_cases

    result = apply_floorless_estimate_guard(_guard_input(rows))

    # None means the guard nulled the estimate; a number means it survived.
    expected = {
        1: None,
        2: None,
        3: 5_000_000.0,
        4: 175_000_000.0,
        5: 1_500_000.0,
        6: None,
        7: 50_000_000.0,
        8: 50_000_000.0,
    }
    actual = {row_id: _estimate_for(result, row_id) for row_id in expected}
    assert actual == expected


def test_floorless_guard_preserves_schema_and_rows():
    """Check the guard only ever rewrites the estimate column.

    Column names, row count, row order (by id), and every column other than
    "Estimated current price" must come back unchanged. The guard is expected
    to run in-pipeline before temporary columns are dropped, so it must not
    add or remove any itself.
    """
    estimate_col = "Estimated current price"
    pool_row = (1, "SW1W 9", None, None, 3_000_000.0, RECENT)
    floorless_row = (2, "SW1W 9", 175_000_000.0, None, None, None)
    before = _guard_input([pool_row, floorless_row])

    after = apply_floorless_estimate_guard(before)

    assert after.columns == before.columns
    assert len(after) == len(before)
    assert after["id"].to_list() == before["id"].to_list()

    # Everything apart from the estimate column must be identical.
    untouched_before = before.drop(estimate_col)
    untouched_after = after.drop(estimate_col)
    assert untouched_after.equals(untouched_before)