Fable findings in data

2026-06-11 07:49:23 +01:00 · 2026-06-11 07:49:23 +01:00 · 6a33b03fdf
commit 6a33b03fdf
parent b98bc6d611
20 changed files with 1502 additions and 274 deletions
--- a/pipeline/transform/price_estimation/estimate.py
+++ b/pipeline/transform/price_estimation/estimate.py
@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.knn import (
 )
 from pipeline.transform.price_estimation.utils import (
    CURRENT_FRAC_YEAR,
+    CURRENT_YEAR,
    MAX_LOG_ADJUSTMENT,
    interpolate_log_index,
    sector_expr,
@ -41,6 +42,87 @@ MIN_KNN_TO_INDEX_RATIO = 0.5
 # only catching outliers.
 MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 20.0

+# Guard for rows with NO usable floor area: the per-sqm plausibility check
+# cannot fire there, which let commercial blocks misfiled as dwellings keep
+# absurd headline estimates (e.g. a GBP 175M "Detached" in SW1W). Without
+# floor area we cannot psm-check, so the only sanity reference left is what
+# the local market actually pays: beyond this multiple of the district's
+# recent 99th-percentile sale price the estimate is unreliable and misleading,
+# so it is nulled rather than shown.
+FLOORLESS_ESTIMATE_P99_MULT = 2.0
+# Absolute floor on the cap: a floorless estimate at or below this value is
+# never nulled (the guard compares with a strict ">"). Genuine mansions in
+# cheap districts can legitimately exceed 2x their district's recent p99, but
+# a sub-GBP 2M estimate is within the plausible single-dwelling range
+# anywhere in the UK, so it survives regardless of the local p99.
+FLOORLESS_ESTIMATE_MIN_CAP = 2_000_000.0
+# Look-back window for the district p99 reference: long enough that thin
+# districts accumulate a usable sale sample, short enough that the reference
+# reflects today's price level rather than a pre-boom one. The guard compares
+# on CALENDAR year (transaction year >= CURRENT_YEAR - this value), so the
+# window covers the current year plus this many preceding calendar years.
+FLOORLESS_P99_LOOKBACK_YEARS = 10
+
+
+def apply_floorless_estimate_guard(df: pl.DataFrame) -> pl.DataFrame:
+    """Blank out estimates on floor-area-less rows that dwarf local sale prices.
+
+    A row's `Estimated current price` is set to null only when ALL hold:
+      - `Total floor area (sqm)` is null or <= 0;
+      - the estimate itself is not null;
+      - the row's district has a non-null reference price (see below);
+      - the estimate is strictly greater than
+        max(FLOORLESS_ESTIMATE_P99_MULT * reference,
+        FLOORLESS_ESTIMATE_MIN_CAP).
+
+    The reference is built from `df` itself: per district, the 0.99 quantile
+    of non-null `Last known price` over rows whose `Date of last transaction`
+    year is >= CURRENT_YEAR - FLOORLESS_P99_LOOKBACK_YEARS. A district with no
+    such rows gets no reference after the left join, so its estimates are
+    left as they are.
+
+    Requires the `_sector` helper column. Prints how many estimates were
+    nulled; the temporary `_district` / `_district_p99` columns are dropped
+    before returning.
+    """
+    est_col = pl.col("Estimated current price")
+    area_col = pl.col("Total floor area (sqm)")
+    price_col = pl.col("Last known price")
+    p99_col = pl.col("_district_p99")
+
+    # District key: the sector with its trailing digit group stripped
+    # ("SW1W 9" -> "SW1W").
+    district_key = (
+        pl.col("_sector").str.replace(r"\s+\d+$", "").alias("_district")
+    )
+
+    # Per-district reference price from recent, priced sales only.
+    first_year = CURRENT_YEAR - FLOORLESS_P99_LOOKBACK_YEARS
+    recent_sales = df.lazy().filter(
+        price_col.is_not_null(),
+        pl.col("Date of last transaction").dt.year() >= first_year,
+    )
+    reference = (
+        recent_sales.group_by(district_key)
+        .agg(price_col.cast(pl.Float64).quantile(0.99).alias("_district_p99"))
+        .collect()
+    )
+
+    # Left join keeps every input row (in input order); districts without a
+    # reference simply carry a null `_district_p99`.
+    df = df.with_columns(district_key).join(
+        reference, on="_district", how="left", maintain_order="left"
+    )
+
+    no_floor_area = area_col.is_null() | (area_col <= 0)
+    ceiling = pl.max_horizontal(
+        FLOORLESS_ESTIMATE_P99_MULT * p99_col,
+        pl.lit(FLOORLESS_ESTIMATE_MIN_CAP),
+    )
+    implausible = (
+        est_col.is_not_null()
+        & no_floor_area
+        & p99_col.is_not_null()
+        & (est_col > ceiling)
+    )
+
+    n_nulled = df.select(implausible.sum()).item()
+    print(f"  Floorless-estimate guard: nulled {n_nulled:,} estimates")
+
+    guarded = pl.when(implausible).then(None).otherwise(est_col)
+    return df.with_columns(guarded.alias("Estimated current price")).drop(
+        "_district", "_district_p99"
+    )
+

 def guarded_blend_estimates(
    index_est: np.ndarray,
@ -249,9 +331,16 @@ def main():
        .alias("Estimated current price"),
    )

+    # Floor-area-less rows escape the per-sqm guard above entirely; cap them
+    # against their district's recent sale prices instead (see
+    # apply_floorless_estimate_guard). Must run before temp columns
+    # (_sector) are dropped.
+    df = apply_floorless_estimate_guard(df)
+
    # Derive estimated price per sqm where both estimated price and floor area
    # exist. Now that the implausible-psm estimates are nulled above, the band
-    # filter here mainly guards the floor-area>0 case.
+    # filter here mainly guards the floor-area>0 case. (The floorless guard
+    # never touches floor-area-present rows, so this derivation is unaffected.)
    _est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
    df = df.with_columns(
        pl.when(
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -17,11 +17,13 @@ from scipy.sparse.linalg import lsqr
 from tqdm import tqdm

 from pipeline.transform.price_estimation.shrinkage import (
+    MAX_STEP_DEVIATION_PER_YEAR,
    blend_dicts,
    hierarchical_shrinkage,
    lift_onto_parent,
    shrink_dicts,
    spatial_smooth,
+    winsorize_steps,
 )
 from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
@ -485,8 +487,20 @@ def build_index(
        input_path, min_year, max_year, max_sale_year=estimation_cap
    )

-    # Precompute hierarchy
-    all_sectors = pairs["sector"].unique().to_list()
+    # Precompute hierarchy. The sector universe is the UNION of sectors with
+    # repeat-sale pairs and every sector in the postcode universe (centroids
+    # is keyed by every sector derived from postcode.parquet): a sector whose
+    # properties never resold still gets a full index row via the district ->
+    # area -> national fallback in hierarchical_shrinkage (then spatial
+    # smoothing and forward fill). Restricting the universe to pairs-only
+    # sectors silently dropped ~15% of live sectors from the output, nulling
+    # every per-sector lookup and estimate there. n_pairs = 0 marks the
+    # synthesised cells.
+    all_sectors = sorted(set(pairs["sector"].unique().to_list()) | set(centroids))
+    if sectors is not None:
+        # Debug scoping restricts the universe too, not just the pairs.
+        scoped = set(sectors)
+        all_sectors = [s for s in all_sectors if s in scoped]
    sector_to_dist = {}
    dist_to_area = {}
    for s in all_sectors:
@ -562,10 +576,23 @@ def build_index(
            sector_shrunk, centroids, sector_n, blend_dicts
        )

-        # Forward fill
+        # Winsorise per-year steps against the national index, then forward
+        # fill. The support-scaled smoothness prior still under-penalises
+        # years identified by 1-2 pairs in thin early histories (observed:
+        # x9.7 single-year jumps in city-centre regeneration sectors);
+        # clamping each step to within +/-MAX_STEP_DEVIATION_PER_YEAR of the
+        # national move over the same span removes those artefacts while
+        # leaving genuine sector-vs-national divergence (well inside the
+        # band) untouched.
        for sec in all_sectors:
            sector_smoothed[sec] = forward_fill(
-                sector_smoothed.get(sec, hedonic_idx), min_year, max_year
+                winsorize_steps(
+                    sector_smoothed.get(sec, hedonic_idx),
+                    national_shrunk,
+                    MAX_STEP_DEVIATION_PER_YEAR,
+                ),
+                min_year,
+                max_year,
            )

        final[tg] = sector_smoothed
--- a/pipeline/transform/price_estimation/shrinkage.py
+++ b/pipeline/transform/price_estimation/shrinkage.py
@ -12,6 +12,18 @@ V = TypeVar("V")
 SPATIAL_NEIGHBORS = 5
 SPATIAL_BLEND_K = 30

+# Hard band on a sector's per-year index move RELATIVE to its parent (the
+# national index), enforced by winsorize_steps after spatial smoothing.
+# Units are natural-log points per year. The support-scaled temporal
+# smoothness prior still under-penalises years identified by only 1-2
+# repeat-sale pairs in thin early histories, leaving artefacts like a x9.7
+# single-year jump (log +2.27, sector "M3 1" 1998->1999). A sector may
+# genuinely outpace the nation -- regeneration, new transport links -- but
+# those stories play out over multiple years, not as a one-year x9.7 step.
+# +/-0.40 log/yr (exp(0.40) ~ x1.5 in a year) relative to the national move
+# keeps every plausible genuine sector-level divergence while clamping
+# thin-year data artefacts.
+MAX_STEP_DEVIATION_PER_YEAR = 0.40
+

 def _base_value(index: dict[int, float], base_year: int) -> float:
    """Value of an index dict at `base_year`, with forward/back-fill for gaps.
@ -75,6 +87,42 @@ def lift_onto_parent(
    return {y: v + offset for y, v in child.items()}


+def winsorize_steps(
+    child: dict[int, float],
+    parent: dict[int, float],
+    max_dev_per_year: float,
+) -> dict[int, float]:
+    """Clamp a child's per-year index steps to within a band of the parent's.
+
+    For each consecutive pair of solved years (y_prev, y) the child's per-year
+    rate r = (child[y] - child[y_prev]) / (y - y_prev) is winsorised into
+    [p - max_dev_per_year, p + max_dev_per_year], where p is the parent's
+    per-year rate over the same span (via _base_value, so gaps in the parent's
+    coverage are forward/back-filled rather than crashing). The series is then
+    rebuilt cumulatively from the FIRST year's value, so:
+      - the first year's level is preserved;
+      - non-outlier steps are preserved exactly (later years simply shift by
+        whatever the clamped steps removed);
+      - a multi-year gap is judged on its per-year rate, not as one giant
+        single-year move, so genuine level changes across gaps survive.
+
+    A child with <2 years has no steps to clamp; an empty parent only occurs
+    in degenerate paths (build_index always passes the national index) -- in
+    both cases the values come back unchanged.
+
+    Args:
+        child: year -> index level of the series to clamp.
+        parent: year -> index level of the reference series.
+        max_dev_per_year: half-width of the allowed band around the parent's
+            per-year rate.
+
+    Returns:
+        A NEW dict keyed by the same years as `child`. The input dict is never
+        returned by reference, so callers may mutate the result without
+        touching `child` on any path.
+    """
+    if len(child) < 2 or not parent:
+        # Copy rather than alias: the normal path below always builds a fresh
+        # dict, so the degenerate path must not hand back the caller's own.
+        return dict(child)
+    years = sorted(child)
+    result = {years[0]: child[years[0]]}
+    for y_prev, y in zip(years[:-1], years[1:]):
+        # Keys are distinct ints in ascending order, so span >= 1.
+        span = y - y_prev
+        r = (child[y] - child[y_prev]) / span
+        p = (_base_value(parent, y) - _base_value(parent, y_prev)) / span
+        r = min(max(r, p - max_dev_per_year), p + max_dev_per_year)
+        # Rebuild from the already-clamped previous level so earlier clamps
+        # carry forward as a rigid shift.
+        result[y] = result[y_prev] + r * span
+    return result
+
+
 def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
    """Shrink dict values toward parent using n/(n+k) weighting.

--- a/pipeline/transform/price_estimation/test_index.py
+++ b/pipeline/transform/price_estimation/test_index.py
@ -1,14 +1,18 @@
+from datetime import date
+
 import numpy as np
 import polars as pl

 from pipeline.transform.price_estimation import index as index_mod
 from pipeline.transform.price_estimation.index import (
    MAX_EXTRAPOLATION_SLOPE,
+    build_index,
    compute_indices_for_level,
    extract_pairs,
    forward_fill,
    solve_robust_index,
 )
+from pipeline.transform.price_estimation.utils import CURRENT_YEAR, TYPE_GROUPS


 def _pairs_from_path(true_levels: dict[int, float]):
@ -269,3 +273,82 @@ def test_n_pairs_counts_only_cross_year_pairs():

    assert "g" in indices
    assert n_pairs["g"] == 8  # not 11
+
+
+def _write_universe_fixtures(tmp_path):
+    """Write the two parquet fixtures used by the sector-universe tests.
+
+    props.parquet holds six properties, all in sector 'AB1 2', each with one
+    repeat sale; postcodes.parquet additionally lists the pairless sector
+    'AB1 3'. Returns (props_path, pc_path)."""
+    n_props = 6
+    # One repeat-sale pair per property, 2018 -> 2021, log_ratio ~0.26 (well
+    # within the flat and annualised outlier caps); six of them sit
+    # comfortably >= MIN_PAIRS.
+    sale_history = [
+        {"year": 2018, "month": 1, "price": 100_000},
+        {"year": 2021, "month": 6, "price": 130_000},
+    ]
+
+    props_path = tmp_path / "props.parquet"
+    pl.DataFrame(
+        {
+            "Postcode": ["AB1 2A" + suffix for suffix in "ABCDEF"],
+            "Property type": ["Detached"] * n_props,
+            "Total floor area (sqm)": [80.0] * n_props,
+            "Last known price": [130_000] * n_props,
+            "Date of last transaction": [date(2021, 6, 1)] * n_props,
+            "historical_prices": [sale_history] * n_props,
+        }
+    ).write_parquet(props_path)
+
+    pc_path = tmp_path / "postcodes.parquet"
+    pl.DataFrame(
+        {
+            "Postcode": ["AB1 2AA", "AB1 2AB", "AB1 3AA"],
+            "lat": [57.10, 57.10, 57.20],
+            "lon": [-2.10, -2.10, -2.20],
+        }
+    ).write_parquet(pc_path)
+
+    return props_path, pc_path
+
+
+def test_build_index_covers_pairless_sectors_from_postcode_universe(tmp_path):
+    """FIX: the sector universe is pairs-sectors UNION postcode-universe
+    sectors, not just sectors that happened to have a repeat sale (which
+    silently dropped ~15% of live sectors from the output). A pairless sector
+    present in postcode.parquet must get index rows via the hierarchy
+    fallback: n_pairs == 0 marks the synthesised cells, with full year
+    coverage after forward fill."""
+    props_path, pc_path = _write_universe_fixtures(tmp_path)
+
+    result = build_index(props_path, postcodes_path=pc_path)
+
+    # 'AB1 3' exists only in the postcode fixture -- it has no properties and
+    # therefore no repeat-sale pairs.
+    pairless = result.filter(pl.col("sector") == "AB1 3")
+    assert len(pairless) > 0
+    assert set(pairless["type_group"]) == {"All", *TYPE_GROUPS}
+    assert pairless["n_pairs"].to_list() == [0] * len(pairless)
+    assert pairless["log_index"].is_not_null().all()
+    # Full year coverage (min pair year .. CURRENT_YEAR) for the solved type
+    # groups. (Type groups with <MIN_PAIRS pairs take the hedonic-fallback
+    # skip branch, which only emits hedonic years -- unchanged behaviour.)
+    expected_years = set(range(2018, CURRENT_YEAR + 1))
+    for tg in ("All", "Detached"):
+        years = set(pairless.filter(pl.col("type_group") == tg)["year"])
+        assert years == expected_years
+
+    # Sanity-check the sector that actually has pairs: every "All" row for
+    # 'AB1 2' reports the 6 fixture pairs, one row per year 2018..CURRENT_YEAR.
+    # NOTE(review): nothing here asserts that the pairless sector's log_index
+    # values EQUAL this sector's (i.e. that it inherited the district index);
+    # only coverage, non-null values and n_pairs are checked -- confirm
+    # whether a value comparison is wanted.
+    with_pairs = result.filter(pl.col("sector") == "AB1 2")
+    assert (
+        with_pairs.filter(pl.col("type_group") == "All")["n_pairs"].to_list()
+        == [6] * (CURRENT_YEAR - 2018 + 1)
+    )
+
+
+def test_build_index_sectors_scoping_restricts_universe(tmp_path):
+    """Passing sectors=[...] must narrow the emitted sector universe, not only
+    the repeat-sale pairs: a run scoped to 'AB1 2' emits no other sector, even
+    though the postcode fixture also contains 'AB1 3'."""
+    props_path, pc_path = _write_universe_fixtures(tmp_path)
+    only = ["AB1 2"]
+
+    scoped = build_index(props_path, postcodes_path=pc_path, sectors=only)
+
+    assert set(scoped["sector"]) == set(only)
--- a/pipeline/transform/price_estimation/test_shrinkage.py
+++ b/pipeline/transform/price_estimation/test_shrinkage.py
@ -10,12 +10,17 @@ Note: re-anchoring each cell to the *global* base year is a no-op on real data
 (a cell anchored to 0 at its own earliest year already reads 0 there, and the
 global base is never later), which is why the fix lifts onto the *parent* at the
 child's own start year instead.
+
+Also covers winsorize_steps, the post-smoothing per-year step clamp against the
+national index (fix: violent single-year index jumps in thin early years).
 """

 from pipeline.transform.price_estimation.shrinkage import (
+    MAX_STEP_DEVIATION_PER_YEAR,
    hierarchical_shrinkage,
    lift_onto_parent,
    shrink_dicts,
+    winsorize_steps,
 )
 from pipeline.transform.price_estimation.utils import SHRINKAGE_K

@ -115,3 +120,60 @@ def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
    assert (with_lift[2024] - with_lift[2008]) > (
        without_lift[2024] - without_lift[2008]
    ) + 0.1
+
+
+def test_winsorize_clamps_thin_year_spike_and_shifts_later_years():
+    """An "M3 1"-style one-year spike (x9.7, log +2.27) is cut back to
+    parent_rate + max_dev. The first year's level stays put, and every later
+    year keeps its OWN step, so the tail moves down rigidly by exactly what
+    the clamp removed."""
+    tol = 1e-12
+    child = {1995: 0.0, 1998: 0.2, 1999: 2.47, 2000: 2.5}
+    # Parent rises a steady 0.1 per year across 1995..2000.
+    parent = {1995 + k: 0.1 * k for k in range(6)}
+
+    out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)
+
+    # Anchor year is untouched.
+    assert out[1995] == child[1995]
+    # 1995->1998 runs at 0.0667/yr, inside 0.1 +/- 0.40, so it passes through.
+    assert abs(out[1998] - child[1998]) < tol
+    # 1998->1999 was 2.27/yr; it is clamped to parent_rate + max_dev.
+    spike_step = out[1999] - out[1998]
+    assert abs(spike_step - (0.1 + MAX_STEP_DEVIATION_PER_YEAR)) < tol
+    # 1999->2000 keeps its in-band +0.03 step; only the level has shifted.
+    tail_step = out[2000] - out[1999]
+    assert abs(tail_step - (child[2000] - child[1999])) < tol
+    assert abs(out[2000] - 0.73) < tol
+
+
+def test_winsorize_preserves_genuine_moves():
+    """A series whose every step lies within parent_rate +/- max_dev comes
+    back (numerically) identical."""
+    child = {2000: 0.0, 2001: 0.35, 2002: 0.40, 2003: 0.20}
+    # Parent rises a steady 0.05 per year across 2000..2003.
+    parent = {2000 + k: 0.05 * k for k in range(4)}
+
+    out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)
+
+    assert set(out) == set(child)
+    worst_drift = max(abs(out[y] - child[y]) for y in child)
+    assert worst_drift < 1e-12
+
+
+def test_winsorize_judges_gap_steps_on_per_year_rate():
+    """Across a multi-year gap the clamp looks at the PER-YEAR rate (the
+    parent is read through the gap-tolerant _base_value lookup): +1.0 spread
+    over 5 years is 0.2/yr and in-band, whereas +1.0 within a single year
+    would be clamped."""
+    sparse_child = {1995: 0.0, 2000: 1.0}
+    # Neither endpoint year exists in the parent: 1995 back-fills to the
+    # parent's earliest value (0.0) and 2000 forward-fills from 1999 (0.3),
+    # giving a parent rate of 0.06/yr.
+    sparse_parent = {1996: 0.0, 1999: 0.3}
+
+    out = winsorize_steps(
+        sparse_child, sparse_parent, MAX_STEP_DEVIATION_PER_YEAR
+    )
+
+    assert out == sparse_child
+
+
+def test_winsorize_degenerate_inputs_unchanged():
+    """With fewer than 2 child years there is no step to clamp, and an empty
+    parent arises only on degenerate paths (build_index always passes the
+    national index): either way the child comes back unchanged rather than
+    being clamped against an arbitrary rate."""
+    national = {2000: 0.0, 2001: 0.1}
+    too_short = [
+        ({}, {}),
+        ({2000: 0.5}, {2000: 0.5}),
+    ]
+    for child, expected in too_short:
+        assert winsorize_steps(child, national, 0.4) == expected
+
+    spiky = {2000: 0.0, 2001: 5.0}
+    assert winsorize_steps(spiky, {}, 0.4) == spiky