Fable findings in data

This commit is contained in:
Andras Schmelczer 2026-06-11 07:49:23 +01:00
parent b98bc6d611
commit 6a33b03fdf
20 changed files with 1502 additions and 274 deletions

View file

@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.knn import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_FRAC_YEAR,
CURRENT_YEAR,
MAX_LOG_ADJUSTMENT,
interpolate_log_index,
sector_expr,
@ -41,6 +42,87 @@ MIN_KNN_TO_INDEX_RATIO = 0.5
# only catching outliers.
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 20.0
# Guard for rows with NO usable floor area: the per-sqm plausibility check
# cannot fire there, which let commercial blocks misfiled as dwellings keep
# absurd headline estimates (e.g. a GBP 175M "Detached" in SW1W). Without
# floor area we cannot psm-check, so the only sanity reference left is what
# the local market actually pays: beyond this multiple of the district's
# recent 99th-percentile sale price the estimate is unreliable and misleading,
# so it is nulled rather than shown.
# Dimensionless multiplier applied to the district p99 in
# apply_floorless_estimate_guard.
FLOORLESS_ESTIMATE_P99_MULT = 2.0
# Never null a floorless estimate below this absolute value: genuine mansions
# in cheap districts can legitimately exceed 2x their district's recent p99,
# but a sub-GBP 2M estimate is within the plausible single-dwelling range
# anywhere in the UK, so it survives regardless of the local p99.
# Unit: GBP. The effective cap is max(P99_MULT * district p99, this value).
FLOORLESS_ESTIMATE_MIN_CAP = 2_000_000.0
# Look-back window for the district p99 reference: long enough that thin
# districts accumulate a usable sale sample, short enough that the reference
# reflects today's price level rather than a pre-boom one.
# Applied on calendar years: a sale counts when its transaction year is
# >= CURRENT_YEAR - this value, so the window covers the current (partial)
# year plus this many prior calendar years.
FLOORLESS_P99_LOOKBACK_YEARS = 10
def apply_floorless_estimate_guard(df: pl.DataFrame) -> pl.DataFrame:
    """Drop estimates on floor-area-less rows that dwarf local sale prices.

    A per-district reference price is derived from `df` itself: the 99th
    percentile of `Last known price` over rows whose `Date of last
    transaction` falls in a calendar year >= CURRENT_YEAR -
    FLOORLESS_P99_LOOKBACK_YEARS. `Estimated current price` is then set to
    null on every row where ALL of the following hold:

    - the estimate is present;
    - `Total floor area (sqm)` is null or <= 0;
    - the row's district has a reference p99;
    - the estimate is above max(FLOORLESS_ESTIMATE_P99_MULT * p99,
      FLOORLESS_ESTIMATE_MIN_CAP).

    Districts without any recent sale have no reference and are left alone:
    with neither a per-sqm check nor a local reference the estimate cannot be
    judged. Rows that do have a floor area are never modified here.

    Requires the `_sector` helper column. The temporary `_district` and
    `_district_p99` columns are removed before returning. The number of
    nulled estimates is printed.
    """
    estimate = pl.col("Estimated current price")
    last_price = pl.col("Last known price")
    floor_area = pl.col("Total floor area (sqm)")
    reference = pl.col("_district_p99")

    # A district is the sector with its trailing digit group stripped
    # ("SW1W 9" -> "SW1W"); intended to agree with utils.hierarchy_keys.
    district_key = pl.col("_sector").str.replace(r"\s+\d+$", "").alias("_district")

    # Per-district p99 of recent sale prices, computed lazily from this frame.
    earliest_year = CURRENT_YEAR - FLOORLESS_P99_LOOKBACK_YEARS
    recent_sales = df.lazy().filter(
        last_price.is_not_null(),
        pl.col("Date of last transaction").dt.year() >= earliest_year,
    )
    p99_by_district = (
        recent_sales.group_by(district_key)
        .agg(last_price.cast(pl.Float64).quantile(0.99).alias("_district_p99"))
        .collect()
    )

    # Left join keeps every input row, in its original order.
    enriched = df.with_columns(district_key).join(
        p99_by_district, on="_district", how="left", maintain_order="left"
    )

    no_floor_area = floor_area.is_null() | (floor_area <= 0)
    ceiling = pl.max_horizontal(
        FLOORLESS_ESTIMATE_P99_MULT * reference,
        pl.lit(FLOORLESS_ESTIMATE_MIN_CAP),
    )
    # The explicit reference null check keeps reference-less districts out.
    implausible = (
        estimate.is_not_null()
        & no_floor_area
        & reference.is_not_null()
        & (estimate > ceiling)
    )

    n_nulled = enriched.select(implausible.sum()).item()
    print(f" Floorless-estimate guard: nulled {n_nulled:,} estimates")

    guarded_estimate = (
        pl.when(implausible)
        .then(None)
        .otherwise(estimate)
        .alias("Estimated current price")
    )
    return enriched.with_columns(guarded_estimate).drop("_district", "_district_p99")
def guarded_blend_estimates(
index_est: np.ndarray,
@ -249,9 +331,16 @@ def main():
.alias("Estimated current price"),
)
# Floor-area-less rows escape the per-sqm guard above entirely; cap them
# against their district's recent sale prices instead (see
# apply_floorless_estimate_guard). Must run before temp columns
# (_sector) are dropped.
df = apply_floorless_estimate_guard(df)
# Derive estimated price per sqm where both estimated price and floor area
# exist. Now that the implausible-psm estimates are nulled above, the band
# filter here mainly guards the floor-area>0 case. (The floorless guard
# never touches floor-area-present rows, so this derivation is unaffected.)
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
df = df.with_columns(
pl.when(

View file

@ -17,11 +17,13 @@ from scipy.sparse.linalg import lsqr
from tqdm import tqdm
from pipeline.transform.price_estimation.shrinkage import (
MAX_STEP_DEVIATION_PER_YEAR,
blend_dicts,
hierarchical_shrinkage,
lift_onto_parent,
shrink_dicts,
spatial_smooth,
winsorize_steps,
)
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
@ -485,8 +487,20 @@ def build_index(
input_path, min_year, max_year, max_sale_year=estimation_cap
)
# Precompute hierarchy
all_sectors = pairs["sector"].unique().to_list()
# Precompute hierarchy. The sector universe is the UNION of sectors with
# repeat-sale pairs and every sector in the postcode universe (centroids
# is keyed by every sector derived from postcode.parquet): a sector whose
# properties never resold still gets a full index row via the district ->
# area -> national fallback in hierarchical_shrinkage (then spatial
# smoothing and forward fill). Restricting the universe to pairs-only
# sectors silently dropped ~15% of live sectors from the output, nulling
# every per-sector lookup and estimate there. n_pairs = 0 marks the
# synthesised cells.
all_sectors = sorted(set(pairs["sector"].unique().to_list()) | set(centroids))
if sectors is not None:
# Debug scoping restricts the universe too, not just the pairs.
scoped = set(sectors)
all_sectors = [s for s in all_sectors if s in scoped]
sector_to_dist = {}
dist_to_area = {}
for s in all_sectors:
@ -562,10 +576,23 @@ def build_index(
sector_shrunk, centroids, sector_n, blend_dicts
)
# Forward fill
# Winsorise per-year steps against the national index, then forward
# fill. The support-scaled smoothness prior still under-penalises
# years identified by 1-2 pairs in thin early histories (observed:
# x9.7 single-year jumps in city-centre regeneration sectors);
# clamping each step to within +/-MAX_STEP_DEVIATION_PER_YEAR of the
# national move over the same span removes those artefacts while
# leaving genuine sector-vs-national divergence (well inside the
# band) untouched.
for sec in all_sectors:
sector_smoothed[sec] = forward_fill(
sector_smoothed.get(sec, hedonic_idx), min_year, max_year
winsorize_steps(
sector_smoothed.get(sec, hedonic_idx),
national_shrunk,
MAX_STEP_DEVIATION_PER_YEAR,
),
min_year,
max_year,
)
final[tg] = sector_smoothed

View file

@ -12,6 +12,18 @@ V = TypeVar("V")
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
# Hard band on a sector's per-year index move RELATIVE to its parent (the
# national index), enforced by winsorize_steps after spatial smoothing. The
# support-scaled temporal smoothness prior still under-penalises years
# identified by only 1-2 repeat-sale pairs in thin early histories, leaving
# artefacts like a x9.7 single-year jump (log +2.27, sector "M3 1"
# 1998->1999). A sector may genuinely outpace the nation -- regeneration, new
# transport links -- but those stories play out over multiple years, not as a
# one-year x9.7 step. +/-0.40 log/yr (~x1.5 in a year) relative to the
# national move keeps every plausible genuine sector-level divergence while
# clamping thin-year data artefacts.
# Unit: natural-log index points per year (exp(0.40) ~= 1.49). The band is
# symmetric: a sector may rise or fall this much faster than the parent.
# winsorize_steps takes the band as an argument; this is the value
# build_index passes.
MAX_STEP_DEVIATION_PER_YEAR = 0.40
def _base_value(index: dict[int, float], base_year: int) -> float:
"""Value of an index dict at `base_year`, with forward/back-fill for gaps.
@ -75,6 +87,42 @@ def lift_onto_parent(
return {y: v + offset for y, v in child.items()}
def winsorize_steps(
    child: dict[int, float],
    parent: dict[int, float],
    max_dev_per_year: float,
) -> dict[int, float]:
    """Clamp a child's per-year index steps to within a band of the parent's.

    For each consecutive pair of solved years (y_prev, y) the child's per-year
    rate r = (child[y] - child[y_prev]) / (y - y_prev) is winsorised into
    [p - max_dev_per_year, p + max_dev_per_year], where p is the parent's
    per-year rate over the same span (via _base_value, so gaps in the parent's
    coverage are forward/back-filled rather than crashing). The series is then
    rebuilt as the original values plus a running shift that accumulates
    whatever each clamped step removed (or added), so:

    - the first year's level is preserved;
    - values up to the first clamped step are returned bit-for-bit, and a
      series with no out-of-band step comes back identical to the input;
    - after a clamp, later years move rigidly by the accumulated shift, so
      their own steps survive (up to floating-point rounding);
    - a multi-year gap is judged on its per-year rate, not as one giant
      single-year move, so genuine level changes across gaps survive.

    A child with <2 years has no steps to clamp, and an empty parent gives no
    reference rate to clamp against; both come back with unchanged values.

    Args:
        child: year -> log index of the series to clamp.
        parent: year -> log index of the reference series.
        max_dev_per_year: half-width of the allowed band around the parent's
            per-year rate, in the same log units per year.

    Returns:
        A new dict with the same years as `child`. It is always a fresh
        object, never `child` itself, so callers may mutate it freely.
    """
    if len(child) < 2 or not parent:
        # Copy so every path returns an object independent of the input.
        return dict(child)

    years = sorted(child)
    result = {years[0]: child[years[0]]}
    # Cumulative offset between the rebuilt and the original series. It stays
    # exactly 0.0 until a step is clamped, which keeps in-band values exact.
    shift = 0.0
    for y_prev, y in zip(years[:-1], years[1:]):
        span = y - y_prev
        r = (child[y] - child[y_prev]) / span
        p = (_base_value(parent, y) - _base_value(parent, y_prev)) / span
        clamped = min(max(r, p - max_dev_per_year), p + max_dev_per_year)
        if clamped != r:
            # The clamp changed this step's rate; carry the level difference
            # over the whole span into every later year.
            shift += (clamped - r) * span
        result[y] = child[y] + shift
    return result
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
"""Shrink dict values toward parent using n/(n+k) weighting.

View file

@ -1,14 +1,18 @@
from datetime import date
import numpy as np
import polars as pl
from pipeline.transform.price_estimation import index as index_mod
from pipeline.transform.price_estimation.index import (
MAX_EXTRAPOLATION_SLOPE,
build_index,
compute_indices_for_level,
extract_pairs,
forward_fill,
solve_robust_index,
)
from pipeline.transform.price_estimation.utils import CURRENT_YEAR, TYPE_GROUPS
def _pairs_from_path(true_levels: dict[int, float]):
@ -269,3 +273,82 @@ def test_n_pairs_counts_only_cross_year_pairs():
assert "g" in indices
assert n_pairs["g"] == 8 # not 11
def _write_universe_fixtures(tmp_path):
    """Write the property and postcode parquet fixtures used by universe tests.

    The property file holds six "Detached" properties, all in sector 'AB1 2',
    each carrying one 2018 -> 2021 repeat sale. The postcode file lists two
    'AB1 2' postcodes plus one postcode in 'AB1 3', a sector that has no
    properties (and therefore no repeat-sale pairs) at all.

    Returns:
        (props_path, pc_path): paths of the two parquet files under tmp_path.
    """
    n_props = 6
    # One repeat-sale pair per property, 100k -> 130k (log ratio ~0.26); six
    # identical pairs are meant to sit inside the outlier caps and clear the
    # MIN_PAIRS threshold used by the index builder.
    sale_history = [
        {"year": 2018, "month": 1, "price": 100_000},
        {"year": 2021, "month": 6, "price": 130_000},
    ]
    props_path = tmp_path / "props.parquet"
    pl.DataFrame(
        {
            "Postcode": ["AB1 2A" + suffix for suffix in "ABCDEF"],
            "Property type": ["Detached" for _ in range(n_props)],
            "Total floor area (sqm)": [80.0 for _ in range(n_props)],
            "Last known price": [130_000 for _ in range(n_props)],
            "Date of last transaction": [date(2021, 6, 1) for _ in range(n_props)],
            "historical_prices": [sale_history for _ in range(n_props)],
        }
    ).write_parquet(props_path)

    # Postcode universe: the populated sector twice, the pairless one once.
    pc_path = tmp_path / "postcodes.parquet"
    pl.DataFrame(
        {
            "Postcode": ["AB1 2AA", "AB1 2AB", "AB1 3AA"],
            "lat": [57.10, 57.10, 57.20],
            "lon": [-2.10, -2.10, -2.20],
        }
    ).write_parquet(pc_path)

    return props_path, pc_path
def test_build_index_covers_pairless_sectors_from_postcode_universe(tmp_path):
    """A sector known only from the postcode universe still gets index rows.

    The sector universe is the union of sectors with repeat-sale pairs and
    sectors present in the postcode file. 'AB1 3' has no pairs, so its rows
    must be synthesised: one set per type group, n_pairs == 0 throughout, no
    null log_index, and full year coverage for the solved type groups.
    """
    props_path, pc_path = _write_universe_fixtures(tmp_path)
    index_df = build_index(props_path, postcodes_path=pc_path)

    pairless = index_df.filter(pl.col("sector") == "AB1 3")
    n_rows = len(pairless)
    assert n_rows > 0
    assert set(pairless["type_group"]) == {"All", *TYPE_GROUPS}
    assert pairless["n_pairs"].to_list() == [0] * n_rows
    assert pairless["log_index"].is_not_null().all()

    # Solved type groups must span the first pair year through CURRENT_YEAR.
    # Other type groups are not checked here: per the original author they
    # fall back to a hedonic-only branch that emits fewer years.
    expected_years = set(range(2018, CURRENT_YEAR + 1))
    for tg in ("All", "Detached"):
        tg_rows = pairless.filter(pl.col("type_group") == tg)
        assert set(tg_rows["year"]) == expected_years

    # The sector that does have pairs reports all six of them on every year
    # row of the "All" group.
    populated_all = index_df.filter(
        (pl.col("sector") == "AB1 2") & (pl.col("type_group") == "All")
    )
    n_years = CURRENT_YEAR - 2018 + 1
    assert populated_all["n_pairs"].to_list() == [6] * n_years
def test_build_index_sectors_scoping_restricts_universe(tmp_path):
    """Passing sectors=[...] limits the emitted sectors, not only the pairs.

    The postcode fixture also contains 'AB1 3'; a run scoped to 'AB1 2' must
    not emit it.
    """
    props_path, pc_path = _write_universe_fixtures(tmp_path)
    scoped = build_index(props_path, postcodes_path=pc_path, sectors=["AB1 2"])
    emitted_sectors = set(scoped["sector"])
    assert emitted_sectors == {"AB1 2"}

View file

@ -10,12 +10,17 @@ Note: re-anchoring each cell to the *global* base year is a no-op on real data
(a cell anchored to 0 at its own earliest year already reads 0 there, and the
global base is never later), which is why the fix lifts onto the *parent* at the
child's own start year instead.
Also covers winsorize_steps, the post-smoothing per-year step clamp against the
national index (fix: violent single-year index jumps in thin early years).
"""
from pipeline.transform.price_estimation.shrinkage import (
MAX_STEP_DEVIATION_PER_YEAR,
hierarchical_shrinkage,
lift_onto_parent,
shrink_dicts,
winsorize_steps,
)
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
@ -115,3 +120,60 @@ def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
assert (with_lift[2024] - with_lift[2008]) > (
without_lift[2024] - without_lift[2008]
) + 0.1
def test_winsorize_clamps_thin_year_spike_and_shifts_later_years():
    """A one-year x9.7 spike (log +2.27) is cut back to parent rate + band.

    The first year keeps its level, the in-band step before the spike is
    untouched, and the years after the spike keep their own steps while the
    whole tail moves down by what the clamp removed.
    """
    tol = 1e-12
    parent_rate = 0.1
    child = {1995: 0.0, 1998: 0.2, 1999: 2.47, 2000: 2.5}
    parent = {}
    for y in range(1995, 2001):
        parent[y] = 0.1 * (y - 1995)  # steady 0.1/yr

    out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)

    # First year: level preserved.
    assert out[1995] == child[1995]

    # 1995 -> 1998 runs at ~0.0667/yr, inside 0.1 +/- 0.40: left as is.
    assert abs(out[1998] - child[1998]) < tol

    # 1998 -> 1999 runs at 2.27/yr: clamped to the top of the band.
    clamped_step = out[1999] - out[1998]
    assert abs(clamped_step - (parent_rate + MAX_STEP_DEVIATION_PER_YEAR)) < tol

    # 1999 -> 2000: the in-band +0.03 step is kept, at the lowered level.
    tail_step = out[2000] - out[1999]
    assert abs(tail_step - (child[2000] - child[1999])) < tol
    assert abs(out[2000] - 0.73) < tol
def test_winsorize_preserves_genuine_moves():
    """A series whose every step lies inside the band comes back unchanged
    (to within floating-point tolerance) and with the same years."""
    child = {2000: 0.0, 2001: 0.35, 2002: 0.40, 2003: 0.20}
    parent = {}
    for y in range(2000, 2004):
        parent[y] = 0.05 * (y - 2000)

    out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)

    assert out.keys() == child.keys()
    largest_drift = max(abs(out[y] - child[y]) for y in child)
    assert largest_drift < 1e-12
def test_winsorize_judges_gap_steps_on_per_year_rate():
    """A move across a multi-year gap is assessed on its annualised rate.

    +1.0 over five years is 0.2/yr and therefore in-band, although the same
    +1.0 in a single year would be clamped. The parent is missing both
    endpoint years on purpose, to exercise the gap-tolerant parent lookup.
    """
    # Per the original author's note: 1995 back-fills to the parent's
    # earliest value (0.0) and 2000 forward-fills from 1999 (0.3), giving a
    # parent rate of 0.06/yr.
    sparse_parent = {1996: 0.0, 1999: 0.3}
    child = {1995: 0.0, 2000: 1.0}

    result = winsorize_steps(child, sparse_parent, MAX_STEP_DEVIATION_PER_YEAR)

    assert result == child
def test_winsorize_degenerate_inputs_unchanged():
    """Inputs with nothing to clamp come back with unchanged values.

    Covers a child with fewer than two years (empty, single year) and an
    empty parent, where there is no reference rate and even a spiky child
    must not be clamped.
    """
    band = 0.4

    # Fewer than two child years: no step exists.
    no_years = winsorize_steps({}, {2000: 0.0, 2001: 0.1}, band)
    assert no_years == {}
    one_year = winsorize_steps({2000: 0.5}, {2000: 0.0, 2001: 0.1}, band)
    assert one_year == {2000: 0.5}

    # Empty parent: nothing to compare against.
    spiky = {2000: 0.0, 2001: 5.0}
    unclamped = winsorize_steps(spiky, {}, band)
    assert unclamped == spiky