Fable findings in data
This commit is contained in:
parent
b98bc6d611
commit
6a33b03fdf
20 changed files with 1502 additions and 274 deletions
|
|
@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.knn import (
|
|||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_FRAC_YEAR,
|
||||
CURRENT_YEAR,
|
||||
MAX_LOG_ADJUSTMENT,
|
||||
interpolate_log_index,
|
||||
sector_expr,
|
||||
|
|
@ -41,6 +42,87 @@ MIN_KNN_TO_INDEX_RATIO = 0.5
|
|||
# only catching outliers.
|
||||
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 20.0
|
||||
|
||||
# Guard for rows with NO usable floor area: the per-sqm plausibility check
|
||||
# cannot fire there, which let commercial blocks misfiled as dwellings keep
|
||||
# absurd headline estimates (e.g. a GBP 175M "Detached" in SW1W). Without
|
||||
# floor area we cannot psm-check, so the only sanity reference left is what
|
||||
# the local market actually pays: beyond this multiple of the district's
|
||||
# recent 99th-percentile sale price the estimate is unreliable and misleading,
|
||||
# so it is nulled rather than shown.
|
||||
FLOORLESS_ESTIMATE_P99_MULT = 2.0
|
||||
# Never null a floorless estimate below this absolute value: genuine mansions
|
||||
# in cheap districts can legitimately exceed 2x their district's recent p99,
|
||||
# but a sub-GBP 2M estimate is within the plausible single-dwelling range
|
||||
# anywhere in the UK, so it survives regardless of the local p99.
|
||||
FLOORLESS_ESTIMATE_MIN_CAP = 2_000_000.0
|
||||
# Look-back window for the district p99 reference: long enough that thin
|
||||
# districts accumulate a usable sale sample, short enough that the reference
|
||||
# reflects today's price level rather than a pre-boom one.
|
||||
FLOORLESS_P99_LOOKBACK_YEARS = 10
|
||||
|
||||
|
||||
def apply_floorless_estimate_guard(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Null floor-area-less estimates far above their district's recent sales.
|
||||
|
||||
Builds a per-district reference from the SAME frame -- the 99th percentile
|
||||
of `Last known price` over sales in the last FLOORLESS_P99_LOOKBACK_YEARS
|
||||
-- and nulls `Estimated current price` where the floor area is null/zero
|
||||
AND the estimate exceeds max(FLOORLESS_ESTIMATE_P99_MULT * p99,
|
||||
FLOORLESS_ESTIMATE_MIN_CAP). Districts with no recent sales yield a null
|
||||
p99 and are left alone: with neither a psm check nor a local reference we
|
||||
cannot judge the estimate, and nulling on the absolute cap alone would be
|
||||
too aggressive. Expects the `_sector` helper column; rows with floor area
|
||||
present are never touched (the psm guard covers them).
|
||||
"""
|
||||
# District = sector minus the trailing sector digit group, matching the
|
||||
# rsplit semantics of utils.hierarchy_keys ("SW1W 9" -> "SW1W").
|
||||
district = pl.col("_sector").str.replace(r"\s+\d+$", "")
|
||||
|
||||
district_p99 = (
|
||||
df.lazy()
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Date of last transaction").dt.year()
|
||||
>= CURRENT_YEAR - FLOORLESS_P99_LOOKBACK_YEARS,
|
||||
)
|
||||
.group_by(district.alias("_district"))
|
||||
.agg(
|
||||
pl.col("Last known price")
|
||||
.cast(pl.Float64)
|
||||
.quantile(0.99)
|
||||
.alias("_district_p99")
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
|
||||
df = df.with_columns(district.alias("_district")).join(
|
||||
district_p99, on="_district", how="left", maintain_order="left"
|
||||
)
|
||||
|
||||
floorless = pl.col("Total floor area (sqm)").is_null() | (
|
||||
pl.col("Total floor area (sqm)") <= 0
|
||||
)
|
||||
cap = pl.max_horizontal(
|
||||
FLOORLESS_ESTIMATE_P99_MULT * pl.col("_district_p99"),
|
||||
pl.lit(FLOORLESS_ESTIMATE_MIN_CAP),
|
||||
)
|
||||
implausible = (
|
||||
pl.col("Estimated current price").is_not_null()
|
||||
& floorless
|
||||
& pl.col("_district_p99").is_not_null()
|
||||
& (pl.col("Estimated current price") > cap)
|
||||
)
|
||||
|
||||
n_nulled = df.select(implausible.sum()).item()
|
||||
print(f" Floorless-estimate guard: nulled {n_nulled:,} estimates")
|
||||
|
||||
return df.with_columns(
|
||||
pl.when(implausible)
|
||||
.then(None)
|
||||
.otherwise(pl.col("Estimated current price"))
|
||||
.alias("Estimated current price"),
|
||||
).drop("_district", "_district_p99")
|
||||
|
||||
|
||||
def guarded_blend_estimates(
|
||||
index_est: np.ndarray,
|
||||
|
|
@ -249,9 +331,16 @@ def main():
|
|||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
# Floor-area-less rows escape the per-sqm guard above entirely; cap them
|
||||
# against their district's recent sale prices instead (see
|
||||
# apply_floorless_estimate_guard). Must run before temp columns
|
||||
# (_sector) are dropped.
|
||||
df = apply_floorless_estimate_guard(df)
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area
|
||||
# exist. Now that the implausible-psm estimates are nulled above, the band
|
||||
# filter here mainly guards the floor-area>0 case.
|
||||
# filter here mainly guards the floor-area>0 case. (The floorless guard
|
||||
# never touches floor-area-present rows, so this derivation is unaffected.)
|
||||
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
|
||||
df = df.with_columns(
|
||||
pl.when(
|
||||
|
|
|
|||
|
|
@ -17,11 +17,13 @@ from scipy.sparse.linalg import lsqr
|
|||
from tqdm import tqdm
|
||||
|
||||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
MAX_STEP_DEVIATION_PER_YEAR,
|
||||
blend_dicts,
|
||||
hierarchical_shrinkage,
|
||||
lift_onto_parent,
|
||||
shrink_dicts,
|
||||
spatial_smooth,
|
||||
winsorize_steps,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
|
|
@ -485,8 +487,20 @@ def build_index(
|
|||
input_path, min_year, max_year, max_sale_year=estimation_cap
|
||||
)
|
||||
|
||||
# Precompute hierarchy
|
||||
all_sectors = pairs["sector"].unique().to_list()
|
||||
# Precompute hierarchy. The sector universe is the UNION of sectors with
|
||||
# repeat-sale pairs and every sector in the postcode universe (centroids
|
||||
# is keyed by every sector derived from postcode.parquet): a sector whose
|
||||
# properties never resold still gets a full index row via the district ->
|
||||
# area -> national fallback in hierarchical_shrinkage (then spatial
|
||||
# smoothing and forward fill). Restricting the universe to pairs-only
|
||||
# sectors silently dropped ~15% of live sectors from the output, nulling
|
||||
# every per-sector lookup and estimate there. n_pairs = 0 marks the
|
||||
# synthesised cells.
|
||||
all_sectors = sorted(set(pairs["sector"].unique().to_list()) | set(centroids))
|
||||
if sectors is not None:
|
||||
# Debug scoping restricts the universe too, not just the pairs.
|
||||
scoped = set(sectors)
|
||||
all_sectors = [s for s in all_sectors if s in scoped]
|
||||
sector_to_dist = {}
|
||||
dist_to_area = {}
|
||||
for s in all_sectors:
|
||||
|
|
@ -562,10 +576,23 @@ def build_index(
|
|||
sector_shrunk, centroids, sector_n, blend_dicts
|
||||
)
|
||||
|
||||
# Forward fill
|
||||
# Winsorise per-year steps against the national index, then forward
|
||||
# fill. The support-scaled smoothness prior still under-penalises
|
||||
# years identified by 1-2 pairs in thin early histories (observed:
|
||||
# x9.7 single-year jumps in city-centre regeneration sectors);
|
||||
# clamping each step to within +/-MAX_STEP_DEVIATION_PER_YEAR of the
|
||||
# national move over the same span removes those artefacts while
|
||||
# leaving genuine sector-vs-national divergence (well inside the
|
||||
# band) untouched.
|
||||
for sec in all_sectors:
|
||||
sector_smoothed[sec] = forward_fill(
|
||||
sector_smoothed.get(sec, hedonic_idx), min_year, max_year
|
||||
winsorize_steps(
|
||||
sector_smoothed.get(sec, hedonic_idx),
|
||||
national_shrunk,
|
||||
MAX_STEP_DEVIATION_PER_YEAR,
|
||||
),
|
||||
min_year,
|
||||
max_year,
|
||||
)
|
||||
|
||||
final[tg] = sector_smoothed
|
||||
|
|
|
|||
|
|
@ -12,6 +12,18 @@ V = TypeVar("V")
|
|||
SPATIAL_NEIGHBORS = 5
|
||||
SPATIAL_BLEND_K = 30
|
||||
|
||||
# Hard band on a sector's per-year index move RELATIVE to its parent (the
|
||||
# national index), enforced by winsorize_steps after spatial smoothing. The
|
||||
# support-scaled temporal smoothness prior still under-penalises years
|
||||
# identified by only 1-2 repeat-sale pairs in thin early histories, leaving
|
||||
# artefacts like a x9.7 single-year jump (log +2.27, sector "M3 1"
|
||||
# 1998->1999). A sector may genuinely outpace the nation -- regeneration, new
|
||||
# transport links -- but those stories play out over multiple years, not as a
|
||||
# one-year x9.7 step. +/-0.40 log/yr (~x1.5 in a year) relative to the
|
||||
# national move keeps every plausible genuine sector-level divergence while
|
||||
# clamping thin-year data artefacts.
|
||||
MAX_STEP_DEVIATION_PER_YEAR = 0.40
|
||||
|
||||
|
||||
def _base_value(index: dict[int, float], base_year: int) -> float:
|
||||
"""Value of an index dict at `base_year`, with forward/back-fill for gaps.
|
||||
|
|
@ -75,6 +87,42 @@ def lift_onto_parent(
|
|||
return {y: v + offset for y, v in child.items()}
|
||||
|
||||
|
||||
def winsorize_steps(
|
||||
child: dict[int, float],
|
||||
parent: dict[int, float],
|
||||
max_dev_per_year: float,
|
||||
) -> dict[int, float]:
|
||||
"""Clamp a child's per-year index steps to within a band of the parent's.
|
||||
|
||||
For each consecutive pair of solved years (y_prev, y) the child's per-year
|
||||
rate r = (child[y] - child[y_prev]) / (y - y_prev) is winsorised into
|
||||
[p - max_dev_per_year, p + max_dev_per_year], where p is the parent's
|
||||
per-year rate over the same span (via _base_value, so gaps in the parent's
|
||||
coverage are forward/back-filled rather than crashing). The series is then
|
||||
rebuilt cumulatively from the FIRST year's value, so:
|
||||
- the first year's level is preserved;
|
||||
- non-outlier steps are preserved exactly (later years simply shift by
|
||||
whatever the clamped steps removed);
|
||||
- a multi-year gap is judged on its per-year rate, not as one giant
|
||||
single-year move, so genuine level changes across gaps survive.
|
||||
|
||||
A child with <2 years has no steps to clamp; an empty parent only occurs
|
||||
in degenerate paths (build_index always passes the national index) -- both
|
||||
are returned unchanged.
|
||||
"""
|
||||
if len(child) < 2 or not parent:
|
||||
return child
|
||||
years = sorted(child)
|
||||
result = {years[0]: child[years[0]]}
|
||||
for y_prev, y in zip(years[:-1], years[1:]):
|
||||
span = y - y_prev
|
||||
r = (child[y] - child[y_prev]) / span
|
||||
p = (_base_value(parent, y) - _base_value(parent, y_prev)) / span
|
||||
r = min(max(r, p - max_dev_per_year), p + max_dev_per_year)
|
||||
result[y] = result[y_prev] + r * span
|
||||
return result
|
||||
|
||||
|
||||
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
|
||||
"""Shrink dict values toward parent using n/(n+k) weighting.
|
||||
|
||||
|
|
|
|||
|
|
@ -1,14 +1,18 @@
|
|||
from datetime import date
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation import index as index_mod
|
||||
from pipeline.transform.price_estimation.index import (
|
||||
MAX_EXTRAPOLATION_SLOPE,
|
||||
build_index,
|
||||
compute_indices_for_level,
|
||||
extract_pairs,
|
||||
forward_fill,
|
||||
solve_robust_index,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import CURRENT_YEAR, TYPE_GROUPS
|
||||
|
||||
|
||||
def _pairs_from_path(true_levels: dict[int, float]):
|
||||
|
|
@ -269,3 +273,82 @@ def test_n_pairs_counts_only_cross_year_pairs():
|
|||
|
||||
assert "g" in indices
|
||||
assert n_pairs["g"] == 8 # not 11
|
||||
|
||||
|
||||
def _write_universe_fixtures(tmp_path):
|
||||
"""Properties with repeat sales only in sector 'AB1 2', plus a postcode
|
||||
universe that also contains the pairless sector 'AB1 3'."""
|
||||
props = pl.DataFrame(
|
||||
{
|
||||
"Postcode": [f"AB1 2A{c}" for c in "ABCDEF"],
|
||||
"Property type": ["Detached"] * 6,
|
||||
"Total floor area (sqm)": [80.0] * 6,
|
||||
"Last known price": [130_000] * 6,
|
||||
"Date of last transaction": [date(2021, 6, 1)] * 6,
|
||||
# 6 repeat-sale pairs 2018 -> 2021, log_ratio ~0.26 (well within
|
||||
# the flat and annualised outlier caps), comfortably >= MIN_PAIRS.
|
||||
"historical_prices": [
|
||||
[
|
||||
{"year": 2018, "month": 1, "price": 100_000},
|
||||
{"year": 2021, "month": 6, "price": 130_000},
|
||||
]
|
||||
]
|
||||
* 6,
|
||||
}
|
||||
)
|
||||
props_path = tmp_path / "props.parquet"
|
||||
props.write_parquet(props_path)
|
||||
|
||||
postcodes = pl.DataFrame(
|
||||
{
|
||||
"Postcode": ["AB1 2AA", "AB1 2AB", "AB1 3AA"],
|
||||
"lat": [57.10, 57.10, 57.20],
|
||||
"lon": [-2.10, -2.10, -2.20],
|
||||
}
|
||||
)
|
||||
pc_path = tmp_path / "postcodes.parquet"
|
||||
postcodes.write_parquet(pc_path)
|
||||
return props_path, pc_path
|
||||
|
||||
|
||||
def test_build_index_covers_pairless_sectors_from_postcode_universe(tmp_path):
|
||||
"""FIX: the sector universe is pairs-sectors UNION postcode-universe
|
||||
sectors, not just sectors that happened to have a repeat sale (which
|
||||
silently dropped ~15% of live sectors from the output). A pairless sector
|
||||
present in postcode.parquet must get index rows via the hierarchy
|
||||
fallback: n_pairs == 0 marks the synthesised cells, with full year
|
||||
coverage after forward fill."""
|
||||
props_path, pc_path = _write_universe_fixtures(tmp_path)
|
||||
|
||||
result = build_index(props_path, postcodes_path=pc_path)
|
||||
|
||||
pairless = result.filter(pl.col("sector") == "AB1 3")
|
||||
assert len(pairless) > 0
|
||||
assert set(pairless["type_group"]) == {"All", *TYPE_GROUPS}
|
||||
assert pairless["n_pairs"].to_list() == [0] * len(pairless)
|
||||
assert pairless["log_index"].is_not_null().all()
|
||||
# Full year coverage (min pair year .. CURRENT_YEAR) for the solved type
|
||||
# groups. (Type groups with <MIN_PAIRS pairs take the hedonic-fallback
|
||||
# skip branch, which only emits hedonic years -- unchanged behaviour.)
|
||||
expected_years = set(range(2018, CURRENT_YEAR + 1))
|
||||
for tg in ("All", "Detached"):
|
||||
years = set(pairless.filter(pl.col("type_group") == tg)["year"])
|
||||
assert years == expected_years
|
||||
|
||||
# The pairless sector inherits its district's index: same values as the
|
||||
# sector that actually has pairs (no other siblings to dilute it here).
|
||||
with_pairs = result.filter(pl.col("sector") == "AB1 2")
|
||||
assert (
|
||||
with_pairs.filter(pl.col("type_group") == "All")["n_pairs"].to_list()
|
||||
== [6] * (CURRENT_YEAR - 2018 + 1)
|
||||
)
|
||||
|
||||
|
||||
def test_build_index_sectors_scoping_restricts_universe(tmp_path):
|
||||
"""Debug scoping via sectors=[...] restricts the output universe too --
|
||||
not just the pairs -- so a scoped run does not emit every centroid sector."""
|
||||
props_path, pc_path = _write_universe_fixtures(tmp_path)
|
||||
|
||||
result = build_index(props_path, postcodes_path=pc_path, sectors=["AB1 2"])
|
||||
|
||||
assert set(result["sector"]) == {"AB1 2"}
|
||||
|
|
|
|||
|
|
@ -10,12 +10,17 @@ Note: re-anchoring each cell to the *global* base year is a no-op on real data
|
|||
(a cell anchored to 0 at its own earliest year already reads 0 there, and the
|
||||
global base is never later), which is why the fix lifts onto the *parent* at the
|
||||
child's own start year instead.
|
||||
|
||||
Also covers winsorize_steps, the post-smoothing per-year step clamp against the
|
||||
national index (fix: violent single-year index jumps in thin early years).
|
||||
"""
|
||||
|
||||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
MAX_STEP_DEVIATION_PER_YEAR,
|
||||
hierarchical_shrinkage,
|
||||
lift_onto_parent,
|
||||
shrink_dicts,
|
||||
winsorize_steps,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
|
||||
|
||||
|
|
@ -115,3 +120,60 @@ def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
|
|||
assert (with_lift[2024] - with_lift[2008]) > (
|
||||
without_lift[2024] - without_lift[2008]
|
||||
) + 0.1
|
||||
|
||||
|
||||
def test_winsorize_clamps_thin_year_spike_and_shifts_later_years():
|
||||
"""A "M3 1"-style single-year spike (x9.7, log +2.27) is clamped to
|
||||
parent_rate + max_dev; the first year's level is preserved, and later
|
||||
years keep their OWN steps (the tail shifts down rigidly by whatever the
|
||||
clamped step removed)."""
|
||||
child = {1995: 0.0, 1998: 0.2, 1999: 2.47, 2000: 2.5}
|
||||
parent = {y: 0.1 * (y - 1995) for y in range(1995, 2001)} # flat-ish 0.1/yr
|
||||
|
||||
out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)
|
||||
|
||||
assert out[1995] == child[1995] # first year preserved
|
||||
# 1995->1998: 0.0667/yr, well within 0.1 +/- 0.40 -> untouched.
|
||||
assert abs(out[1998] - child[1998]) < 1e-12
|
||||
# 1998->1999: 2.27/yr clamped to parent_rate + max_dev = 0.1 + 0.40.
|
||||
assert abs((out[1999] - out[1998]) - (0.1 + MAX_STEP_DEVIATION_PER_YEAR)) < 1e-12
|
||||
# 1999->2000: the in-band +0.03 step survives; the level shifts down with
|
||||
# the clamped 1999.
|
||||
assert abs((out[2000] - out[1999]) - (child[2000] - child[1999])) < 1e-12
|
||||
assert abs(out[2000] - 0.73) < 1e-12
|
||||
|
||||
|
||||
def test_winsorize_preserves_genuine_moves():
|
||||
"""Steps within parent_rate +/- max_dev pass through (numerically) unchanged."""
|
||||
child = {2000: 0.0, 2001: 0.35, 2002: 0.40, 2003: 0.20}
|
||||
parent = {y: 0.05 * (y - 2000) for y in range(2000, 2004)}
|
||||
|
||||
out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)
|
||||
|
||||
assert set(out) == set(child)
|
||||
assert max(abs(out[y] - child[y]) for y in child) < 1e-12
|
||||
|
||||
|
||||
def test_winsorize_judges_gap_steps_on_per_year_rate():
|
||||
"""A step across a multi-year gap is judged on its PER-YEAR rate (with
|
||||
gap-tolerant parent lookup via _base_value), not as one giant single-year
|
||||
move: +1.0 over 5 years (0.2/yr) is in-band even though +1.0 in one year
|
||||
would be clamped."""
|
||||
child = {1995: 0.0, 2000: 1.0}
|
||||
# Parent lacks both endpoint years: 1995 back-fills to its earliest value
|
||||
# (0.0), 2000 forward-fills from 1999 (0.3) -> parent rate 0.06/yr.
|
||||
parent = {1996: 0.0, 1999: 0.3}
|
||||
|
||||
out = winsorize_steps(child, parent, MAX_STEP_DEVIATION_PER_YEAR)
|
||||
|
||||
assert out == child
|
||||
|
||||
|
||||
def test_winsorize_degenerate_inputs_unchanged():
|
||||
"""<2 child years -> no steps to clamp; an empty parent only occurs in
|
||||
degenerate paths (build_index always passes the national index) -> child
|
||||
is returned unchanged, never clamped against an arbitrary rate."""
|
||||
assert winsorize_steps({}, {2000: 0.0, 2001: 0.1}, 0.4) == {}
|
||||
assert winsorize_steps({2000: 0.5}, {2000: 0.0, 2001: 0.1}, 0.4) == {2000: 0.5}
|
||||
spiky = {2000: 0.0, 2001: 5.0}
|
||||
assert winsorize_steps(spiky, {}, 0.4) == spiky
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue