154 lines
5 KiB
Python
154 lines
5 KiB
Python
from datetime import date
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
|
|
from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
|
|
from pipeline.transform.price_estimation.knn import build_knn_pool, knn_median_psm
|
|
from pipeline.transform.price_estimation.utils import TYPE_GROUPS, type_group_expr
|
|
|
|
|
|
def _flat_index() -> pl.DataFrame:
    """Return a minimal price index whose log-index is zero everywhere.

    The frame holds two rows for sector ``"AA1 1"`` in 2026 -- one for the
    ``"Detached"`` type group and one for ``"All"`` -- both with a
    ``log_index`` of 0.0.
    """
    columns = {
        "sector": ["AA1 1", "AA1 1"],
        "type_group": ["Detached", "All"],
        "year": [2026, 2026],
        "log_index": [0.0, 0.0],
    }
    return pl.DataFrame(columns)
|
|
|
|
|
|
def test_knn_excludes_same_sale_and_uses_stable_comparables():
    """kNN price-per-sqm for the query comes from the 200k / 80 sqm rows.

    The pool contains, in this order:

    * the queried sale itself (900k, 80 sqm, postcode ``AA1 1AA``);
    * five rows with the same postcode, price and date but only 20 sqm;
    * five rows in other postcodes at 200k / 80 sqm (2,500 per sqm).

    The expected median of 2,500 matches the third group only.
    NOTE(review): presumably the first two groups are rejected because they
    share postcode, price and sale date with the query -- confirm in knn.py.
    """
    sale_date = date(2026, 1, 1)

    def make_row(postcode, lat, lon, floor_area, price):
        # Key order is kept stable so the resulting frame's columns match.
        return {
            "Postcode": postcode,
            "Property type": "Detached",
            "lat": lat,
            "lon": lon,
            "Total floor area (sqm)": floor_area,
            "Last known price": price,
            "Date of last transaction": sale_date,
        }

    queried_sale = make_row("AA1 1AA", 51.5000, -0.1000, 80.0, 900_000.0)
    same_sale_rows = [
        make_row("AA1 1AA", 51.5001 + i * 0.00001, -0.1001, 20.0, 900_000.0)
        for i in range(5)
    ]
    comparable_rows = [
        make_row(f"AA1 1B{i}", 51.5010 + i * 0.00001, -0.1010, 80.0, 200_000.0)
        for i in range(5)
    ]
    df = pl.DataFrame([queried_sale, *same_sale_rows, *comparable_rows])

    trees = build_knn_pool(df.lazy(), _flat_index(), 2026.0)

    # The sale date is passed as whole days since 1970-01-01.
    days_since_epoch = (sale_date - date(1970, 1, 1)).days
    psm = knn_median_psm(
        trees,
        lat=np.array([51.5000]),
        lon=np.array([-0.1000]),
        type_groups=np.array(["Detached"]),
        postcodes=np.array(["AA1 1AA"]),
        last_prices=np.array([900_000.0]),
        last_sale_dates=np.array([days_since_epoch]),
    )

    assert psm[0] == 2_500.0
|
|
|
|
|
|
def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
    """Exercise ``guarded_blend_estimates`` on two properties last sold at 100k.

    Property 0 has a kNN estimate far above its index estimate; property 1 has
    matching index and kNN estimates at 10x the last price.
    """
    index_est = np.array([120_000.0, 1_000_000.0])
    knn_est = np.array([5_000_000.0, 1_000_000.0])
    last_prices = np.array([100_000.0, 100_000.0])

    blended = guarded_blend_estimates(
        index_est=index_est,
        knn_est=knn_est,
        last_prices=last_prices,
    )

    # Property 0: unstable kNN (>2x index) is dropped, index estimate kept.
    assert blended[0] == 120_000.0
    # Property 1: a 10x uplift over the last price is legitimate appreciation
    # and is no longer truncated (cap raised from 6x to 20x).
    assert blended[1] == 1_000_000.0
|
|
|
|
|
|
def test_guarded_blend_caps_uplift_at_20x_last_price():
    """A 50x index estimate with no kNN estimate is cut to 20x the last price."""
    last_price = 100_000.0
    ceiling = last_price * 20  # 2_000_000.0

    blended = guarded_blend_estimates(
        index_est=np.array([5_000_000.0]),  # 50x the last price
        knn_est=np.array([np.nan]),  # no kNN estimate for this property
        last_prices=np.array([last_price]),
    )

    assert blended[0] == ceiling
|
|
|
|
|
|
def test_bungalow_is_not_a_dead_price_index_type_group():
    """``Bungalow`` is absent from ``TYPE_GROUPS`` and maps to a null group.

    Both ``"Bungalow"`` and ``"Other"`` property types are expected to yield
    ``None`` in the ``type_group`` column produced by ``type_group_expr()``.
    """
    property_types = pl.DataFrame({"Property type": ["Bungalow", "Other"]})
    df = property_types.with_columns(type_group_expr())

    assert "Bungalow" not in TYPE_GROUPS
    assert df["type_group"].to_list() == [None, None]
|
|
|
|
|
|
def test_temporal_regularization_damps_curvature_without_breaking_solve():
    """The second-difference prior reduces year-to-year curvature and keeps the
    index well-formed (all years present, finite, contiguous).

    The input is a smooth 0.04-per-year log trend over 2010-2020 expressed as
    adjacent-year pairs, plus one conflicting 2014->2015 observation of 0.5.
    The index is solved once with ``TEMPORAL_SMOOTHNESS_LAMBDA`` set to 0.0
    and once with 0.2, and the two solutions are compared.
    """
    from pipeline.transform.price_estimation import index as index_mod

    years = np.arange(2010, 2021)
    true = {y: 0.04 * (y - 2010) for y in years}

    # Adjacent-year pairs following a smooth trend.
    y1 = list(years[:-1])
    y2 = [y + 1 for y in y1]
    lr = [true[later] - true[earlier] for earlier, later in zip(y1, y2)]
    # A spurious single-year jump at 2015 (poorly identified curvature spike).
    y1.append(2014)
    y2.append(2015)
    lr.append(0.5)
    w = [1.0] * len(lr)  # every observation carries unit weight

    y1, y2 = np.array(y1), np.array(y2)
    lr, w = np.array(lr, float), np.array(w, float)

    def solve(lmbda):
        # Temporarily override the module-level smoothing strength; the
        # ``finally`` restores it even if the solver raises.
        original = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = lmbda
        try:
            return index_mod.solve_robust_index(y1, y2, lr, w)
        finally:
            index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = original

    unregularised = solve(0.0)
    regularised = solve(0.2)

    # Index is well-formed for both: every year present and every value finite.
    expected_years = set(range(2010, 2021))
    for label, solution in (
        ("unregularised", unregularised),
        ("regularised", regularised),
    ):
        assert set(solution) == expected_years, label
        assert all(np.isfinite(v) for v in solution.values()), label
    # Baseline pin is asserted for the regularised solve only, as before.
    assert regularised[2010] == 0.0  # baseline year pinned to 0

    def max_curvature(d):
        # Largest absolute second difference of the index in year order.
        betas = np.array([d[y] for y in sorted(d)])
        return float(np.abs(np.diff(betas, 2)).max())

    # Regularisation strictly reduces curvature, and never flattens the genuine
    # uptrend (the index still rises end to end).
    assert max_curvature(regularised) < max_curvature(unregularised)
    assert regularised[2020] > regularised[2010]
|