"""Tests for kNN comparables, guarded blending, type groups and index smoothing."""

from datetime import date

import numpy as np
import polars as pl

from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
from pipeline.transform.price_estimation.knn import build_knn_pool, knn_median_psm
from pipeline.transform.price_estimation.utils import TYPE_GROUPS, type_group_expr


def _flat_index() -> pl.DataFrame:
    """Return a two-row index frame for sector "AA1 1" with zero log-index in 2026."""
    columns = {
        "sector": ["AA1 1", "AA1 1"],
        "type_group": ["Detached", "All"],
        "year": [2026, 2026],
        "log_index": [0.0, 0.0],
    }
    return pl.DataFrame(columns)


def _sale_row(
    postcode: str,
    lat: float,
    lon: float,
    floor_area: float,
    price: float,
    sold_on: date,
) -> dict:
    """Build one "Detached" sale record using the column names the tests feed in."""
    return {
        "Postcode": postcode,
        "Property type": "Detached",
        "lat": lat,
        "lon": lon,
        "Total floor area (sqm)": floor_area,
        "Last known price": price,
        "Date of last transaction": sold_on,
    }


def test_knn_excludes_same_sale_and_uses_stable_comparables():
    """Same (postcode, price, date) rows are not used as the target's comparables."""
    sale_date = date(2026, 1, 1)

    # The target itself: 80 sqm sold for 900k in postcode AA1 1AA.
    target = _sale_row("AA1 1AA", 51.5000, -0.1000, 80.0, 900_000.0, sale_date)
    # Five same-postcode, same-price, same-date siblings (20 sqm each).
    siblings = [
        _sale_row(
            "AA1 1AA", 51.5001 + i * 0.00001, -0.1001, 20.0, 900_000.0, sale_date
        )
        for i in range(5)
    ]
    # Five neighbours in other postcodes: 80 sqm at 200k each.
    neighbours = [
        _sale_row(
            f"AA1 1B{i}", 51.5010 + i * 0.00001, -0.1010, 80.0, 200_000.0, sale_date
        )
        for i in range(5)
    ]

    frame = pl.DataFrame([target, *siblings, *neighbours])
    trees = build_knn_pool(frame.lazy(), _flat_index(), 2026.0)

    # Sale date expressed as whole days since 1970-01-01.
    days_since_epoch = (sale_date - date(1970, 1, 1)).days
    psm = knn_median_psm(
        trees,
        lat=np.array([51.5000]),
        lon=np.array([-0.1000]),
        type_groups=np.array(["Detached"]),
        postcodes=np.array(["AA1 1AA"]),
        last_prices=np.array([900_000.0]),
        last_sale_dates=np.array([days_since_epoch]),
    )

    # The five 900k same-postcode siblings share the target's (postcode, price,
    # date) identity proxy, so every one of them is excluded as a comparable.
    # What remains is the 200k / 80 sqm = 2_500 PSM neighbours. Dropping the
    # same-identity siblings is an INTENTIONAL conservative leakage-prevention
    # tradeoff, not ideal behaviour: there is no per-property id that could
    # tell a target's own resale apart from a distinct bulk-block sibling sold
    # the same day at the same price. See the _sale_identity_matches docstring.
    assert psm[0] == 2_500.0


def test_knn_median_psm_is_deterministic():
    """Reproducibility guard (BUG #6).

    Within-postcode neighbours are co-located (one centroid per postcode), so
    for dense postcodes the kNN result depends on an arbitrary same-postcode
    subset. That is acceptable, but it MUST be stable: two identical calls
    against the same trees/inputs return identical output, so future refactors
    cannot silently introduce run-to-run nondeterminism.
    """
    sale_date = date(2026, 1, 1)
    # Forty sales in a single postcode, each with a slightly different price.
    rows = [
        _sale_row(
            "AA1 1AA",
            51.5000 + i * 0.00001,
            -0.1000,
            80.0,
            200_000.0 + i * 1_000.0,
            sale_date,
        )
        for i in range(40)
    ]
    trees = build_knn_pool(pl.DataFrame(rows).lazy(), _flat_index(), 2026.0)

    query = {
        "lat": np.array([51.5000, 51.5002]),
        "lon": np.array([-0.1000, -0.1000]),
        "type_groups": np.array(["Detached", "Detached"]),
    }
    first = knn_median_psm(trees, **query)
    second = knn_median_psm(trees, **query)

    assert np.array_equal(first, second)


def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
    """An unstable kNN estimate falls back to the index; a 10x uplift survives."""
    index_estimates = np.array([120_000.0, 1_000_000.0])
    knn_estimates = np.array([5_000_000.0, 1_000_000.0])
    previous_prices = np.array([100_000.0, 100_000.0])

    blended = guarded_blend_estimates(
        index_est=index_estimates,
        knn_est=knn_estimates,
        last_prices=previous_prices,
    )

    # Property 0: the kNN estimate is unstable (>2x index), so it is dropped
    # and the index estimate is kept.
    assert blended[0] == 120_000.0
    # Property 1: a 10x uplift over the last price is legitimate appreciation
    # and is no longer truncated (cap raised from 6x to 20x).
    assert blended[1] == 1_000_000.0


def test_guarded_blend_caps_uplift_at_20x_last_price():
    """A 50x index estimate over the last price is capped at the 20x ceiling."""
    blended = guarded_blend_estimates(
        index_est=np.array([5_000_000.0]),
        knn_est=np.array([np.nan]),
        last_prices=np.array([100_000.0]),
    )

    assert blended[0] == 2_000_000.0  # 100_000 * 20


def test_bungalow_is_not_a_dead_price_index_type_group():
    """"Bungalow" is not in TYPE_GROUPS and maps to a null type_group."""
    property_types = pl.DataFrame({"Property type": ["Bungalow", "Other"]})
    df = property_types.with_columns(type_group_expr())

    assert "Bungalow" not in TYPE_GROUPS
    assert df["type_group"].to_list() == [None, None]


def test_temporal_regularization_damps_curvature_without_breaking_solve():
    """The second-difference prior reduces year-to-year curvature and keeps the
    index well-formed (all years present, finite, contiguous)."""
    from pipeline.transform.price_estimation import index as index_mod

    years = np.arange(2010, 2021)
    trend = {year: 0.04 * (year - 2010) for year in years}
    pair_starts = years[:-1]

    # Adjacent-year pairs following a smooth trend, followed by one spurious
    # single-year jump at 2015 (a poorly identified curvature spike).
    from_years = np.array([*pair_starts, 2014])
    to_years = np.array([*(year + 1 for year in pair_starts), 2015])
    log_ratios = np.array(
        [*(trend[year + 1] - trend[year] for year in pair_starts), 0.5],
        float,
    )
    weights = np.array([1.0] * (len(pair_starts) + 1), float)

    def solve(lmbda):
        # Swap in the requested smoothness weight for one solve, then restore
        # the module constant even if the solver raises.
        saved = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = lmbda
        try:
            return index_mod.solve_robust_index(
                from_years, to_years, log_ratios, weights
            )
        finally:
            index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved

    def max_curvature(d):
        # Largest absolute second difference of the index taken in year order.
        ordered = np.array([d[year] for year in sorted(d)])
        second_differences = np.diff(ordered, n=2)
        return float(np.abs(second_differences).max())

    unregularised = solve(0.0)
    regularised = solve(0.2)

    # The regularised index is well-formed.
    assert set(regularised) == set(range(2010, 2021))
    assert all(np.isfinite(value) for value in regularised.values())
    assert regularised[2010] == 0.0  # baseline year pinned to 0

    # Regularisation strictly reduces curvature, and never flattens the genuine
    # uptrend (the index still rises end to end).
    assert max_curvature(regularised) < max_curvature(unregularised)
    assert regularised[2020] > regularised[2010]