Update data

2026-05-14 08:17:10 +01:00 · 2026-05-14 08:17:10 +01:00 · 273d7a83ee
commit 273d7a83ee
parent a4103b0896
15 changed files with 716 additions and 316 deletions
--- a/pipeline/transform/price_estimation/backtest.py
+++ b/pipeline/transform/price_estimation/backtest.py
@ -227,7 +227,18 @@ def main():
    fa = test["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()

    print("\nComputing kNN estimates...")
-    knn_psm = knn_median_psm(trees, lat, lon, tg)
+    last_sale_dates = (
+        test["input_date"].dt.epoch("d").fill_null(-1).to_numpy().astype(np.int64)
+    )
+    knn_psm = knn_median_psm(
+        trees,
+        lat,
+        lon,
+        tg,
+        postcodes=test["Postcode"].fill_null("").to_numpy(),
+        last_prices=test["input_price"].cast(pl.Float64).to_numpy(),
+        last_sale_dates=last_sale_dates,
+    )

    # Temporal adjustment: pool PSM is at ref, adjust to actual
    log_idx_actual = test["log_index_actual"].to_numpy().astype(np.float64)
--- a/pipeline/transform/price_estimation/estimate.py
+++ b/pipeline/transform/price_estimation/estimate.py
@ -13,6 +13,7 @@ for lat/lon needed by kNN, then drops those columns before writing.
 import argparse
 from pathlib import Path

+import numpy as np
 import polars as pl

 from pipeline.transform.price_estimation.knn import (
@ -28,6 +29,45 @@ from pipeline.transform.price_estimation.utils import (
    type_group_expr,
 )

+MAX_KNN_TO_INDEX_RATIO = 2.0  # kNN above this multiple of the index is not blended
+MIN_KNN_TO_INDEX_RATIO = 0.5  # kNN below this multiple of the index is not blended
+MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0  # cap on estimate as a multiple of last price
+
+
+def guarded_blend_estimates(
+    index_est: np.ndarray,
+    knn_est: np.ndarray,
+    last_prices: np.ndarray,
+    weight: float = KNN_BLEND_WEIGHT,
+) -> np.ndarray:
+    """Blend index and kNN estimates, ignoring kNN values far from the index.
+
+    kNN is mixed in with ``weight`` only where both estimates are finite and
+    positive and kNN lies within the MIN/MAX_KNN_TO_INDEX_RATIO band around the
+    index; other rows keep the index (NaN if it is not finite and positive).
+    Results are capped at MAX_ESTIMATE_TO_LAST_PRICE_RATIO x a valid last price.
+    """
+    index_vals = index_est.astype(np.float64, copy=False)
+    knn_vals = knn_est.astype(np.float64, copy=False)
+    prior_prices = last_prices.astype(np.float64, copy=False)
+
+    index_ok = np.isfinite(index_vals) & (index_vals > 0)
+    knn_ok = np.isfinite(knn_vals) & (knn_vals > 0)
+    lower = index_vals * MIN_KNN_TO_INDEX_RATIO
+    upper = index_vals * MAX_KNN_TO_INDEX_RATIO
+    blendable = index_ok & knn_ok & (knn_vals >= lower) & (knn_vals <= upper)
+
+    # Start from the index-only answer, then overwrite the blendable rows.
+    estimate = np.where(index_ok, index_vals, np.nan)
+    mixed = (1 - weight) * index_vals + weight * knn_vals
+    estimate = np.where(blendable, mixed, estimate)
+
+    # Cap only rows where both the last price and the estimate are usable.
+    ceiling = prior_prices * MAX_ESTIMATE_TO_LAST_PRICE_RATIO
+    price_ok = np.isfinite(prior_prices) & (prior_prices > 0)
+    can_cap = price_ok & np.isfinite(ceiling) & np.isfinite(estimate)
+    return np.where(can_cap, np.minimum(estimate, ceiling), estimate)
+

 def main():
    parser = argparse.ArgumentParser(
@ -130,36 +170,54 @@ def main():
    lon = df["lon"].cast(pl.Float64).to_numpy()
    tg = df["_type_group"].fill_null("").to_numpy()
    fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
+    last_prices = (
+        df["Last known price"].cast(pl.Float64).fill_null(float("nan")).to_numpy()
+    )
+    last_sale_dates = (
+        df["Date of last transaction"]
+        .dt.epoch("d")
+        .fill_null(-1)
+        .to_numpy()
+        .astype(np.int64)
+    )

-    knn_psm = knn_median_psm(trees, lat, lon, tg)
+    knn_psm = knn_median_psm(
+        trees,
+        lat,
+        lon,
+        tg,
+        postcodes=df["Postcode"].fill_null("").to_numpy(),
+        last_prices=last_prices,
+        last_sale_dates=last_sale_dates,
+    )
    knn_est = knn_psm * fa  # No temporal adj: ref == current

    df = df.with_columns(
        pl.Series("_knn_est", knn_est, dtype=pl.Float64),
    )

-    # Blend: where kNN available, use weighted average; else keep index
+    # Blend only when kNN is close to the index estimate; otherwise keep index.
+    index_est = (
+        df["Estimated current price"]
+        .cast(pl.Float64)
+        .fill_null(float("nan"))
+        .to_numpy()
+    )
+    blended = guarded_blend_estimates(index_est, knn_est, last_prices)
    df = df.with_columns(
-        pl.when(
-            pl.col("Estimated current price").is_not_null()
-            & pl.col("_knn_est").is_not_null()
-            & pl.col("_knn_est").is_finite()
-            & (pl.col("_knn_est") > 0)
-        )
-        .then(
-            (1 - KNN_BLEND_WEIGHT) * pl.col("Estimated current price")
-            + KNN_BLEND_WEIGHT * pl.col("_knn_est")
-        )
-        .when(pl.col("Estimated current price").is_not_null())
-        .then(pl.col("Estimated current price"))
-        .otherwise(pl.lit(None))
-        .alias("Estimated current price"),
+        pl.Series("_index_est", index_est, dtype=pl.Float64),
+        pl.Series("Estimated current price", blended, dtype=pl.Float64),
+    ).with_columns(
+        pl.col("Estimated current price").fill_nan(None),
    )

    n_blended = df.filter(
        pl.col("_knn_est").is_not_null()
        & pl.col("_knn_est").is_finite()
        & (pl.col("_knn_est") > 0)
+        & (pl.col("_index_est").is_not_null())
+        & (pl.col("_knn_est") >= pl.col("_index_est") * MIN_KNN_TO_INDEX_RATIO)
+        & (pl.col("_knn_est") <= pl.col("_index_est") * MAX_KNN_TO_INDEX_RATIO)
        & pl.col("Estimated current price").is_not_null()
    ).height
    print(f"  kNN blended: {n_blended:,} of {n_estimated:,} estimates")
--- a/pipeline/transform/price_estimation/knn.py
+++ b/pipeline/transform/price_estimation/knn.py
@ -21,6 +21,10 @@ from pipeline.transform.price_estimation.utils import (
 KNN_K = 20
 KNN_MIN_NEIGHBORS = 5
 KNN_BLEND_WEIGHT = 0.35
+MIN_COMPARABLE_FLOOR_AREA_SQM = 15.0  # pool rows with smaller floor area are dropped
+MAX_COMPARABLE_FLOOR_AREA_SQM = 1_000.0  # pool rows with larger floor area are dropped
+MIN_COMPARABLE_PSM = 500.0  # pool rows with lower adjusted PSM are dropped
+MAX_COMPARABLE_PSM = 50_000.0  # pool rows with higher adjusted PSM are dropped


 def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
@ -33,13 +37,14 @@ def build_knn_pool(
    index: pl.DataFrame,
    ref_frac_year: float,
    max_sale_year: int | None = None,
-) -> dict[str, tuple[KDTree, np.ndarray]]:
+) -> dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
    """Build per-type_group KD-trees of index-adjusted price-per-sqm.

    Adjusts all pool properties' sale prices to ref_frac_year using the index,
    then builds a KD-tree per type_group for nearest-neighbor queries.

-    Returns dict mapping type_group -> (KDTree over scaled lat/lon, adjusted_psm array).
+    Returns dict mapping type_group to KDTree, adjusted PSM, and sale identity
+    arrays used to keep the target sale out of its own comparable set.
    """
    print("Building kNN pool...")
    lf = pl.scan_parquet(source) if isinstance(source, Path) else source
@ -55,7 +60,8 @@ def build_knn_pool(
        pl.col("lat").is_not_null(),
        pl.col("lon").is_not_null(),
        pl.col("Total floor area (sqm)").is_not_null(),
-        pl.col("Total floor area (sqm)") > 0,
+        pl.col("Total floor area (sqm)") >= MIN_COMPARABLE_FLOOR_AREA_SQM,
+        pl.col("Total floor area (sqm)") <= MAX_COMPARABLE_FLOOR_AREA_SQM,
        pl.col("Last known price").is_not_null(),
        pl.col("Last known price") > 0,
        pl.col("Postcode").is_not_null(),
@ -97,12 +103,13 @@ def build_knn_pool(
    ).filter(
        pl.col("_adj_psm").is_not_null(),
        pl.col("_adj_psm").is_finite(),
-        pl.col("_adj_psm") > 0,
+        pl.col("_adj_psm") >= MIN_COMPARABLE_PSM,
+        pl.col("_adj_psm") <= MAX_COMPARABLE_PSM,
    )
    print(f"  {len(pool):,} after index adjustment")

    # Build per-type KD-trees
-    trees: dict[str, tuple[KDTree, np.ndarray]] = {}
+    trees: dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]] = {}
    for tg in TYPE_GROUPS:
        sub = pool.filter(pl.col("type_group") == tg)
        n = len(sub)
@ -111,19 +118,49 @@ def build_knn_pool(
        lat = sub["lat"].to_numpy().astype(np.float64)
        lon = sub["lon"].to_numpy().astype(np.float64)
        psm = sub["_adj_psm"].to_numpy().astype(np.float64)
+        postcodes = sub["Postcode"].fill_null("").to_numpy()
+        prices = sub["Last known price"].to_numpy().astype(np.float64)
+        sale_dates = (
+            sub["Date of last transaction"]
+            .dt.epoch("d")
+            .fill_null(-1)
+            .to_numpy()
+            .astype(np.int64)
+        )
        tree = KDTree(_scale_coords(lat, lon))
-        trees[tg] = (tree, psm)
+        trees[tg] = (tree, psm, postcodes, prices, sale_dates)
        print(f"    {tg}: {n:,}")

    return trees


+def _sale_identity_matches(
+    pool_postcodes: np.ndarray,
+    pool_prices: np.ndarray,
+    pool_sale_dates: np.ndarray,
+    target_postcode: str,
+    target_price: float,
+    target_sale_date: int,
+) -> np.ndarray:
+    """Mask pool sales with the target's postcode, sale date and price (within 0.5)."""
+    no_target = not target_postcode or not np.isfinite(target_price)
+    if no_target or target_sale_date < 0:
+        return np.zeros(len(pool_postcodes), dtype=bool)
+    same_postcode = pool_postcodes == target_postcode
+    same_date = pool_sale_dates == target_sale_date
+    same_price = np.isclose(pool_prices, target_price, rtol=0.0, atol=0.5)
+    return same_postcode & np.isfinite(pool_prices) & same_price & same_date
+
+
 def knn_median_psm(
-    trees: dict[str, tuple[KDTree, np.ndarray]],
+    trees: dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]],
    lat: np.ndarray,
    lon: np.ndarray,
    type_groups: np.ndarray,
    k: int = KNN_K,
+    postcodes: np.ndarray | None = None,
+    last_prices: np.ndarray | None = None,
+    last_sale_dates: np.ndarray | None = None,
 ) -> np.ndarray:
    """Return median adjusted-PSM of k nearest neighbours for each target.

@ -133,21 +170,41 @@ def knn_median_psm(
    n = len(lat)
    result = np.full(n, np.nan)

-    for tg, (tree, psm) in trees.items():
+    for tg, (tree, psm, pool_postcodes, pool_prices, pool_sale_dates) in trees.items():
        mask = (type_groups == tg) & np.isfinite(lat) & np.isfinite(lon)
        idx = np.where(mask)[0]
        if len(idx) == 0:
            continue

-        actual_k = min(k, len(psm))
-        if actual_k < KNN_MIN_NEIGHBORS:
+        query_k = min(max(k * 2, k + KNN_MIN_NEIGHBORS), len(psm))
+        if query_k < KNN_MIN_NEIGHBORS:
            continue

        coords = _scale_coords(lat[idx], lon[idx])
-        _, nn_idx = tree.query(coords, k=actual_k)
+        _, nn_idx = tree.query(coords, k=query_k)
        if nn_idx.ndim == 1:
            nn_idx = nn_idx.reshape(-1, 1)

-        result[idx] = np.nanmedian(psm[nn_idx], axis=1)
+        medians = np.full(len(idx), np.nan)
+        for row_num, target_idx in enumerate(idx):
+            candidates = nn_idx[row_num]
+            if (
+                postcodes is not None
+                and last_prices is not None
+                and last_sale_dates is not None
+            ):
+                same_sale = _sale_identity_matches(
+                    pool_postcodes[candidates],
+                    pool_prices[candidates],
+                    pool_sale_dates[candidates],
+                    str(postcodes[target_idx] or ""),
+                    float(last_prices[target_idx]),
+                    int(last_sale_dates[target_idx]),
+                )
+                candidates = candidates[~same_sale]
+            if len(candidates) >= KNN_MIN_NEIGHBORS:
+                medians[row_num] = np.nanmedian(psm[candidates[:k]])
+
+        result[idx] = medians

    return result
--- a/pipeline/transform/price_estimation/utils.py
+++ b/pipeline/transform/price_estimation/utils.py
@ -19,7 +19,7 @@ TERRACE_TYPES = [
    "Terraced",
 ]
 FLAT_TYPES = ["Flats/Maisonettes"]
-TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
+TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
 SHRINKAGE_K = 50


@ -30,8 +30,6 @@ def type_group_expr():
        .then(pl.lit("Terraced"))
        .when(pl.col("Property type").is_in(FLAT_TYPES))
        .then(pl.lit("Flats"))
-        .when(pl.col("Property type") == "Bungalow")
-        .then(pl.lit("Bungalow"))
        .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"]))
        .then(pl.col("Property type"))
        .otherwise(pl.lit(None))
@ -61,7 +59,7 @@ def hierarchy_keys(sector: str) -> tuple[str, str]:
    return district, area


-NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]
+NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats"]


 def build_hedonic_features(df: pl.DataFrame) -> np.ndarray: