Update data

2026-05-14 08:17:10 +01:00 · 2026-05-14 08:17:10 +01:00 · 273d7a83ee
commit 273d7a83ee
parent a4103b0896
15 changed files with 716 additions and 316 deletions
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -10,7 +10,11 @@ import pyarrow as pa
 import pyarrow.csv as pa_csv
 import pyarrow.parquet as pq

-from ..utils import fuzzy_join_on_postcode
+from ..utils import (
+    fuzzy_join_on_postcode,
+    normalize_address_key,
+    normalize_postcode_key,
+)


 pl.Config.set_tbl_cols(-1)
@ -193,12 +197,15 @@ def main():


 def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Path):
-    epc_base = _scan_epc_certificates(epc_path, temp_dir)
+    epc_base = _scan_epc_certificates(epc_path, temp_dir).with_columns(
+        normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
+        normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
+    )

    # Dedup fork: keep latest certificate per property (existing logic)
    epc = (
        epc_base.sort("inspection_date", descending=True)
-        .group_by("epc_address", "epc_postcode")
+        .group_by("_epc_match_address", "_epc_match_postcode")
        .first()
        .drop("tenure")
    )
@ -216,15 +223,15 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        .with_columns(
            pl.col("number_habitable_rooms")
            .shift(1)
-            .over("epc_address", "epc_postcode")
+            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_rooms"),
            pl.col("total_floor_area")
            .shift(1)
-            .over("epc_address", "epc_postcode")
+            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_area"),
            pl.col("_rating_rank")
            .shift(1)
-            .over("epc_address", "epc_postcode")
+            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_rating_rank"),
        )
        .with_columns(
@ -257,7 +264,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
            .cast(pl.Int32)
            .alias("_event_year"),
        )
-        .group_by("epc_address", "epc_postcode")
+        .group_by("_epc_match_address", "_epc_match_postcode")
        .agg(
            pl.struct(
                pl.col("_event_year").alias("year"),
@ -276,7 +283,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
    # Social tenure fork: flag properties that were ever social housing
    social_tenure = (
        epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
-        .select("epc_address", "epc_postcode")
+        .select("_epc_match_address", "_epc_match_postcode")
        .unique()
        .with_columns(pl.lit("Yes").alias("was_council_house"))
        .collect()
@ -287,12 +294,12 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
    epc = (
        epc.join(
            events.lazy(),
-            on=["epc_address", "epc_postcode"],
+            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .join(
            social_tenure.lazy(),
-            on=["epc_address", "epc_postcode"],
+            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .with_columns(
@ -339,9 +346,23 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
                ignore_nulls=True,
            ).alias("pp_address"),
        )
+        .with_columns(
+            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
+            normalize_postcode_key(pl.col("postcode")).alias("_pp_match_postcode"),
+        )
+        .filter(pl.col("_pp_match_postcode").is_not_null())
+        .with_columns(
+            pl.coalesce("_pp_match_address", "pp_address").alias("_pp_group_address"),
+            pl.col("_pp_match_postcode").alias("_pp_group_postcode"),
+        )
+        .filter(pl.col("pp_address").is_not_null())
        .sort("date_of_transfer")
-        .group_by("pp_address", "postcode", maintain_order=True)
+        .group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
        .agg(
+            pl.col("pp_address").last(),
+            pl.col("postcode").last(),
+            pl.col("_pp_match_address").last(),
+            pl.col("_pp_match_postcode").last(),
            pl.struct(
                pl.col("date_of_transfer").dt.year().alias("year"),
                pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
@ -354,7 +375,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
            pl.col("date_of_transfer").first().alias("first_transfer_date"),
            pl.col("old_new").first(),
        )
-    ).filter(pl.col("pp_address").is_not_null())
+    )

    print("Price paid dataset")
    print(price_paid.head().collect())
@ -405,7 +426,19 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        .then(pl.lit(1, dtype=pl.UInt8))
        .otherwise(pl.lit(None, dtype=pl.UInt8))
        .alias("is_construction_date_approximate"),
-    ).drop("old_new", "first_transfer_date")
+    ).drop(
+        [
+            "old_new",
+            "first_transfer_date",
+            "_pp_match_address",
+            "_pp_match_postcode",
+            "_pp_group_address",
+            "_pp_group_postcode",
+            "_epc_match_address",
+            "_epc_match_postcode",
+        ],
+        strict=False,
+    )

    joined = joined.rename({col: col.lower() for col in joined.columns})

--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -22,6 +22,8 @@ _AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
+    # Runtime provenance for deciding whether missing coordinates are skippable.
+    "ctry25cd",
    # Deprivation
    "Income Score",
    "Employment Score",
@ -86,6 +88,15 @@ _AREA_COLUMNS = [
 _DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
 _DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
 TREE_DENSITY_FEATURE = "Street tree density percentile"
+_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
+    r"^Tree canopy density percentile within \d+m$"
+)
+_RENT_SOURCE_UNAVAILABLE_LADS = {
+    # ONS PIPR does not publish LAD-level private-rent estimates for these
+    # small authorities. Keep rent null there, but fail on any other LAD miss.
+    "E06000053": "Isles of Scilly",
+    "E09000001": "City of London",
+}


 def _is_dynamic_poi_metric_column(column: str) -> bool:
@ -112,6 +123,107 @@ def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    )


+def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame:
+    """Return a lazy (postcode, TREE_DENSITY_FEATURE) frame from the parquet.
+
+    The density column is ``TREE_DENSITY_FEATURE`` when the parquet has it;
+    otherwise the single column whose name matches
+    ``_POSTCODE_TREE_DENSITY_PERCENTILE_RE`` is used and aliased to
+    ``TREE_DENSITY_FEATURE``. The value is cast to Float32, rows with a null
+    postcode are dropped, and the result is de-duplicated on postcode.
+
+    Raises:
+        ValueError: if the parquet has no ``postcode`` column, or if
+            ``TREE_DENSITY_FEATURE`` is absent and the number of matching
+            percentile columns is not exactly one.
+    """
+    tree_density = pl.scan_parquet(tree_density_postcodes_path)
+    # Column names are taken from the lazy scan's schema.
+    columns = set(tree_density.collect_schema().names())
+    if "postcode" not in columns:
+        raise ValueError(
+            f"{tree_density_postcodes_path} is missing required column: postcode"
+        )
+
+    if TREE_DENSITY_FEATURE in columns:
+        density_column = TREE_DENSITY_FEATURE
+    else:
+        # Fall back to a radius-specific percentile column, but only when the
+        # choice is unambiguous.
+        candidates = sorted(
+            c for c in columns if _POSTCODE_TREE_DENSITY_PERCENTILE_RE.match(c)
+        )
+        if len(candidates) != 1:
+            # The middle literal is a plain string (not an f-string), so
+            # "{radius}" appears verbatim in the message as a placeholder.
+            raise ValueError(
+                f'{tree_density_postcodes_path} must contain column "{TREE_DENSITY_FEATURE}" '
+                'or exactly one "Tree canopy density percentile within {radius}m" column; '
+                f"found {len(candidates)} postcode percentile columns"
+            )
+        density_column = candidates[0]
+
+    # NOTE(review): unique() is called without `keep`, so if a postcode occurs
+    # more than once with different densities, which row survives is
+    # presumably unspecified -- confirm the input is one row per postcode.
+    return (
+        tree_density.select(
+            pl.col("postcode"),
+            pl.col(density_column).cast(pl.Float32).alias(TREE_DENSITY_FEATURE),
+        )
+        .drop_nulls(["postcode"])
+        .unique(["postcode"])
+    )
+
+
+def _validate_lad_source_coverage(
+    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
+) -> None:
+    """Check that ethnicity and rental sources cover every LAD in the IoD file.
+
+    The set of expected LADs is the distinct
+    "Local Authority District code (2024)" values read from ``iod_path``.
+    Each other source is anti-joined against that set on its own LAD code
+    column (``Geography_code`` for ethnicity, ``area_code`` for rent).
+
+    Raises:
+        ValueError: if any expected LAD is absent from the ethnicity data, or
+            if an expected LAD is absent from the rental data and is not
+            listed in ``_RENT_SOURCE_UNAVAILABLE_LADS``.
+
+    When the only LADs missing from the rental data are allow-listed ones, a
+    notice is printed and the function returns normally.
+    """
+    # NOTE(review): a null LAD code in the IoD file would presumably never
+    # match in the anti-joins below and so be reported as missing -- confirm
+    # the IoD LAD code column has no nulls.
+    iod_lads = (
+        pl.read_parquet(
+            iod_path,
+            columns=[
+                "Local Authority District code (2024)",
+                "Local Authority District name (2024)",
+            ],
+        )
+        .rename(
+            {
+                "Local Authority District code (2024)": "lad",
+                "Local Authority District name (2024)": "lad_name",
+            }
+        )
+        .unique(["lad"])
+    )
+
+    ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
+        {"Geography_code": "lad"}
+    )
+    # Anti-join keeps the IoD LADs that have no row in the ethnicity data.
+    missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
+    if missing_ethnicity.height > 0:
+        raise ValueError(
+            "Ethnicity data is missing 2024 LAD coverage: "
+            f"{missing_ethnicity.to_dicts()}"
+        )
+
+    rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
+        {"area_code": "lad"}
+    )
+    missing_rent = iod_lads.join(rental_lads, on="lad", how="anti").sort("lad")
+    # Only LADs outside the allow-list count as a hard failure.
+    unexpected_missing_rent = missing_rent.filter(
+        ~pl.col("lad").is_in(list(_RENT_SOURCE_UNAVAILABLE_LADS))
+    )
+    if unexpected_missing_rent.height > 0:
+        raise ValueError(
+            "Rental data is missing 2024 LAD coverage: "
+            f"{unexpected_missing_rent.to_dicts()}"
+        )
+    # Reaching here means every remaining missing LAD is allow-listed.
+    if missing_rent.height > 0:
+        print(
+            "PIPR has no LAD-level rent estimates for source-unavailable LADs; "
+            f"rent will remain null there: {missing_rent.to_dicts()}"
+        )
+
+
+def _validate_property_postcodes(df: pl.DataFrame) -> None:
+    """Raise if any row of ``df`` has a null or blank ``Postcode``.
+
+    A postcode counts as blank when, after casting to string and stripping
+    surrounding characters with ``str.strip_chars()``, it equals "".
+
+    Raises:
+        ValueError: with the number of offending rows and a sample of up to
+            10 of them, limited to whichever of "Postcode",
+            "Address per Property Register" and "Last known price" exist
+            in ``df``.
+    """
+    invalid = df.filter(
+        pl.col("Postcode").is_null()
+        | (pl.col("Postcode").cast(pl.Utf8).str.strip_chars() == "")
+    )
+    if invalid.height == 0:
+        return
+
+    # Select only the diagnostic columns that are actually present so the
+    # error path itself cannot fail on a missing column.
+    sample_cols = [
+        col
+        for col in ("Postcode", "Address per Property Register", "Last known price")
+        if col in invalid.columns
+    ]
+    sample = invalid.select(sample_cols).head(10).to_dicts()
+    raise ValueError(
+        "Property rows missing a postcode after merge: "
+        f"{invalid.height} rows. Sample: {sample}"
+    )
+
+
 def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
@ -126,12 +238,14 @@ def _build(
    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
-    tree_density_addresses_path: Path | None = None,
+    tree_density_postcodes_path: Path | None = None,
 ) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Build postcode and properties dataframes from epc_pp + auxiliary data.

    Returns (postcode_df, properties_df).
    """
+    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
+
    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
@ -152,9 +266,15 @@ def _build(
        .drop("new_postcode")
    )

+    arcgis_raw = pl.scan_parquet(arcgis_path)
+    postcode_country = arcgis_raw.select(
+        pl.col("pcds").alias("postcode"),
+        pl.col("ctry25cd"),
+    ).unique(["postcode"])
+    wide = wide.join(postcode_country, on="postcode", how="left")
+
    arcgis = (
-        pl.scan_parquet(arcgis_path)
-        .filter(pl.col("ctry25cd") == "E92000001")  # England only
+        arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")  # England only
        .filter(pl.col("doterm").is_null())  # Active postcodes only
        # NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
        # Alias them back to the short canonical names used across the
@ -191,7 +311,9 @@ def _build(
        .cast(pl.UInt8)
        .alias("_bedrooms"),
    )
-    rental = pl.scan_parquet(rental_prices_path)
+    rental = pl.scan_parquet(rental_prices_path).select(
+        "area_code", "bedrooms", "mean_monthly_rent"
+    )
    wide = wide.join(
        rental,
        left_on=["Local Authority District code (2024)", "_bedrooms"],
@ -260,17 +382,9 @@ def _build(
    school_proximity = pl.scan_parquet(school_proximity_path)
    wide = wide.join(school_proximity, on="postcode", how="left")

-    if tree_density_addresses_path is not None:
-        tree_density = (
-            pl.scan_parquet(tree_density_addresses_path)
-            .select(
-                pl.col("postcode"),
-                pl.col("pp_address"),
-                pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
-            )
-            .unique(["postcode", "pp_address"])
-        )
-        wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
+    if tree_density_postcodes_path is not None:
+        tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
+        wide = wide.join(tree_density, on="postcode", how="left")

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages.  Tiers: Gigabit ≥1000, UFBB ≥300,
@ -415,6 +529,7 @@ def _build(

    print("Collecting with streaming engine...")
    df = wide.collect(engine="streaming")
+    _validate_property_postcodes(df)

    # Split into postcode-level and property-level dataframes
    area_cols = [
@ -508,10 +623,10 @@ def main():
        help="2024 General Election results by constituency parquet file",
    )
    parser.add_argument(
-        "--tree-density-addresses",
+        "--tree-density-postcodes",
        type=Path,
        required=False,
-        help="Address-level tree density parquet from pipeline.transform.tree_density",
+        help="Postcode-level tree density parquet from pipeline.transform.tree_density",
    )
    parser.add_argument(
        "--output-postcodes",
@ -541,7 +656,7 @@ def main():
        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
-        tree_density_addresses_path=args.tree_density_addresses,
+        tree_density_postcodes_path=args.tree_density_postcodes,
    )

    print(f"\nPostcode columns: {postcode_df.columns}")
--- a/pipeline/transform/price_estimation/backtest.py
+++ b/pipeline/transform/price_estimation/backtest.py
@ -227,7 +227,18 @@ def main():
    fa = test["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()

    print("\nComputing kNN estimates...")
-    knn_psm = knn_median_psm(trees, lat, lon, tg)
+    last_sale_dates = (
+        test["input_date"].dt.epoch("d").fill_null(-1).to_numpy().astype(np.int64)
+    )
+    knn_psm = knn_median_psm(
+        trees,
+        lat,
+        lon,
+        tg,
+        postcodes=test["Postcode"].fill_null("").to_numpy(),
+        last_prices=test["input_price"].cast(pl.Float64).to_numpy(),
+        last_sale_dates=last_sale_dates,
+    )

    # Temporal adjustment: pool PSM is at ref, adjust to actual
    log_idx_actual = test["log_index_actual"].to_numpy().astype(np.float64)
--- a/pipeline/transform/price_estimation/estimate.py
+++ b/pipeline/transform/price_estimation/estimate.py
@ -13,6 +13,7 @@ for lat/lon needed by kNN, then drops those columns before writing.
 import argparse
 from pathlib import Path

+import numpy as np
 import polars as pl

 from pipeline.transform.price_estimation.knn import (
@ -28,6 +29,45 @@ from pipeline.transform.price_estimation.utils import (
    type_group_expr,
 )

+# A kNN estimate is blended into the index estimate only when it lies within
+# [MIN_KNN_TO_INDEX_RATIO, MAX_KNN_TO_INDEX_RATIO] times the index estimate.
+MAX_KNN_TO_INDEX_RATIO = 2.0
+MIN_KNN_TO_INDEX_RATIO = 0.5
+# The final estimate is capped at this multiple of the last known sale price.
+MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0
+
+
+def guarded_blend_estimates(
+    index_est: np.ndarray,
+    knn_est: np.ndarray,
+    last_prices: np.ndarray,
+    weight: float = KNN_BLEND_WEIGHT,
+) -> np.ndarray:
+    """Blend only stable kNN estimates and cap final uplift from last sale price.
+
+    Per element:
+
+    * If the index estimate is not finite or not positive, the result is NaN.
+    * Otherwise, if the kNN estimate is finite, positive and within
+      ``[MIN_KNN_TO_INDEX_RATIO, MAX_KNN_TO_INDEX_RATIO]`` times the index
+      estimate, the result is ``(1 - weight) * index + weight * knn``.
+    * Otherwise the index estimate is returned unchanged.
+
+    Finally, where the last price is finite and positive, the result is
+    limited to at most ``last_price * MAX_ESTIMATE_TO_LAST_PRICE_RATIO``.
+
+    Args:
+        index_est: index-based estimates; NaN marks "no estimate".
+        knn_est: kNN-based estimates; NaN marks "no estimate".
+        last_prices: last known sale prices; NaN marks "unknown".
+        weight: share given to the kNN estimate in the blend.
+
+    Returns:
+        Array of blended estimates, NaN where no index estimate is usable.
+    """
+    # The inputs must be ndarrays (``.astype`` is called on them directly).
+    index_est = index_est.astype(np.float64, copy=False)
+    knn_est = knn_est.astype(np.float64, copy=False)
+    last_prices = last_prices.astype(np.float64, copy=False)
+
+    has_index = np.isfinite(index_est) & (index_est > 0)
+    has_knn = np.isfinite(knn_est) & (knn_est > 0)
+    # "Stable" means the kNN estimate is usable and does not disagree with the
+    # index estimate by more than the configured ratio band.
+    stable_knn = has_knn & (
+        has_index
+        & (knn_est >= index_est * MIN_KNN_TO_INDEX_RATIO)
+        & (knn_est <= index_est * MAX_KNN_TO_INDEX_RATIO)
+    )
+
+    blended = np.where(
+        has_index & stable_knn,
+        (1 - weight) * index_est + weight * knn_est,
+        np.where(has_index, index_est, np.nan),
+    )
+
+    # NaN cap means "no cap" for that element.
+    cap = np.where(
+        np.isfinite(last_prices) & (last_prices > 0),
+        last_prices * MAX_ESTIMATE_TO_LAST_PRICE_RATIO,
+        np.nan,
+    )
+    # Apply the cap only where both sides are finite so NaN never leaks in
+    # through np.minimum.
+    return np.where(
+        np.isfinite(cap) & np.isfinite(blended), np.minimum(blended, cap), blended
+    )
+

 def main():
    parser = argparse.ArgumentParser(
@ -130,36 +170,54 @@ def main():
    lon = df["lon"].cast(pl.Float64).to_numpy()
    tg = df["_type_group"].fill_null("").to_numpy()
    fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
+    last_prices = (
+        df["Last known price"].cast(pl.Float64).fill_null(float("nan")).to_numpy()
+    )
+    last_sale_dates = (
+        df["Date of last transaction"]
+        .dt.epoch("d")
+        .fill_null(-1)
+        .to_numpy()
+        .astype(np.int64)
+    )

-    knn_psm = knn_median_psm(trees, lat, lon, tg)
+    knn_psm = knn_median_psm(
+        trees,
+        lat,
+        lon,
+        tg,
+        postcodes=df["Postcode"].fill_null("").to_numpy(),
+        last_prices=last_prices,
+        last_sale_dates=last_sale_dates,
+    )
    knn_est = knn_psm * fa  # No temporal adj: ref == current

    df = df.with_columns(
        pl.Series("_knn_est", knn_est, dtype=pl.Float64),
    )

-    # Blend: where kNN available, use weighted average; else keep index
+    # Blend only when kNN is close to the index estimate; otherwise keep index.
+    index_est = (
+        df["Estimated current price"]
+        .cast(pl.Float64)
+        .fill_null(float("nan"))
+        .to_numpy()
+    )
+    blended = guarded_blend_estimates(index_est, knn_est, last_prices)
    df = df.with_columns(
-        pl.when(
-            pl.col("Estimated current price").is_not_null()
-            & pl.col("_knn_est").is_not_null()
-            & pl.col("_knn_est").is_finite()
-            & (pl.col("_knn_est") > 0)
-        )
-        .then(
-            (1 - KNN_BLEND_WEIGHT) * pl.col("Estimated current price")
-            + KNN_BLEND_WEIGHT * pl.col("_knn_est")
-        )
-        .when(pl.col("Estimated current price").is_not_null())
-        .then(pl.col("Estimated current price"))
-        .otherwise(pl.lit(None))
-        .alias("Estimated current price"),
+        pl.Series("_index_est", index_est, dtype=pl.Float64),
+        pl.Series("Estimated current price", blended, dtype=pl.Float64),
+    ).with_columns(
+        pl.col("Estimated current price").fill_nan(None),
    )

    n_blended = df.filter(
        pl.col("_knn_est").is_not_null()
        & pl.col("_knn_est").is_finite()
        & (pl.col("_knn_est") > 0)
+        & (pl.col("_index_est").is_not_null())
+        & (pl.col("_knn_est") >= pl.col("_index_est") * MIN_KNN_TO_INDEX_RATIO)
+        & (pl.col("_knn_est") <= pl.col("_index_est") * MAX_KNN_TO_INDEX_RATIO)
        & pl.col("Estimated current price").is_not_null()
    ).height
    print(f"  kNN blended: {n_blended:,} of {n_estimated:,} estimates")
--- a/pipeline/transform/price_estimation/knn.py
+++ b/pipeline/transform/price_estimation/knn.py
@ -21,6 +21,10 @@ from pipeline.transform.price_estimation.utils import (
 KNN_K = 20
 KNN_MIN_NEIGHBORS = 5
 KNN_BLEND_WEIGHT = 0.35
+# Inclusive bounds applied when building the kNN pool: rows whose floor area
+# or index-adjusted price-per-sqm falls outside them are left out of the pool.
+MIN_COMPARABLE_FLOOR_AREA_SQM = 15.0
+MAX_COMPARABLE_FLOOR_AREA_SQM = 1_000.0
+MIN_COMPARABLE_PSM = 500.0
+MAX_COMPARABLE_PSM = 50_000.0


 def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
@ -33,13 +37,14 @@ def build_knn_pool(
    index: pl.DataFrame,
    ref_frac_year: float,
    max_sale_year: int | None = None,
-) -> dict[str, tuple[KDTree, np.ndarray]]:
+) -> dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
    """Build per-type_group KD-trees of index-adjusted price-per-sqm.

    Adjusts all pool properties' sale prices to ref_frac_year using the index,
    then builds a KD-tree per type_group for nearest-neighbor queries.

-    Returns dict mapping type_group -> (KDTree over scaled lat/lon, adjusted_psm array).
+    Returns dict mapping type_group to KDTree, adjusted PSM, and sale identity
+    arrays used to keep the target sale out of its own comparable set.
    """
    print("Building kNN pool...")
    lf = pl.scan_parquet(source) if isinstance(source, Path) else source
@ -55,7 +60,8 @@ def build_knn_pool(
        pl.col("lat").is_not_null(),
        pl.col("lon").is_not_null(),
        pl.col("Total floor area (sqm)").is_not_null(),
-        pl.col("Total floor area (sqm)") > 0,
+        pl.col("Total floor area (sqm)") >= MIN_COMPARABLE_FLOOR_AREA_SQM,
+        pl.col("Total floor area (sqm)") <= MAX_COMPARABLE_FLOOR_AREA_SQM,
        pl.col("Last known price").is_not_null(),
        pl.col("Last known price") > 0,
        pl.col("Postcode").is_not_null(),
@ -97,12 +103,13 @@ def build_knn_pool(
    ).filter(
        pl.col("_adj_psm").is_not_null(),
        pl.col("_adj_psm").is_finite(),
-        pl.col("_adj_psm") > 0,
+        pl.col("_adj_psm") >= MIN_COMPARABLE_PSM,
+        pl.col("_adj_psm") <= MAX_COMPARABLE_PSM,
    )
    print(f"  {len(pool):,} after index adjustment")

    # Build per-type KD-trees
-    trees: dict[str, tuple[KDTree, np.ndarray]] = {}
+    trees: dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]] = {}
    for tg in TYPE_GROUPS:
        sub = pool.filter(pl.col("type_group") == tg)
        n = len(sub)
@ -111,19 +118,49 @@ def build_knn_pool(
        lat = sub["lat"].to_numpy().astype(np.float64)
        lon = sub["lon"].to_numpy().astype(np.float64)
        psm = sub["_adj_psm"].to_numpy().astype(np.float64)
+        postcodes = sub["Postcode"].fill_null("").to_numpy()
+        prices = sub["Last known price"].to_numpy().astype(np.float64)
+        sale_dates = (
+            sub["Date of last transaction"]
+            .dt.epoch("d")
+            .fill_null(-1)
+            .to_numpy()
+            .astype(np.int64)
+        )
        tree = KDTree(_scale_coords(lat, lon))
-        trees[tg] = (tree, psm)
+        trees[tg] = (tree, psm, postcodes, prices, sale_dates)
        print(f"    {tg}: {n:,}")

    return trees


+def _sale_identity_matches(
+    pool_postcodes: np.ndarray,
+    pool_prices: np.ndarray,
+    pool_sale_dates: np.ndarray,
+    target_postcode: str,
+    target_price: float,
+    target_sale_date: int,
+) -> np.ndarray:
+    """Return a boolean mask of pool entries that look like the target's own sale.
+
+    An entry matches when its postcode equals ``target_postcode``, its price
+    is finite and within 0.5 (absolute) of ``target_price``, and its sale date
+    equals ``target_sale_date``.
+
+    If the target has an empty postcode, a non-finite price, or a negative
+    sale date, nothing can be identified and an all-False mask is returned.
+    Callers in this module's pipeline fill missing postcodes with "" and
+    missing sale dates with -1, which routes them to that case.
+
+    The three pool arrays are expected to be aligned element-wise; the mask
+    has the length of ``pool_postcodes``.
+    """
+    if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
+        return np.zeros(len(pool_postcodes), dtype=bool)
+    return (
+        (pool_postcodes == target_postcode)
+        & np.isfinite(pool_prices)
+        # rtol=0.0 makes this a purely absolute tolerance of 0.5.
+        & np.isclose(pool_prices, target_price, rtol=0.0, atol=0.5)
+        & (pool_sale_dates == target_sale_date)
+    )
+
+
 def knn_median_psm(
-    trees: dict[str, tuple[KDTree, np.ndarray]],
+    trees: dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]],
    lat: np.ndarray,
    lon: np.ndarray,
    type_groups: np.ndarray,
    k: int = KNN_K,
+    postcodes: np.ndarray | None = None,
+    last_prices: np.ndarray | None = None,
+    last_sale_dates: np.ndarray | None = None,
 ) -> np.ndarray:
    """Return median adjusted-PSM of k nearest neighbours for each target.

@ -133,21 +170,41 @@ def knn_median_psm(
    n = len(lat)
    result = np.full(n, np.nan)

-    for tg, (tree, psm) in trees.items():
+    for tg, (tree, psm, pool_postcodes, pool_prices, pool_sale_dates) in trees.items():
        mask = (type_groups == tg) & np.isfinite(lat) & np.isfinite(lon)
        idx = np.where(mask)[0]
        if len(idx) == 0:
            continue

-        actual_k = min(k, len(psm))
-        if actual_k < KNN_MIN_NEIGHBORS:
+        query_k = min(max(k * 2, k + KNN_MIN_NEIGHBORS), len(psm))
+        if query_k < KNN_MIN_NEIGHBORS:
            continue

        coords = _scale_coords(lat[idx], lon[idx])
-        _, nn_idx = tree.query(coords, k=actual_k)
+        _, nn_idx = tree.query(coords, k=query_k)
        if nn_idx.ndim == 1:
            nn_idx = nn_idx.reshape(-1, 1)

-        result[idx] = np.nanmedian(psm[nn_idx], axis=1)
+        medians = np.full(len(idx), np.nan)
+        for row_num, target_idx in enumerate(idx):
+            candidates = nn_idx[row_num]
+            if (
+                postcodes is not None
+                and last_prices is not None
+                and last_sale_dates is not None
+            ):
+                same_sale = _sale_identity_matches(
+                    pool_postcodes[candidates],
+                    pool_prices[candidates],
+                    pool_sale_dates[candidates],
+                    str(postcodes[target_idx] or ""),
+                    float(last_prices[target_idx]),
+                    int(last_sale_dates[target_idx]),
+                )
+                candidates = candidates[~same_sale]
+            if len(candidates) >= KNN_MIN_NEIGHBORS:
+                medians[row_num] = np.nanmedian(psm[candidates[:k]])
+
+        result[idx] = medians

    return result
--- a/pipeline/transform/price_estimation/utils.py
+++ b/pipeline/transform/price_estimation/utils.py
@ -19,7 +19,7 @@ TERRACE_TYPES = [
    "Terraced",
 ]
 FLAT_TYPES = ["Flats/Maisonettes"]
-TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
+TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
 SHRINKAGE_K = 50


@ -30,8 +30,6 @@ def type_group_expr():
        .then(pl.lit("Terraced"))
        .when(pl.col("Property type").is_in(FLAT_TYPES))
        .then(pl.lit("Flats"))
-        .when(pl.col("Property type") == "Bungalow")
-        .then(pl.lit("Bungalow"))
        .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"]))
        .then(pl.col("Property type"))
        .otherwise(pl.lit(None))
@ -61,7 +59,7 @@ def hierarchy_keys(sector: str) -> tuple[str, str]:
    return district, area


-NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]
+NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats"]


 def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -15,12 +15,21 @@ DROP_CATEGORIES = {
    "amenity/bicycle_parking",
    "amenity/binoculars",
    "amenity/boot_scraper",
+    "amenity/bus_garage",
    "amenity/check_in",
    "amenity/clock",
+    "amenity/clothes_dryer",
+    "amenity/coast_guard",
+    "amenity/coffin_rest",
    "amenity/compressed_air",
+    "amenity/court_yard",
    "amenity/donation_box",
    "amenity/dressing_room",
    "amenity/drinking_water",
+    "emergency/water_tank",
+    "leisure/bleachers",
+    "leisure/schoolyard",
+    "public_transport/pay_scale_area",
    "shop/taxi",
    "amenity/feeding_place",
    "amenity/fixme",
@ -31,6 +40,7 @@ DROP_CATEGORIES = {
    "amenity/lounge",
    "tourism/preserved_railway",
    "amenity/lounger",
+    "leisure/sport",
    "amenity/motorcycle_parking",
    "amenity/mounting_block",
    "amenity/notice_board",
@ -71,8 +81,12 @@ DROP_CATEGORIES = {
    "amenity/boat_storage",
    "amenity/bureau_de_change",
    "amenity/bus_station",
+    "amenity/beachhut",
+    "amenity/canteen",
    "amenity/conference_centre",
    "amenity/crematorium",
+    "amenity/disused",
+    "amenity/driver_training",
    "amenity/driving_school",
    "amenity/escooter_rental",
    "amenity/ferry_terminal",
@ -82,14 +96,21 @@ DROP_CATEGORIES = {
    "amenity/kick-scooter_rental",
    "amenity/money_transfer",
    "amenity/post_depot",
+    "amenity/prison",
    "amenity/public_building",
    "amenity/recycling",
    "amenity/scout_hut",
    "amenity/social_facility",
    "amenity/studio",
+    "amenity/student_accommodation",
    "amenity/taxi",
+    "amenity/telephone_exchange",
    "amenity/training",
    "amenity/vehicle_inspection",
+    "amenity/waiting_room",
+    "amenity/yes",
+    "shop/disused",
+    "shop/no",
    # Buildings (except church & university which are mapped)
    "building/air_shaft",
    "building/apartments",
@ -148,12 +169,14 @@ DROP_CATEGORIES = {
    "emergency/yes",
    "tourism/apartment",
    "tourism/apartments",
+    "tourism/alpine_hut",
    "tourism/camp_pitch",
    "tourism/caravan_site",
    "tourism/information",
    "tourism/picnic_site",
    "tourism/viewpoint",
    "tourism/village_sign",
+    "tourism/wilderness_hut",
    "tourism/yes",
    # Public transport (from NaPTAN instead)
    "public_transport/entrance",
@ -191,6 +214,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🍺",
        [
            "amenity/pub",
+            "amenity/beer_garden",
+            "amenity/biergarten",
            "amenity/social_club",
            "amenity/club",
            "leisure/social_club",
@ -293,7 +318,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "leisure/miniature_golf",
            "leisure/horse_riding",
            "leisure/fishing",
+            "leisure/ice_rink",
+            "leisure/paddling_pool",
+            "leisure/practice_pitch",
+            "leisure/shooting_ground",
+            "leisure/stadium",
            "leisure/swimming_pool",
+            "leisure/swimming_area",
            "leisure/water_park",
            "leisure/bathing_place",
        ],
@ -307,9 +338,11 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "leisure/amusement_arcade",
            "leisure/adult_gaming_centre",
            "leisure/escape_game",
+            "leisure/maze",
            "leisure/trampoline_park",
            "leisure/sauna",
            "leisure/tanning_salon",
+            "shop/amusements",
            "tourism/theme_park",
            "amenity/bicycle_rental",
            "amenity/boat_rental",
@ -345,6 +378,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "shop/bakery",
            "shop/pastry",
+            "craft/bakery",
            "craft/confectionery",
        ],
    ),
@ -364,6 +398,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "shop/greengrocer",
            "shop/farm",
+            "shop/market",
            "amenity/marketplace",
        ],
    ),
@ -424,6 +459,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/appliance",
            "shop/electrical",
            "shop/hifi",
+            "shop/vacuum_cleaner",
            "shop/video_games",
            "shop/games",
        ],
@ -444,7 +480,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "shop/doityourself",
            "shop/hardware",
+            "shop/builders_merchant",
            "shop/paint",
+            "shop/plumbing",
        ],
    ),
    (
@ -462,11 +500,15 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/curtain",
            "shop/flooring",
            "shop/fireplace",
+            "shop/garden_furniture",
+            "shop/groundskeeping",
            "shop/household",
            "shop/household_linen",
            "shop/houseware",
+            "shop/homeware",
            "shop/interior_decoration",
            "shop/lighting",
+            "shop/kitchenware",
            "shop/window_blind",
        ],
    ),
@ -493,8 +535,11 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🏕️",
        [
            "shop/sports",
+            "shop/angling",
            "shop/outdoor",
            "shop/bicycle",
+            "shop/equestrian",
+            "shop/surf",
        ],
    ),
    (
@ -532,9 +577,11 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/music",
            "shop/musical_instrument",
            "shop/antiques",
+            "shop/anime",
            "shop/baby_goods",
            "shop/fabric",
            "shop/haberdashery",
+            "shop/hobby",
            "shop/wool",
            "shop/pottery",
        ],
@ -549,9 +596,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/bookmaker",
            "shop/building_materials",
            "shop/camera",
+            "shop/cannabis",
            "shop/car",
            "shop/caravan",
            "shop/catalogue",
+            "shop/auction",
+            "shop/auction_house",
+            "shop/chandler",
            "shop/collector",
            "shop/copyshop",
            "shop/country_store",
@ -560,6 +611,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/erotic",
            "shop/esoteric",
            "shop/fan",
+            "shop/fireworks",
            "shop/fishing",
            "shop/frame",
            "shop/fuel",
@ -582,6 +634,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/scuba_diving",
            "shop/security",
            "shop/sewing",
+            "shop/ship_chandler",
+            "shop/signs",
            "shop/storage_rental",
            "shop/swimming_pool",
            "shop/telecommunication",
@ -590,7 +644,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/tool_hire",
            "shop/trade",
            "shop/trophy",
+            "shop/truck",
            "shop/vacant",
+            "shop/van",
            "shop/video",
            "shop/water_sports",
            "shop/weapons",
@ -611,6 +667,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "shop/cosmetics",
            "shop/massage",
            "shop/perfumery",
+            "leisure/spa",
        ],
    ),
    (
@ -757,6 +814,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "amenity/hospital",
            "amenity/clinic",
+            "amenity/health_centre",
+            "healthcare/blood_donation",
            "healthcare/hospital",
            "healthcare/centre",
            "healthcare/clinic",
@ -804,6 +863,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "amenity/care_home",
            "amenity/nursing_home",
+            "amenity/retirement_home",
+            "healthcare/hospice",
+            "healthcare/nursing_home",
            "office/home_care",
        ],
    ),
@ -848,6 +910,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "⛪",
        [
            "amenity/place_of_worship",
+            "amenity/monastery",
            "building/church",
        ],
    ),
@ -873,6 +936,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "📸",
        [
            "tourism/attraction",
+            "tourism/aquarium",
            "amenity/fountain",
            "amenity/courthouse",
            "tourism/chalet",
@ -892,6 +956,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "building/university",
            "amenity/kindergarten",
            "amenity/childcare",
+            "office/tutoring",
        ],
    ),
    (
@ -904,6 +969,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "tourism/guest_house",
            "tourism/motel",
            "tourism/camp_site",
+            "leisure/resort",
+            "tourism/holiday_park",
+            "tourism/self_catering",
        ],
    ),
    (
@ -928,14 +996,19 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "craft/window_construction",
            "craft/agricultural_engines",
            "craft/atelier",
+            "craft/beekeeper",
            "craft/blacksmith",
            "craft/bookbinder",
+            "craft/boatbuilder",
            "craft/caterer",
+            "craft/carpet_layer",
+            "craft/clockmaker",
            "craft/handicraft",
            "craft/jeweller",
            "craft/metal_construction",
            "craft/photographer",
            "craft/photographic_laboratory",
+            "craft/plasterer",
            "craft/pottery",
            "craft/printer",
            "craft/sawmill",
@ -946,22 +1019,28 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "craft/upholsterer",
            "craft/watchmaker",
            "craft/yes",
+            "amenity/workshop",
            "shop/glaziery",
            "shop/windows",
            # Professional offices & estate agents
            "shop/estate_agent",
            "office/accountant",
            "office/architect",
+            "office/auctioneer",
+            "office/builder",
+            "office/construction",
            "office/construction_company",
            "office/engineer",
            "office/estate_agent",
            "office/financial",
            "office/financial_advisor",
+            "office/financial_services",
            "office/insurance",
            "office/lawyer",
            "office/mortgage",
            "office/property_management",
            "office/solicitor",
+            "office/solicitors",
            "office/surveyor",
            "office/tax_advisor",
        ],
@ -972,6 +1051,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🏢",
        [
            "amenity/coworking_space",
+            "amenity/research_institute",
+            "office/administrative",
            "office/advertising_agency",
            "office/association",
            "office/charity",
@ -997,12 +1078,15 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "office/notary",
            "office/political_party",
            "office/politician",
+            "office/publisher",
+            "office/quango",
            "office/recruitment",
            "office/religion",
            "office/research",
            "office/security",
            "office/taxi",
            "office/telecommunication",
+            "office/transport",
            "office/union",
            "office/university",
            "office/vacant",
@ -1032,7 +1116,11 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "Community Centre",
        "🤝",
        [
+            "amenity/church_hall",
+            "amenity/clubhouse",
            "amenity/community_centre",
+            "amenity/community_hall",
+            "amenity/scout_hall",
            "amenity/social_centre",
            "amenity/townhall",
        ],