perfect-postcode/pipeline/transform/price_estimation/knn.py
2026-06-02 20:14:32 +01:00

234 lines
8.7 KiB
Python

"""kNN price estimation using nearby recently-sold properties.
For each target property, finds k nearest sold properties of the same type,
computes the median index-adjusted price-per-sqm, and multiplies by the
target's floor area to produce an estimate.
"""
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import KDTree
from pipeline.transform.price_estimation.utils import (
TYPE_GROUPS,
interpolate_log_index,
sector_expr,
type_group_expr,
)
KNN_K = 20
KNN_MIN_NEIGHBORS = 5
KNN_BLEND_WEIGHT = 0.35
MIN_COMPARABLE_FLOOR_AREA_SQM = 15.0
MAX_COMPARABLE_FLOOR_AREA_SQM = 1_000.0
MIN_COMPARABLE_PSM = 500.0
MAX_COMPARABLE_PSM = 50_000.0
def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
"""Equirectangular projection: scale lon by cos(lat) for approximate distances."""
return np.column_stack([lat, lon * np.cos(np.radians(lat))])
def build_knn_pool(
source: Path | pl.LazyFrame,
index: pl.DataFrame,
ref_frac_year: float,
max_sale_year: int | None = None,
) -> dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
"""Build per-type_group KD-trees of index-adjusted price-per-sqm.
Adjusts all pool properties' sale prices to ref_frac_year using the index,
then builds a KD-tree per type_group for nearest-neighbor queries.
Returns dict mapping type_group to KDTree, adjusted PSM, and sale identity
arrays used to keep the target sale out of its own comparable set.
"""
print("Building kNN pool...")
lf = pl.scan_parquet(source) if isinstance(source, Path) else source
query = lf.select(
"Postcode",
"Property type",
"lat",
"lon",
"Total floor area (sqm)",
"Last known price",
"Date of last transaction",
).filter(
pl.col("lat").is_not_null(),
pl.col("lon").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") >= MIN_COMPARABLE_FLOOR_AREA_SQM,
pl.col("Total floor area (sqm)") <= MAX_COMPARABLE_FLOOR_AREA_SQM,
pl.col("Last known price").is_not_null(),
pl.col("Last known price") > 0,
pl.col("Postcode").is_not_null(),
pl.col("Date of last transaction").is_not_null(),
)
if max_sale_year is not None:
query = query.filter(
pl.col("Date of last transaction").dt.year() < max_sale_year
)
pool = query.with_columns(
sector_expr(),
type_group_expr(),
(
pl.col("Date of last transaction").dt.year().cast(pl.Float64)
+ (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0)
/ 12.0
).alias("_sale_fy"),
pl.lit(ref_frac_year).alias("_ref_fy"),
).collect()
pool = pool.filter(pl.col("type_group").is_not_null())
print(f" {len(pool):,} pool properties with lat/lon, floor area, price")
# Interpolate log_index at sale date and reference date
pool = interpolate_log_index(
index, pool, "sector", "type_group", "_sale_fy", "_li_sale"
)
pool = interpolate_log_index(
index, pool, "sector", "type_group", "_ref_fy", "_li_ref"
)
# adjusted_psm = price / floor_area * exp(log_index_ref - log_index_sale)
pool = pool.with_columns(
(
pl.col("Last known price").cast(pl.Float64)
/ pl.col("Total floor area (sqm)").cast(pl.Float64)
* (pl.col("_li_ref") - pl.col("_li_sale")).exp()
).alias("_adj_psm")
).filter(
pl.col("_adj_psm").is_not_null(),
pl.col("_adj_psm").is_finite(),
pl.col("_adj_psm") >= MIN_COMPARABLE_PSM,
pl.col("_adj_psm") <= MAX_COMPARABLE_PSM,
)
print(f" {len(pool):,} after index adjustment")
# Build per-type KD-trees
trees: dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]] = {}
for tg in TYPE_GROUPS:
sub = pool.filter(pl.col("type_group") == tg)
n = len(sub)
if n < KNN_MIN_NEIGHBORS:
continue
lat = sub["lat"].to_numpy().astype(np.float64)
lon = sub["lon"].to_numpy().astype(np.float64)
psm = sub["_adj_psm"].to_numpy().astype(np.float64)
postcodes = sub["Postcode"].fill_null("").to_numpy()
prices = sub["Last known price"].to_numpy().astype(np.float64)
sale_dates = (
sub["Date of last transaction"]
.dt.epoch("d")
.fill_null(-1)
.to_numpy()
.astype(np.int64)
)
tree = KDTree(_scale_coords(lat, lon))
trees[tg] = (tree, psm, postcodes, prices, sale_dates)
print(f" {tg}: {n:,}")
return trees
def _sale_identity_matches(
pool_postcodes: np.ndarray,
pool_prices: np.ndarray,
pool_sale_dates: np.ndarray,
target_postcode: str,
target_price: float,
target_sale_date: int,
) -> np.ndarray:
"""Mark pool comparables that are (almost certainly) the target's own sale.
properties.parquet has no per-property id, so a sale is identified by the
proxy tuple (postcode, price within 0.5, sale_date) to keep a target's own
prior sale out of its comparable set (leakage prevention).
Limitation: new-build / bulk blocks sell many DISTINCT properties in one
postcode on the same day at the same price, so all such siblings collide on
this proxy and are excluded together. This is intentional conservative
over-exclusion: it guarantees no leakage at the cost of occasionally
dropping legitimate same-(postcode, price, date) siblings. The effect is
bounded (~1.8% of the pool) and a precise fix would require a per-property
id that the data does not carry.
"""
if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
return np.zeros(len(pool_postcodes), dtype=bool)
return (
(pool_postcodes == target_postcode)
& np.isfinite(pool_prices)
& np.isclose(pool_prices, target_price, rtol=0.0, atol=0.5)
& (pool_sale_dates == target_sale_date)
)
def knn_median_psm(
trees: dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]],
lat: np.ndarray,
lon: np.ndarray,
type_groups: np.ndarray,
k: int = KNN_K,
postcodes: np.ndarray | None = None,
last_prices: np.ndarray | None = None,
last_sale_dates: np.ndarray | None = None,
) -> np.ndarray:
"""Return median adjusted-PSM of k nearest neighbours for each target.
PSM is at the reference date used when building the pool.
NaN where not computable (missing coords, unknown type, too few neighbors).
Coordinate limitation: lat/lon come from postcode.parquet (one centroid per
postcode), so every property within a postcode is co-located. For a dense
postcode the "k nearest" therefore degenerates into an arbitrary
same-postcode subset whose membership is decided by KDTree index order
rather than true proximity. No property-level coordinates exist to fix this,
so the kNN signal is treated as a weak, noisy prior: the downstream guarded
blend (guarded_blend_estimates) only blends kNN when it is close to the
index estimate and otherwise discards it, bounding the impact of this
degeneracy. The result is deterministic for a fixed pool order.
"""
n = len(lat)
result = np.full(n, np.nan)
for tg, (tree, psm, pool_postcodes, pool_prices, pool_sale_dates) in trees.items():
mask = (type_groups == tg) & np.isfinite(lat) & np.isfinite(lon)
idx = np.where(mask)[0]
if len(idx) == 0:
continue
query_k = min(max(k * 2, k + KNN_MIN_NEIGHBORS), len(psm))
if query_k < KNN_MIN_NEIGHBORS:
continue
coords = _scale_coords(lat[idx], lon[idx])
_, nn_idx = tree.query(coords, k=query_k)
if nn_idx.ndim == 1:
nn_idx = nn_idx.reshape(-1, 1)
medians = np.full(len(idx), np.nan)
for row_num, target_idx in enumerate(idx):
candidates = nn_idx[row_num]
if (
postcodes is not None
and last_prices is not None
and last_sale_dates is not None
):
same_sale = _sale_identity_matches(
pool_postcodes[candidates],
pool_prices[candidates],
pool_sale_dates[candidates],
str(postcodes[target_idx] or ""),
float(last_prices[target_idx]),
int(last_sale_dates[target_idx]),
)
candidates = candidates[~same_sale]
if len(candidates) >= KNN_MIN_NEIGHBORS:
medians[row_num] = np.nanmedian(psm[candidates[:k]])
result[idx] = medians
return result