Update data
This commit is contained in:
parent
a4103b0896
commit
273d7a83ee
15 changed files with 716 additions and 316 deletions
|
|
@ -13,6 +13,7 @@ for lat/lon needed by kNN, then drops those columns before writing.
|
|||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
|
|
@ -28,6 +29,45 @@ from pipeline.transform.price_estimation.utils import (
|
|||
type_group_expr,
|
||||
)
|
||||
|
||||
# Upper bound for blending: a kNN estimate above index_est * this ratio is ignored.
MAX_KNN_TO_INDEX_RATIO = 2.0
|
||||
# Lower bound for blending: a kNN estimate below index_est * this ratio is ignored.
MIN_KNN_TO_INDEX_RATIO = 0.5
|
||||
# The final estimate is capped at last known price * this ratio.
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0
|
||||
|
||||
|
||||
def guarded_blend_estimates(
    index_est: np.ndarray,
    knn_est: np.ndarray,
    last_prices: np.ndarray,
    weight: float = KNN_BLEND_WEIGHT,
) -> np.ndarray:
    """Combine index and kNN estimates element-wise, with sanity guards.

    Args:
        index_est: Index-based price estimates.
        knn_est: kNN-based price estimates.
        last_prices: Last known sale prices, used only to cap the result.
        weight: Share given to the kNN estimate when both are blended; the
            index estimate receives ``1 - weight``.

    Returns:
        A float64 array where each element is:
          * the weighted blend, when the index estimate is finite and
            positive and the kNN estimate is finite, positive and within
            ``[MIN_KNN_TO_INDEX_RATIO, MAX_KNN_TO_INDEX_RATIO]`` times the
            index estimate;
          * the index estimate alone, when it is usable but the kNN
            estimate is not;
          * NaN, when the index estimate is not finite and positive.
        Wherever the last price is finite and positive, a finite result is
        additionally limited to
        ``last_prices * MAX_ESTIMATE_TO_LAST_PRICE_RATIO``.
    """
    idx = index_est.astype(np.float64, copy=False)
    knn = knn_est.astype(np.float64, copy=False)
    last = last_prices.astype(np.float64, copy=False)

    index_ok = np.isfinite(idx) & (idx > 0)
    knn_ok = np.isfinite(knn) & (knn > 0)

    # kNN is trusted only when it lands inside the allowed band around index.
    band_low = idx * MIN_KNN_TO_INDEX_RATIO
    band_high = idx * MAX_KNN_TO_INDEX_RATIO
    use_blend = index_ok & knn_ok & (knn >= band_low) & (knn <= band_high)

    # Fall back to the index estimate (or NaN), then overlay blended rows.
    fallback = np.where(index_ok, idx, np.nan)
    mixed = (1 - weight) * idx + weight * knn
    estimate = np.where(use_blend, mixed, fallback)

    # The ceiling is defined only for rows with a usable last sale price.
    last_ok = np.isfinite(last) & (last > 0)
    ceiling = np.where(last_ok, last * MAX_ESTIMATE_TO_LAST_PRICE_RATIO, np.nan)

    cappable = np.isfinite(ceiling) & np.isfinite(estimate)
    return np.where(cappable, np.minimum(estimate, ceiling), estimate)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
|
|
@ -130,36 +170,54 @@ def main():
|
|||
lon = df["lon"].cast(pl.Float64).to_numpy()
|
||||
tg = df["_type_group"].fill_null("").to_numpy()
|
||||
fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
|
||||
last_prices = (
|
||||
df["Last known price"].cast(pl.Float64).fill_null(float("nan")).to_numpy()
|
||||
)
|
||||
last_sale_dates = (
|
||||
df["Date of last transaction"]
|
||||
.dt.epoch("d")
|
||||
.fill_null(-1)
|
||||
.to_numpy()
|
||||
.astype(np.int64)
|
||||
)
|
||||
|
||||
knn_psm = knn_median_psm(trees, lat, lon, tg)
|
||||
knn_psm = knn_median_psm(
|
||||
trees,
|
||||
lat,
|
||||
lon,
|
||||
tg,
|
||||
postcodes=df["Postcode"].fill_null("").to_numpy(),
|
||||
last_prices=last_prices,
|
||||
last_sale_dates=last_sale_dates,
|
||||
)
|
||||
knn_est = knn_psm * fa # No temporal adj: ref == current
|
||||
|
||||
df = df.with_columns(
|
||||
pl.Series("_knn_est", knn_est, dtype=pl.Float64),
|
||||
)
|
||||
|
||||
# Blend: where kNN available, use weighted average; else keep index
|
||||
# Blend only when kNN is close to the index estimate; otherwise keep index.
|
||||
index_est = (
|
||||
df["Estimated current price"]
|
||||
.cast(pl.Float64)
|
||||
.fill_null(float("nan"))
|
||||
.to_numpy()
|
||||
)
|
||||
blended = guarded_blend_estimates(index_est, knn_est, last_prices)
|
||||
df = df.with_columns(
|
||||
pl.when(
|
||||
pl.col("Estimated current price").is_not_null()
|
||||
& pl.col("_knn_est").is_not_null()
|
||||
& pl.col("_knn_est").is_finite()
|
||||
& (pl.col("_knn_est") > 0)
|
||||
)
|
||||
.then(
|
||||
(1 - KNN_BLEND_WEIGHT) * pl.col("Estimated current price")
|
||||
+ KNN_BLEND_WEIGHT * pl.col("_knn_est")
|
||||
)
|
||||
.when(pl.col("Estimated current price").is_not_null())
|
||||
.then(pl.col("Estimated current price"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("Estimated current price"),
|
||||
pl.Series("_index_est", index_est, dtype=pl.Float64),
|
||||
pl.Series("Estimated current price", blended, dtype=pl.Float64),
|
||||
).with_columns(
|
||||
pl.col("Estimated current price").fill_nan(None),
|
||||
)
|
||||
|
||||
n_blended = df.filter(
|
||||
pl.col("_knn_est").is_not_null()
|
||||
& pl.col("_knn_est").is_finite()
|
||||
& (pl.col("_knn_est") > 0)
|
||||
& (pl.col("_index_est").is_not_null())
|
||||
& (pl.col("_knn_est") >= pl.col("_index_est") * MIN_KNN_TO_INDEX_RATIO)
|
||||
& (pl.col("_knn_est") <= pl.col("_index_est") * MAX_KNN_TO_INDEX_RATIO)
|
||||
& pl.col("Estimated current price").is_not_null()
|
||||
).height
|
||||
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue