Update data

This commit is contained in:
Andras Schmelczer 2026-05-14 08:17:10 +01:00
parent a4103b0896
commit 273d7a83ee
15 changed files with 716 additions and 316 deletions

View file

@ -10,7 +10,11 @@ import pyarrow as pa
import pyarrow.csv as pa_csv
import pyarrow.parquet as pq
from ..utils import fuzzy_join_on_postcode
from ..utils import (
fuzzy_join_on_postcode,
normalize_address_key,
normalize_postcode_key,
)
pl.Config.set_tbl_cols(-1)
@ -193,12 +197,15 @@ def main():
def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Path):
epc_base = _scan_epc_certificates(epc_path, temp_dir)
epc_base = _scan_epc_certificates(epc_path, temp_dir).with_columns(
normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
)
# Dedup fork: keep latest certificate per property (existing logic)
epc = (
epc_base.sort("inspection_date", descending=True)
.group_by("epc_address", "epc_postcode")
.group_by("_epc_match_address", "_epc_match_postcode")
.first()
.drop("tenure")
)
@ -216,15 +223,15 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
.with_columns(
pl.col("number_habitable_rooms")
.shift(1)
.over("epc_address", "epc_postcode")
.over("_epc_match_address", "_epc_match_postcode")
.alias("_prev_rooms"),
pl.col("total_floor_area")
.shift(1)
.over("epc_address", "epc_postcode")
.over("_epc_match_address", "_epc_match_postcode")
.alias("_prev_area"),
pl.col("_rating_rank")
.shift(1)
.over("epc_address", "epc_postcode")
.over("_epc_match_address", "_epc_match_postcode")
.alias("_prev_rating_rank"),
)
.with_columns(
@ -257,7 +264,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
.cast(pl.Int32)
.alias("_event_year"),
)
.group_by("epc_address", "epc_postcode")
.group_by("_epc_match_address", "_epc_match_postcode")
.agg(
pl.struct(
pl.col("_event_year").alias("year"),
@ -276,7 +283,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
# Social tenure fork: flag properties that were ever social housing
social_tenure = (
epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
.select("epc_address", "epc_postcode")
.select("_epc_match_address", "_epc_match_postcode")
.unique()
.with_columns(pl.lit("Yes").alias("was_council_house"))
.collect()
@ -287,12 +294,12 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
epc = (
epc.join(
events.lazy(),
on=["epc_address", "epc_postcode"],
on=["_epc_match_address", "_epc_match_postcode"],
how="left",
)
.join(
social_tenure.lazy(),
on=["epc_address", "epc_postcode"],
on=["_epc_match_address", "_epc_match_postcode"],
how="left",
)
.with_columns(
@ -339,9 +346,23 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
ignore_nulls=True,
).alias("pp_address"),
)
.with_columns(
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
normalize_postcode_key(pl.col("postcode")).alias("_pp_match_postcode"),
)
.filter(pl.col("_pp_match_postcode").is_not_null())
.with_columns(
pl.coalesce("_pp_match_address", "pp_address").alias("_pp_group_address"),
pl.col("_pp_match_postcode").alias("_pp_group_postcode"),
)
.filter(pl.col("pp_address").is_not_null())
.sort("date_of_transfer")
.group_by("pp_address", "postcode", maintain_order=True)
.group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
.agg(
pl.col("pp_address").last(),
pl.col("postcode").last(),
pl.col("_pp_match_address").last(),
pl.col("_pp_match_postcode").last(),
pl.struct(
pl.col("date_of_transfer").dt.year().alias("year"),
pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
@ -354,7 +375,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
pl.col("date_of_transfer").first().alias("first_transfer_date"),
pl.col("old_new").first(),
)
).filter(pl.col("pp_address").is_not_null())
)
print("Price paid dataset")
print(price_paid.head().collect())
@ -405,7 +426,19 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
.then(pl.lit(1, dtype=pl.UInt8))
.otherwise(pl.lit(None, dtype=pl.UInt8))
.alias("is_construction_date_approximate"),
).drop("old_new", "first_transfer_date")
).drop(
[
"old_new",
"first_transfer_date",
"_pp_match_address",
"_pp_match_postcode",
"_pp_group_address",
"_pp_group_postcode",
"_epc_match_address",
"_epc_match_postcode",
],
strict=False,
)
joined = joined.rename({col: col.lower() for col in joined.columns})

View file

@ -22,6 +22,8 @@ _AREA_COLUMNS = [
"Postcode",
"lat",
"lon",
# Runtime provenance for deciding whether missing coordinates are skippable.
"ctry25cd",
# Deprivation
"Income Score",
"Employment Score",
@ -86,6 +88,15 @@ _AREA_COLUMNS = [
# Column-name patterns for per-amenity POI metrics: the distance (km) to the
# nearest amenity with a parenthesised label, and the number of such amenities
# within 2 km or 5 km.
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
# Canonical name of the tree density column.
TREE_DENSITY_FEATURE = "Street tree density percentile"
# Radius-specific tree canopy percentile column name ("... within <digits>m").
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
    r"^Tree canopy density percentile within \d+m$"
)
# LAD code -> LAD name for authorities that are allowed to be absent from the
# rental price source.
_RENT_SOURCE_UNAVAILABLE_LADS = {
    # ONS PIPR does not publish LAD-level private-rent estimates for these
    # small authorities. Keep rent null there, but fail on any other LAD miss.
    "E06000053": "Isles of Scilly",
    "E09000001": "City of London",
}
def _is_dynamic_poi_metric_column(column: str) -> bool:
@ -112,6 +123,107 @@ def _less_deprived_percentile_expr(column: str) -> pl.Expr:
)
def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame:
    """Lazily load one tree density value per postcode.

    The parquet file must have a ``postcode`` column. The density is taken
    from the ``TREE_DENSITY_FEATURE`` column when it exists; otherwise the file
    must contain exactly one column matching
    ``_POSTCODE_TREE_DENSITY_PERCENTILE_RE``, which is used instead.

    Returns:
        A LazyFrame with ``postcode`` and ``TREE_DENSITY_FEATURE`` (cast to
        Float32), without null postcodes and with one row kept per postcode.

    Raises:
        ValueError: If ``postcode`` is missing, or if no unambiguous density
            column can be identified.
    """
    source = pl.scan_parquet(tree_density_postcodes_path)
    available = set(source.collect_schema().names())

    if "postcode" not in available:
        raise ValueError(
            f"{tree_density_postcodes_path} is missing required column: postcode"
        )

    # Prefer the canonical column; fall back to a single radius-specific one.
    density_column = TREE_DENSITY_FEATURE
    if density_column not in available:
        matches = sorted(
            name
            for name in available
            if _POSTCODE_TREE_DENSITY_PERCENTILE_RE.match(name)
        )
        if len(matches) != 1:
            raise ValueError(
                f'{tree_density_postcodes_path} must contain column "{TREE_DENSITY_FEATURE}" '
                'or exactly one "Tree canopy density percentile within {radius}m" column; '
                f"found {len(matches)} postcode percentile columns"
            )
        density_column = matches[0]

    density = pl.col(density_column).cast(pl.Float32).alias(TREE_DENSITY_FEATURE)
    per_postcode = source.select(pl.col("postcode"), density)
    return per_postcode.drop_nulls(["postcode"]).unique(["postcode"])
def _validate_lad_source_coverage(
    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
) -> None:
    """Check that the ethnicity and rental sources cover every 2024 LAD.

    The set of expected LADs is the distinct
    "Local Authority District code (2024)" values in the IoD parquet. Each
    other source is anti-joined against it to find uncovered LADs.

    Args:
        iod_path: Parquet providing the 2024 LAD codes and names.
        ethnicity_path: Parquet whose ``Geography_code`` column holds LAD codes.
        rental_prices_path: Parquet whose ``area_code`` column holds LAD codes.

    Raises:
        ValueError: If any LAD is missing from the ethnicity data, or if a LAD
            not listed in ``_RENT_SOURCE_UNAVAILABLE_LADS`` is missing from the
            rental data.
    """
    code_column = "Local Authority District code (2024)"
    name_column = "Local Authority District name (2024)"
    expected_lads = (
        pl.read_parquet(iod_path, columns=[code_column, name_column])
        .rename({code_column: "lad", name_column: "lad_name"})
        .unique(["lad"])
    )

    def _uncovered(source_path: Path, source_code_column: str) -> pl.DataFrame:
        # Expected LADs with no matching code in the given source, sorted by code.
        covered = pl.read_parquet(source_path, columns=[source_code_column]).rename(
            {source_code_column: "lad"}
        )
        return expected_lads.join(covered, on="lad", how="anti").sort("lad")

    ethnicity_gaps = _uncovered(ethnicity_path, "Geography_code")
    if ethnicity_gaps.height > 0:
        raise ValueError(
            "Ethnicity data is missing 2024 LAD coverage: "
            f"{ethnicity_gaps.to_dicts()}"
        )

    rent_gaps = _uncovered(rental_prices_path, "area_code")
    tolerated_codes = list(_RENT_SOURCE_UNAVAILABLE_LADS)
    unexpected_rent_gaps = rent_gaps.filter(~pl.col("lad").is_in(tolerated_codes))
    if unexpected_rent_gaps.height > 0:
        raise ValueError(
            "Rental data is missing 2024 LAD coverage: "
            f"{unexpected_rent_gaps.to_dicts()}"
        )

    # Only tolerated gaps can remain at this point; report them for visibility.
    if rent_gaps.height > 0:
        print(
            "PIPR has no LAD-level rent estimates for source-unavailable LADs; "
            f"rent will remain null there: {rent_gaps.to_dicts()}"
        )
def _validate_property_postcodes(df: pl.DataFrame) -> None:
    """Fail if any row lacks a usable ``Postcode``.

    A row is invalid when ``Postcode`` is null or is an empty string after
    being cast to text and stripped.

    Raises:
        ValueError: With the number of invalid rows and up to ten sample rows,
            limited to whichever of the identifying columns exist in ``df``.
    """
    postcode = pl.col("Postcode")
    is_blank = postcode.cast(pl.Utf8).str.strip_chars() == ""
    offenders = df.filter(postcode.is_null() | is_blank)

    if offenders.height > 0:
        wanted = ("Postcode", "Address per Property Register", "Last known price")
        present = [name for name in wanted if name in offenders.columns]
        sample = offenders.select(present).head(10).to_dicts()
        raise ValueError(
            "Property rows missing a postcode after merge: "
            f"{offenders.height} rows. Sample: {sample}"
        )
def _build(
epc_pp_path: Path,
arcgis_path: Path,
@ -126,12 +238,14 @@ def _build(
lsoa_population_path: Path,
median_age_path: Path,
election_results_path: Path,
tree_density_addresses_path: Path | None = None,
tree_density_postcodes_path: Path | None = None,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
Returns (postcode_df, properties_df).
"""
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
@ -152,9 +266,15 @@ def _build(
.drop("new_postcode")
)
arcgis_raw = pl.scan_parquet(arcgis_path)
postcode_country = arcgis_raw.select(
pl.col("pcds").alias("postcode"),
pl.col("ctry25cd"),
).unique(["postcode"])
wide = wide.join(postcode_country, on="postcode", how="left")
arcgis = (
pl.scan_parquet(arcgis_path)
.filter(pl.col("ctry25cd") == "E92000001") # England only
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
.filter(pl.col("doterm").is_null()) # Active postcodes only
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
# Alias them back to the short canonical names used across the
@ -191,7 +311,9 @@ def _build(
.cast(pl.UInt8)
.alias("_bedrooms"),
)
rental = pl.scan_parquet(rental_prices_path)
rental = pl.scan_parquet(rental_prices_path).select(
"area_code", "bedrooms", "mean_monthly_rent"
)
wide = wide.join(
rental,
left_on=["Local Authority District code (2024)", "_bedrooms"],
@ -260,17 +382,9 @@ def _build(
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
if tree_density_addresses_path is not None:
tree_density = (
pl.scan_parquet(tree_density_addresses_path)
.select(
pl.col("postcode"),
pl.col("pp_address"),
pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
)
.unique(["postcode", "pp_address"])
)
wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
if tree_density_postcodes_path is not None:
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
wide = wide.join(tree_density, on="postcode", how="left")
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
@ -415,6 +529,7 @@ def _build(
print("Collecting with streaming engine...")
df = wide.collect(engine="streaming")
_validate_property_postcodes(df)
# Split into postcode-level and property-level dataframes
area_cols = [
@ -508,10 +623,10 @@ def main():
help="2024 General Election results by constituency parquet file",
)
parser.add_argument(
"--tree-density-addresses",
"--tree-density-postcodes",
type=Path,
required=False,
help="Address-level tree density parquet from pipeline.transform.tree_density",
help="Postcode-level tree density parquet from pipeline.transform.tree_density",
)
parser.add_argument(
"--output-postcodes",
@ -541,7 +656,7 @@ def main():
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age,
election_results_path=args.election_results,
tree_density_addresses_path=args.tree_density_addresses,
tree_density_postcodes_path=args.tree_density_postcodes,
)
print(f"\nPostcode columns: {postcode_df.columns}")

View file

@ -227,7 +227,18 @@ def main():
fa = test["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
print("\nComputing kNN estimates...")
knn_psm = knn_median_psm(trees, lat, lon, tg)
last_sale_dates = (
test["input_date"].dt.epoch("d").fill_null(-1).to_numpy().astype(np.int64)
)
knn_psm = knn_median_psm(
trees,
lat,
lon,
tg,
postcodes=test["Postcode"].fill_null("").to_numpy(),
last_prices=test["input_price"].cast(pl.Float64).to_numpy(),
last_sale_dates=last_sale_dates,
)
# Temporal adjustment: pool PSM is at ref, adjust to actual
log_idx_actual = test["log_index_actual"].to_numpy().astype(np.float64)

View file

@ -13,6 +13,7 @@ for lat/lon needed by kNN, then drops those columns before writing.
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.knn import (
@ -28,6 +29,45 @@ from pipeline.transform.price_estimation.utils import (
type_group_expr,
)
# A kNN estimate is blended with the index estimate only when it lies between
# MIN_KNN_TO_INDEX_RATIO and MAX_KNN_TO_INDEX_RATIO times the index estimate
# (bounds inclusive).
MAX_KNN_TO_INDEX_RATIO = 2.0
MIN_KNN_TO_INDEX_RATIO = 0.5
# The final estimate is capped at this multiple of the last sale price.
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0
def guarded_blend_estimates(
    index_est: np.ndarray,
    knn_est: np.ndarray,
    last_prices: np.ndarray,
    weight: float = KNN_BLEND_WEIGHT,
) -> np.ndarray:
    """Blend index and kNN estimates element-wise, guarding against outliers.

    For each element:

    * If the index estimate is not finite and positive, the result is NaN.
    * If the kNN estimate is finite, positive and within
      ``[MIN_KNN_TO_INDEX_RATIO, MAX_KNN_TO_INDEX_RATIO]`` times the index
      estimate, the result is ``(1 - weight) * index + weight * knn``.
    * Otherwise the index estimate is kept unchanged.

    Where the last price is finite and positive, the result is additionally
    capped at ``last_price * MAX_ESTIMATE_TO_LAST_PRICE_RATIO``.

    Args:
        index_est: Index-based estimates.
        knn_est: kNN-based estimates.
        last_prices: Last sale prices used for the cap.
        weight: Share of the blend given to the kNN estimate.

    Returns:
        Float64 array of guarded estimates, NaN where no index estimate exists.
    """
    index_est = index_est.astype(np.float64, copy=False)
    knn_est = knn_est.astype(np.float64, copy=False)
    last_prices = last_prices.astype(np.float64, copy=False)

    index_ok = np.isfinite(index_est) & (index_est > 0)
    knn_ok = np.isfinite(knn_est) & (knn_est > 0)

    # The kNN estimate must sit inside a band around the index estimate.
    lower_bound = index_est * MIN_KNN_TO_INDEX_RATIO
    upper_bound = index_est * MAX_KNN_TO_INDEX_RATIO
    use_blend = index_ok & knn_ok & (knn_est >= lower_bound) & (knn_est <= upper_bound)

    weighted = (1 - weight) * index_est + weight * knn_est
    index_only = np.where(index_ok, index_est, np.nan)
    estimate = np.where(use_blend, weighted, index_only)

    # Cap only where both the ceiling and the estimate are finite.
    price_ok = np.isfinite(last_prices) & (last_prices > 0)
    ceiling = np.where(
        price_ok, last_prices * MAX_ESTIMATE_TO_LAST_PRICE_RATIO, np.nan
    )
    cappable = np.isfinite(ceiling) & np.isfinite(estimate)
    return np.where(cappable, np.minimum(estimate, ceiling), estimate)
def main():
parser = argparse.ArgumentParser(
@ -130,36 +170,54 @@ def main():
lon = df["lon"].cast(pl.Float64).to_numpy()
tg = df["_type_group"].fill_null("").to_numpy()
fa = df["Total floor area (sqm)"].cast(pl.Float64).fill_null(0.0).to_numpy()
last_prices = (
df["Last known price"].cast(pl.Float64).fill_null(float("nan")).to_numpy()
)
last_sale_dates = (
df["Date of last transaction"]
.dt.epoch("d")
.fill_null(-1)
.to_numpy()
.astype(np.int64)
)
knn_psm = knn_median_psm(trees, lat, lon, tg)
knn_psm = knn_median_psm(
trees,
lat,
lon,
tg,
postcodes=df["Postcode"].fill_null("").to_numpy(),
last_prices=last_prices,
last_sale_dates=last_sale_dates,
)
knn_est = knn_psm * fa # No temporal adj: ref == current
df = df.with_columns(
pl.Series("_knn_est", knn_est, dtype=pl.Float64),
)
# Blend: where kNN available, use weighted average; else keep index
# Blend only when kNN is close to the index estimate; otherwise keep index.
index_est = (
df["Estimated current price"]
.cast(pl.Float64)
.fill_null(float("nan"))
.to_numpy()
)
blended = guarded_blend_estimates(index_est, knn_est, last_prices)
df = df.with_columns(
pl.when(
pl.col("Estimated current price").is_not_null()
& pl.col("_knn_est").is_not_null()
& pl.col("_knn_est").is_finite()
& (pl.col("_knn_est") > 0)
)
.then(
(1 - KNN_BLEND_WEIGHT) * pl.col("Estimated current price")
+ KNN_BLEND_WEIGHT * pl.col("_knn_est")
)
.when(pl.col("Estimated current price").is_not_null())
.then(pl.col("Estimated current price"))
.otherwise(pl.lit(None))
.alias("Estimated current price"),
pl.Series("_index_est", index_est, dtype=pl.Float64),
pl.Series("Estimated current price", blended, dtype=pl.Float64),
).with_columns(
pl.col("Estimated current price").fill_nan(None),
)
n_blended = df.filter(
pl.col("_knn_est").is_not_null()
& pl.col("_knn_est").is_finite()
& (pl.col("_knn_est") > 0)
& (pl.col("_index_est").is_not_null())
& (pl.col("_knn_est") >= pl.col("_index_est") * MIN_KNN_TO_INDEX_RATIO)
& (pl.col("_knn_est") <= pl.col("_index_est") * MAX_KNN_TO_INDEX_RATIO)
& pl.col("Estimated current price").is_not_null()
).height
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")

View file

@ -21,6 +21,10 @@ from pipeline.transform.price_estimation.utils import (
# Default number of nearest neighbours whose adjusted PSM is aggregated per target.
KNN_K = 20
# Minimum number of comparables required before a kNN median is produced.
KNN_MIN_NEIGHBORS = 5
# Weight given to the kNN estimate when it is blended with the index estimate.
KNN_BLEND_WEIGHT = 0.35
# Inclusive bounds on "Total floor area (sqm)" for a sale to enter the kNN pool.
MIN_COMPARABLE_FLOOR_AREA_SQM = 15.0
MAX_COMPARABLE_FLOOR_AREA_SQM = 1_000.0
# Inclusive bounds on the index-adjusted price per sqm for a sale to stay in
# the kNN pool.
MIN_COMPARABLE_PSM = 500.0
MAX_COMPARABLE_PSM = 50_000.0
def _scale_coords(lat: np.ndarray, lon: np.ndarray) -> np.ndarray:
@ -33,13 +37,14 @@ def build_knn_pool(
index: pl.DataFrame,
ref_frac_year: float,
max_sale_year: int | None = None,
) -> dict[str, tuple[KDTree, np.ndarray]]:
) -> dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
"""Build per-type_group KD-trees of index-adjusted price-per-sqm.
Adjusts all pool properties' sale prices to ref_frac_year using the index,
then builds a KD-tree per type_group for nearest-neighbor queries.
Returns dict mapping type_group -> (KDTree over scaled lat/lon, adjusted_psm array).
Returns dict mapping type_group to KDTree, adjusted PSM, and sale identity
arrays used to keep the target sale out of its own comparable set.
"""
print("Building kNN pool...")
lf = pl.scan_parquet(source) if isinstance(source, Path) else source
@ -55,7 +60,8 @@ def build_knn_pool(
pl.col("lat").is_not_null(),
pl.col("lon").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Total floor area (sqm)") >= MIN_COMPARABLE_FLOOR_AREA_SQM,
pl.col("Total floor area (sqm)") <= MAX_COMPARABLE_FLOOR_AREA_SQM,
pl.col("Last known price").is_not_null(),
pl.col("Last known price") > 0,
pl.col("Postcode").is_not_null(),
@ -97,12 +103,13 @@ def build_knn_pool(
).filter(
pl.col("_adj_psm").is_not_null(),
pl.col("_adj_psm").is_finite(),
pl.col("_adj_psm") > 0,
pl.col("_adj_psm") >= MIN_COMPARABLE_PSM,
pl.col("_adj_psm") <= MAX_COMPARABLE_PSM,
)
print(f" {len(pool):,} after index adjustment")
# Build per-type KD-trees
trees: dict[str, tuple[KDTree, np.ndarray]] = {}
trees: dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]] = {}
for tg in TYPE_GROUPS:
sub = pool.filter(pl.col("type_group") == tg)
n = len(sub)
@ -111,19 +118,49 @@ def build_knn_pool(
lat = sub["lat"].to_numpy().astype(np.float64)
lon = sub["lon"].to_numpy().astype(np.float64)
psm = sub["_adj_psm"].to_numpy().astype(np.float64)
postcodes = sub["Postcode"].fill_null("").to_numpy()
prices = sub["Last known price"].to_numpy().astype(np.float64)
sale_dates = (
sub["Date of last transaction"]
.dt.epoch("d")
.fill_null(-1)
.to_numpy()
.astype(np.int64)
)
tree = KDTree(_scale_coords(lat, lon))
trees[tg] = (tree, psm)
trees[tg] = (tree, psm, postcodes, prices, sale_dates)
print(f" {tg}: {n:,}")
return trees
def _sale_identity_matches(
pool_postcodes: np.ndarray,
pool_prices: np.ndarray,
pool_sale_dates: np.ndarray,
target_postcode: str,
target_price: float,
target_sale_date: int,
) -> np.ndarray:
if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
return np.zeros(len(pool_postcodes), dtype=bool)
return (
(pool_postcodes == target_postcode)
& np.isfinite(pool_prices)
& np.isclose(pool_prices, target_price, rtol=0.0, atol=0.5)
& (pool_sale_dates == target_sale_date)
)
def knn_median_psm(
trees: dict[str, tuple[KDTree, np.ndarray]],
trees: dict[str, tuple[KDTree, np.ndarray, np.ndarray, np.ndarray, np.ndarray]],
lat: np.ndarray,
lon: np.ndarray,
type_groups: np.ndarray,
k: int = KNN_K,
postcodes: np.ndarray | None = None,
last_prices: np.ndarray | None = None,
last_sale_dates: np.ndarray | None = None,
) -> np.ndarray:
"""Return median adjusted-PSM of k nearest neighbours for each target.
@ -133,21 +170,41 @@ def knn_median_psm(
n = len(lat)
result = np.full(n, np.nan)
for tg, (tree, psm) in trees.items():
for tg, (tree, psm, pool_postcodes, pool_prices, pool_sale_dates) in trees.items():
mask = (type_groups == tg) & np.isfinite(lat) & np.isfinite(lon)
idx = np.where(mask)[0]
if len(idx) == 0:
continue
actual_k = min(k, len(psm))
if actual_k < KNN_MIN_NEIGHBORS:
query_k = min(max(k * 2, k + KNN_MIN_NEIGHBORS), len(psm))
if query_k < KNN_MIN_NEIGHBORS:
continue
coords = _scale_coords(lat[idx], lon[idx])
_, nn_idx = tree.query(coords, k=actual_k)
_, nn_idx = tree.query(coords, k=query_k)
if nn_idx.ndim == 1:
nn_idx = nn_idx.reshape(-1, 1)
result[idx] = np.nanmedian(psm[nn_idx], axis=1)
medians = np.full(len(idx), np.nan)
for row_num, target_idx in enumerate(idx):
candidates = nn_idx[row_num]
if (
postcodes is not None
and last_prices is not None
and last_sale_dates is not None
):
same_sale = _sale_identity_matches(
pool_postcodes[candidates],
pool_prices[candidates],
pool_sale_dates[candidates],
str(postcodes[target_idx] or ""),
float(last_prices[target_idx]),
int(last_sale_dates[target_idx]),
)
candidates = candidates[~same_sale]
if len(candidates) >= KNN_MIN_NEIGHBORS:
medians[row_num] = np.nanmedian(psm[candidates[:k]])
result[idx] = medians
return result

View file

@ -19,7 +19,7 @@ TERRACE_TYPES = [
"Terraced",
]
FLAT_TYPES = ["Flats/Maisonettes"]
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
SHRINKAGE_K = 50
@ -30,8 +30,6 @@ def type_group_expr():
.then(pl.lit("Terraced"))
.when(pl.col("Property type").is_in(FLAT_TYPES))
.then(pl.lit("Flats"))
.when(pl.col("Property type") == "Bungalow")
.then(pl.lit("Bungalow"))
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"]))
.then(pl.col("Property type"))
.otherwise(pl.lit(None))
@ -61,7 +59,7 @@ def hierarchy_keys(sector: str) -> tuple[str, str]:
return district, area
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats"]
def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:

View file

@ -15,12 +15,21 @@ DROP_CATEGORIES = {
"amenity/bicycle_parking",
"amenity/binoculars",
"amenity/boot_scraper",
"amenity/bus_garage",
"amenity/check_in",
"amenity/clock",
"amenity/clothes_dryer",
"amenity/coast_guard",
"amenity/coffin_rest",
"amenity/compressed_air",
"amenity/court_yard",
"amenity/donation_box",
"amenity/dressing_room",
"amenity/drinking_water",
"emergency/water_tank",
"leisure/bleachers",
"leisure/schoolyard",
"public_transport/pay_scale_area",
"shop/taxi",
"amenity/feeding_place",
"amenity/fixme",
@ -31,6 +40,7 @@ DROP_CATEGORIES = {
"amenity/lounge",
"tourism/preserved_railway",
"amenity/lounger",
"leisure/sport",
"amenity/motorcycle_parking",
"amenity/mounting_block",
"amenity/notice_board",
@ -71,8 +81,12 @@ DROP_CATEGORIES = {
"amenity/boat_storage",
"amenity/bureau_de_change",
"amenity/bus_station",
"amenity/beachhut",
"amenity/canteen",
"amenity/conference_centre",
"amenity/crematorium",
"amenity/disused",
"amenity/driver_training",
"amenity/driving_school",
"amenity/escooter_rental",
"amenity/ferry_terminal",
@ -82,14 +96,21 @@ DROP_CATEGORIES = {
"amenity/kick-scooter_rental",
"amenity/money_transfer",
"amenity/post_depot",
"amenity/prison",
"amenity/public_building",
"amenity/recycling",
"amenity/scout_hut",
"amenity/social_facility",
"amenity/studio",
"amenity/student_accommodation",
"amenity/taxi",
"amenity/telephone_exchange",
"amenity/training",
"amenity/vehicle_inspection",
"amenity/waiting_room",
"amenity/yes",
"shop/disused",
"shop/no",
# Buildings (except church & university which are mapped)
"building/air_shaft",
"building/apartments",
@ -148,12 +169,14 @@ DROP_CATEGORIES = {
"emergency/yes",
"tourism/apartment",
"tourism/apartments",
"tourism/alpine_hut",
"tourism/camp_pitch",
"tourism/caravan_site",
"tourism/information",
"tourism/picnic_site",
"tourism/viewpoint",
"tourism/village_sign",
"tourism/wilderness_hut",
"tourism/yes",
# Public transport (from NaPTAN instead)
"public_transport/entrance",
@ -191,6 +214,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🍺",
[
"amenity/pub",
"amenity/beer_garden",
"amenity/biergarten",
"amenity/social_club",
"amenity/club",
"leisure/social_club",
@ -293,7 +318,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/miniature_golf",
"leisure/horse_riding",
"leisure/fishing",
"leisure/ice_rink",
"leisure/paddling_pool",
"leisure/practice_pitch",
"leisure/shooting_ground",
"leisure/stadium",
"leisure/swimming_pool",
"leisure/swimming_area",
"leisure/water_park",
"leisure/bathing_place",
],
@ -307,9 +338,11 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/amusement_arcade",
"leisure/adult_gaming_centre",
"leisure/escape_game",
"leisure/maze",
"leisure/trampoline_park",
"leisure/sauna",
"leisure/tanning_salon",
"shop/amusements",
"tourism/theme_park",
"amenity/bicycle_rental",
"amenity/boat_rental",
@ -345,6 +378,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"shop/bakery",
"shop/pastry",
"craft/bakery",
"craft/confectionery",
],
),
@ -364,6 +398,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"shop/greengrocer",
"shop/farm",
"shop/market",
"amenity/marketplace",
],
),
@ -424,6 +459,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/appliance",
"shop/electrical",
"shop/hifi",
"shop/vacuum_cleaner",
"shop/video_games",
"shop/games",
],
@ -444,7 +480,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"shop/doityourself",
"shop/hardware",
"shop/builders_merchant",
"shop/paint",
"shop/plumbing",
],
),
(
@ -462,11 +500,15 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/curtain",
"shop/flooring",
"shop/fireplace",
"shop/garden_furniture",
"shop/groundskeeping",
"shop/household",
"shop/household_linen",
"shop/houseware",
"shop/homeware",
"shop/interior_decoration",
"shop/lighting",
"shop/kitchenware",
"shop/window_blind",
],
),
@ -493,8 +535,11 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🏕️",
[
"shop/sports",
"shop/angling",
"shop/outdoor",
"shop/bicycle",
"shop/equestrian",
"shop/surf",
],
),
(
@ -532,9 +577,11 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/music",
"shop/musical_instrument",
"shop/antiques",
"shop/anime",
"shop/baby_goods",
"shop/fabric",
"shop/haberdashery",
"shop/hobby",
"shop/wool",
"shop/pottery",
],
@ -549,9 +596,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/bookmaker",
"shop/building_materials",
"shop/camera",
"shop/cannabis",
"shop/car",
"shop/caravan",
"shop/catalogue",
"shop/auction",
"shop/auction_house",
"shop/chandler",
"shop/collector",
"shop/copyshop",
"shop/country_store",
@ -560,6 +611,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/erotic",
"shop/esoteric",
"shop/fan",
"shop/fireworks",
"shop/fishing",
"shop/frame",
"shop/fuel",
@ -582,6 +634,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/scuba_diving",
"shop/security",
"shop/sewing",
"shop/ship_chandler",
"shop/signs",
"shop/storage_rental",
"shop/swimming_pool",
"shop/telecommunication",
@ -590,7 +644,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/tool_hire",
"shop/trade",
"shop/trophy",
"shop/truck",
"shop/vacant",
"shop/van",
"shop/video",
"shop/water_sports",
"shop/weapons",
@ -611,6 +667,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"shop/cosmetics",
"shop/massage",
"shop/perfumery",
"leisure/spa",
],
),
(
@ -757,6 +814,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"amenity/hospital",
"amenity/clinic",
"amenity/health_centre",
"healthcare/blood_donation",
"healthcare/hospital",
"healthcare/centre",
"healthcare/clinic",
@ -804,6 +863,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"amenity/care_home",
"amenity/nursing_home",
"amenity/retirement_home",
"healthcare/hospice",
"healthcare/nursing_home",
"office/home_care",
],
),
@ -848,6 +910,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"",
[
"amenity/place_of_worship",
"amenity/monastery",
"building/church",
],
),
@ -873,6 +936,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"📸",
[
"tourism/attraction",
"tourism/aquarium",
"amenity/fountain",
"amenity/courthouse",
"tourism/chalet",
@ -892,6 +956,7 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"building/university",
"amenity/kindergarten",
"amenity/childcare",
"office/tutoring",
],
),
(
@ -904,6 +969,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"tourism/guest_house",
"tourism/motel",
"tourism/camp_site",
"leisure/resort",
"tourism/holiday_park",
"tourism/self_catering",
],
),
(
@ -928,14 +996,19 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"craft/window_construction",
"craft/agricultural_engines",
"craft/atelier",
"craft/beekeeper",
"craft/blacksmith",
"craft/bookbinder",
"craft/boatbuilder",
"craft/caterer",
"craft/carpet_layer",
"craft/clockmaker",
"craft/handicraft",
"craft/jeweller",
"craft/metal_construction",
"craft/photographer",
"craft/photographic_laboratory",
"craft/plasterer",
"craft/pottery",
"craft/printer",
"craft/sawmill",
@ -946,22 +1019,28 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"craft/upholsterer",
"craft/watchmaker",
"craft/yes",
"amenity/workshop",
"shop/glaziery",
"shop/windows",
# Professional offices & estate agents
"shop/estate_agent",
"office/accountant",
"office/architect",
"office/auctioneer",
"office/builder",
"office/construction",
"office/construction_company",
"office/engineer",
"office/estate_agent",
"office/financial",
"office/financial_advisor",
"office/financial_services",
"office/insurance",
"office/lawyer",
"office/mortgage",
"office/property_management",
"office/solicitor",
"office/solicitors",
"office/surveyor",
"office/tax_advisor",
],
@ -972,6 +1051,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🏢",
[
"amenity/coworking_space",
"amenity/research_institute",
"office/administrative",
"office/advertising_agency",
"office/association",
"office/charity",
@ -997,12 +1078,15 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"office/notary",
"office/political_party",
"office/politician",
"office/publisher",
"office/quango",
"office/recruitment",
"office/religion",
"office/research",
"office/security",
"office/taxi",
"office/telecommunication",
"office/transport",
"office/union",
"office/university",
"office/vacant",
@ -1032,7 +1116,11 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"Community Centre",
"🤝",
[
"amenity/church_hall",
"amenity/clubhouse",
"amenity/community_centre",
"amenity/community_hall",
"amenity/scout_hall",
"amenity/social_centre",
"amenity/townhall",
],