perfect-postcode/pipeline/transform/price_estimation/utils.py
2026-02-15 22:39:54 +00:00

233 lines
7.4 KiB
Python

"""Shared utilities for price estimation modules."""
from datetime import date
from pathlib import Path
import numpy as np
import polars as pl
# Reference year for the estimation modules.
# NOTE(review): this is a pinned literal, unlike the date-derived values
# below - confirm it is meant to stay fixed rather than follow the clock.
CURRENT_YEAR = 2026

# Calendar snapshot taken once, when the module is imported.
_today = date.today()
CURRENT_MONTH = _today.month
# Year plus whole elapsed months as a fraction of a year
# (January -> year + 0/12, December -> year + 11/12).
CURRENT_FRAC_YEAR = _today.year + (CURRENT_MONTH - 1) / 12

# Cap on log(index_ratio) to prevent wild estimates from thin sectors.
MAX_LOG_ADJUSTMENT = 3.0  # exp(3.0) is roughly a 20x max price change

# Raw "Property type" labels that type_group_expr() folds into "Terraced".
TERRACE_TYPES = [
    "Mid-Terrace",
    "End-Terrace",
    "Enclosed Mid-Terrace",
    "Enclosed End-Terrace",
    "Terraced",
]

# Raw "Property type" labels that type_group_expr() folds into "Flats".
FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]

# The five type_group labels produced by type_group_expr().
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]

# NOTE(review): presumably the shrinkage strength used by estimators
# elsewhere in the package - it is not referenced in this module; confirm.
SHRINKAGE_K = 50
def type_group_expr():
    """Return a Polars expression mapping ``Property type`` to ``type_group``.

    Mapping, evaluated in this order:

    * any label in ``TERRACE_TYPES``      -> ``"Terraced"``
    * any label in ``FLAT_TYPES``         -> ``"Flats"``
    * ``"Bungalow"``                      -> ``"Bungalow"``
    * ``"Detached"`` / ``"Semi-Detached"`` -> kept unchanged
    * anything else                       -> null

    The resulting expression is aliased ``"type_group"``.
    """
    prop_type = pl.col("Property type")

    # Name each predicate so the chain below reads as a lookup table.
    is_terrace = prop_type.is_in(TERRACE_TYPES)
    is_flat = prop_type.is_in(FLAT_TYPES)
    is_bungalow = prop_type == "Bungalow"
    keeps_own_label = prop_type.is_in(["Detached", "Semi-Detached"])

    grouped = (
        pl.when(is_terrace)
        .then(pl.lit("Terraced"))
        .when(is_flat)
        .then(pl.lit("Flats"))
        .when(is_bungalow)
        .then(pl.lit("Bungalow"))
        .when(keeps_own_label)
        .then(prop_type)
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")
def sector_expr():
    """Return a Polars expression deriving ``sector`` from ``Postcode``.

    The last two characters of the postcode are dropped and surrounding
    whitespace is stripped, e.g. ``"SW1A 1AA"`` -> ``"SW1A 1"``. The
    expression is aliased ``"sector"``.
    """
    postcode = pl.col("Postcode")
    # Number of leading characters to keep: everything but the final two.
    kept_length = postcode.str.len_chars() - 2
    without_unit = postcode.str.slice(0, kept_length)
    return without_unit.str.strip_chars().alias("sector")
def hierarchy_keys(sector: str) -> tuple[str, str]:
    """Return ``(district, area)`` for a postcode sector string.

    Args:
        sector: Sector string such as ``"SW1A 1"``.

    Returns:
        A ``(district, area)`` tuple. ``district`` is everything before the
        last space in ``sector`` (or all of ``sector`` when it contains no
        space). ``area`` is the leading run of alphabetic characters of the
        district, which is empty when the district does not start with a
        letter.

    Examples:
        >>> hierarchy_keys("SW1A 1")
        ('SW1A', 'SW')
        >>> hierarchy_keys("M1")
        ('M1', 'M')
    """
    # rpartition splits on the last space in one pass; an empty separator
    # means there was no space, so the whole sector is the district.
    head, sep, _ = sector.rpartition(" ")
    district = head if sep else sector

    # Find where the alphabetic prefix ends and slice once, instead of
    # growing a string one character at a time.
    prefix_len = next(
        (pos for pos, ch in enumerate(district) if not ch.isalpha()),
        len(district),
    )
    return district, district[:prefix_len]
# Type groups that receive a dummy column in build_hedonic_features().
# "Detached" is left out because it is the reference category.
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"]


def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
    """Build the hedonic feature matrix for the rows of ``df``.

    Reads the ``"Total floor area (sqm)"`` and ``"type_group"`` columns.

    Returns:
        A 2-D float32 array with one row per input row and five columns:
        ``log(floor_area)`` followed by one 0/1 indicator per entry of
        ``NON_REF_TYPES`` (in that order). Detached rows are all-zero in the
        indicator columns.
    """
    floor_area = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
    # Floor the area at 1 sqm so the log is never negative or undefined.
    log_area = np.log(np.maximum(floor_area, 1.0))

    groups = df["type_group"].to_numpy()
    dummies = [(groups == label).astype(np.float32) for label in NON_REF_TYPES]

    # column_stack turns each 1-D vector into a column of the result.
    return np.column_stack([log_area, *dummies])
def interpolate_log_index(
    index: pl.DataFrame,
    df: pl.DataFrame,
    sector_col: str,
    type_col: str,
    frac_year_col: str,
    output_alias: str,
) -> pl.DataFrame:
    """Attach a log price index linearly interpolated at a fractional year.

    The index is looked up (via ``join_type_stratified_index``) at the
    integer years just below and just above ``frac_year_col`` and blended by
    the fractional part. For a fractional year of 2019.75 the result is
    ``0.25 * idx_2019 + 0.75 * idx_2020``. If only one of the two lookups
    succeeds, that value is used as-is; if neither does, the result is null.

    Args:
        index: Price index table passed through to
            ``join_type_stratified_index``.
        df: Rows to annotate.
        sector_col: Column of ``df`` holding the sector key.
        type_col: Column of ``df`` holding the type-group key.
        frac_year_col: Column of ``df`` holding the fractional year.
        output_alias: Name of the column to add.

    Returns:
        ``df`` with one extra column named ``output_alias``; all scratch
        columns are dropped again.
    """
    # Scratch column names are derived from the alias so that two calls with
    # different aliases do not clash.
    lo_value = f"_{output_alias}_floor"
    hi_value = f"_{output_alias}_ceil"
    lo_year = f"_{output_alias}_floor_year"
    hi_year = f"_{output_alias}_ceil_year"
    weight = f"_{output_alias}_frac"

    frac_year = pl.col(frac_year_col)
    year_below = frac_year.floor()
    out = df.with_columns(
        year_below.cast(pl.Int32).alias(lo_year),
        frac_year.ceil().cast(pl.Int32).alias(hi_year),
        (frac_year - year_below).alias(weight),
    )

    # Look the index up at the lower bracket year first, then the upper one.
    for year_name, value_name in ((lo_year, lo_value), (hi_year, hi_value)):
        out = join_type_stratified_index(
            out, index, sector_col, type_col, year_name, value_name
        )

    lo = pl.col(lo_value)
    hi = pl.col(hi_value)
    w = pl.col(weight)
    both_present = lo.is_not_null() & hi.is_not_null()
    blended = (1.0 - w) * lo + w * hi

    # When one side is missing, coalesce picks the floor value if present,
    # otherwise the ceil value (null when both are missing).
    interpolated = (
        pl.when(both_present)
        .then(blended)
        .otherwise(pl.coalesce(lo, hi))
        .alias(output_alias)
    )
    return out.with_columns(interpolated).drop(
        lo_value, hi_value, lo_year, hi_year, weight
    )
def extract_centroids(input_path: "str | Path") -> dict[str, tuple[float, float]]:
    """Compute the mean latitude/longitude of each postcode sector.

    Args:
        input_path: Parquet source containing ``Postcode``, ``lat`` and
            ``lon`` columns.

    Returns:
        Mapping of sector -> ``(mean_lat, mean_lon)``. Rows missing the
        postcode or either coordinate are ignored, so every centroid is
        averaged over the same set of rows and never contains ``None``.
        A sector with no fully-populated row is absent from the mapping.
    """
    print("Computing sector centroids...")
    per_sector = (
        pl.scan_parquet(input_path)
        .select("Postcode", "lat", "lon")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("lat").is_not_null(),
            # Require lon as well: otherwise lat and lon would be averaged
            # over different rows, and a sector with no lon at all would
            # produce (lat, None).
            pl.col("lon").is_not_null(),
        )
        .with_columns(sector_expr())
        .group_by("sector")
        .agg(pl.col("lat").mean(), pl.col("lon").mean())
        .collect()
    )
    # Columns come back as (sector, lat, lon): the group key, then the
    # aggregations in the order given above.
    centroids = {
        sector: (lat, lon) for sector, lat, lon in per_sector.iter_rows()
    }
    print(f" {len(centroids):,} sector centroids")
    return centroids
def join_type_stratified_index(
    df: pl.DataFrame,
    index: pl.DataFrame,
    sector_col: str,
    type_col: str,
    year_col: str,
    output_alias: str,
) -> pl.DataFrame:
    """Look up ``log_index`` per row, preferring the type-specific value.

    ``index`` is read through its ``sector``, ``type_group``, ``year`` and
    ``log_index`` columns. Rows whose ``type_group`` is ``"All"`` form the
    fallback index; all other rows form the type-specific index.

    Args:
        df: Rows to annotate.
        index: Price index table.
        sector_col: Column of ``df`` matched against ``index["sector"]``.
        type_col: Column of ``df`` matched against ``index["type_group"]``.
        year_col: Column of ``df`` matched against ``index["year"]``.
        output_alias: Name of the column to add.

    Returns:
        ``df`` with an ``output_alias`` column holding the type-specific
        ``log_index`` where one exists for (sector, type, year), otherwise
        the ``"All"`` value for (sector, year), otherwise null.
    """
    typed_value = f"_{output_alias}_typed"
    pooled_value = f"_{output_alias}_all"

    # Split the index into its type-specific and pooled ("All") halves, each
    # exposing log_index under its own scratch name.
    typed_lookup = index.filter(pl.col("type_group") != "All").select(
        "sector", "type_group", "year", pl.col("log_index").alias(typed_value)
    )
    pooled_lookup = index.filter(pl.col("type_group") == "All").select(
        "sector", "year", pl.col("log_index").alias(pooled_value)
    )

    with_typed = df.join(
        typed_lookup,
        left_on=[sector_col, type_col, year_col],
        right_on=["sector", "type_group", "year"],
        how="left",
    )
    with_both = with_typed.join(
        pooled_lookup,
        left_on=[sector_col, year_col],
        right_on=["sector", "year"],
        how="left",
    )

    # Typed value wins; the pooled value only fills the gaps.
    resolved = pl.col(typed_value).fill_null(pl.col(pooled_value))
    return with_both.with_columns(resolved.alias(output_alias)).drop(
        typed_value, pooled_value
    )
def compute_seasonal_factors(
    input_path: Path, max_sale_year: int | None = None
) -> np.ndarray:
    """Compute 12 multiplicative monthly price factors from price-per-sqm.

    Detrends by dividing each (year, month) median price per sqm by the mean
    of that year's monthly medians, then averages the ratios per calendar
    month across years.

    Note that a year's mean is taken over the months that actually have
    data, so partially covered years are normalised against those months
    only.

    Args:
        input_path: Parquet source with ``Last known price``,
            ``Total floor area (sqm)`` and ``Date of last transaction``.
            Rows with a null or non-positive price or floor area, or a null
            date, are excluded.
        max_sale_year: When given, only sales with year strictly below this
            value are used.

    Returns:
        Float64 array of exactly 12 factors (index 0 = January), normalised
        so its mean is 1.0. A calendar month with no qualifying sales gets a
        neutral factor of 1.0 before normalisation, so the array always has
        12 entries aligned to month number.
    """
    query = (
        pl.scan_parquet(input_path)
        .select("Last known price", "Total floor area (sqm)", "Date of last transaction")
        .filter(
            pl.col("Last known price").is_not_null(),
            pl.col("Last known price") > 0,
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
            pl.col("Date of last transaction").is_not_null(),
        )
        .with_columns(
            (
                pl.col("Last known price").cast(pl.Float64)
                / pl.col("Total floor area (sqm)").cast(pl.Float64)
            ).alias("psm"),
            pl.col("Date of last transaction").dt.month().alias("month"),
            pl.col("Date of last transaction").dt.year().alias("year"),
        )
    )
    if max_sale_year is not None:
        query = query.filter(pl.col("year") < max_sale_year)

    monthly = (
        query.group_by("year", "month")
        .agg(pl.col("psm").median().alias("median_psm"))
        .with_columns(
            pl.col("median_psm").mean().over("year").alias("year_mean"),
        )
        .with_columns(
            (pl.col("median_psm") / pl.col("year_mean")).alias("ratio"),
        )
        .group_by("month")
        .agg(pl.col("ratio").mean().alias("factor"))
        .sort("month")
        .collect()
    )

    # Place each factor in the slot for its month number rather than relying
    # on row position: a month with no data would otherwise shorten the
    # array and shift every later month one slot early.
    factors = np.ones(12, dtype=np.float64)
    slots = monthly["month"].to_numpy().astype(np.intp) - 1
    factors[slots] = monthly["factor"].to_numpy().astype(np.float64)
    return factors / factors.mean()