"""Shared utilities for price index, price estimate, and renovation premium scripts."""
|
|
|
|
from itertools import takewhile

import numpy as np
import polars as pl
|
|
|
|
# Hard-coded reference year; it is not read from the clock and is not used by
# the code visible in this module.
CURRENT_YEAR = 2025

# Raw "Property type" values that type_group_expr() collapses into "Terraced".
TERRACE_TYPES = [
    "Mid-Terrace",
    "End-Terrace",
    "Enclosed Mid-Terrace",
    "Enclosed End-Terrace",
    "Terraced",
]

# Raw "Property type" values that type_group_expr() collapses into "Flats".
FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"]

# The five labels type_group_expr() can produce.
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"]

# Not referenced by the code visible in this module; presumably a shrinkage
# strength used by the importing scripts -- TODO confirm against callers.
SHRINKAGE_K = 50
|
|
|
|
|
|
def type_group_expr():
    """Build the Polars expression mapping "Property type" to "type_group".

    Terrace variants (``TERRACE_TYPES``) collapse to "Terraced" and
    flat/maisonette variants (``FLAT_TYPES``) to "Flats". "Bungalow",
    "Detached" and "Semi-Detached" keep their own label. Every other value
    yields null.
    """
    prop_type = pl.col("Property type")

    is_terrace = prop_type.is_in(TERRACE_TYPES)
    is_flat = prop_type.is_in(FLAT_TYPES)
    is_bungalow = prop_type == "Bungalow"
    keeps_own_label = prop_type.is_in(["Detached", "Semi-Detached"])

    # Branches are evaluated in this order; the first matching one wins.
    grouped = (
        pl.when(is_terrace)
        .then(pl.lit("Terraced"))
        .when(is_flat)
        .then(pl.lit("Flats"))
        .when(is_bungalow)
        .then(pl.lit("Bungalow"))
        .when(keeps_own_label)
        .then(prop_type)
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")
|
|
|
|
|
|
def sector_expr():
    """Build the Polars expression deriving "sector" from "Postcode".

    The sector is the postcode with its final two characters removed and
    any surrounding whitespace stripped.
    """
    postcode = pl.col("Postcode")
    # Number of leading characters to keep: everything except the last two.
    keep_chars = postcode.str.len_chars() - 2
    without_tail = postcode.str.slice(0, keep_chars)
    return without_tail.str.strip_chars().alias("sector")
|
|
|
|
|
|
def hierarchy_keys(sector: str) -> tuple[str, str]:
    """Return ``(district, area)`` for a sector string.

    The district is everything before the last space in ``sector``; a
    sector containing no space is used as the district unchanged. The area
    is the run of alphabetic characters at the start of the district.

    Example: ``"SW1A 1"`` -> ``("SW1A", "SW")``.
    """
    # rsplit returns the whole string when there is no space, so the
    # no-space case needs no separate branch.
    district = sector.rsplit(" ", 1)[0]
    area = "".join(takewhile(str.isalpha, district))
    return district, area
|
|
|
|
|
|
# Exclusive upper bounds of the construction-year bands, in ascending order.
# age_band_expr() assigns a year to the first band whose break it is below.
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]

# One label per entry of AGE_BREAKS, plus a final open-ended label for years
# at or above the last break.
AGE_LABELS = [
    "pre-1900",
    "1900-1929",
    "1930-1949",
    "1950-1966",
    "1967-1982",
    "1983-1999",
    "2000-2009",
    "2010+",
]

# Source column names. Not referenced by the code visible in this module;
# presumably the columns loaded for the hedonic model -- TODO confirm.
HEDONIC_COLUMNS = [
    "Last known price",
    "Date of last transaction",
    "Property type",
    "Total floor area (sqm)",
    "Postcode",
]
|
|
|
|
|
|
def age_band_expr():
    """Build the Polars expression mapping "Construction age" to "age_band".

    "Construction age" is expected to hold a build year (UInt16). A null
    year gives a null band; otherwise the year gets the label of the first
    entry in ``AGE_BREAKS`` it is strictly below, or the last label in
    ``AGE_LABELS`` when it is below none of them.
    """
    year = pl.col("Construction age")
    # Handle nulls first: a null comparison matches no break, so without this
    # branch a null year would end up in the final open-ended band.
    bands = pl.when(year.is_null()).then(pl.lit(None))
    for upper_bound, label in zip(AGE_BREAKS, AGE_LABELS):
        bands = bands.when(year < upper_bound).then(pl.lit(label))
    return bands.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
|
|
|
|
|
|
# Type groups that get their own dummy column in build_hedonic_features();
# "Detached" is absent because it is the reference category.
NON_REF_TYPES = [
    "Terraced",
    "Semi-Detached",
    "Flats",
    "Bungalow",
]
|
|
|
|
|
|
def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
    """Assemble the hedonic feature matrix for ``df``.

    ``df`` must provide "Total floor area (sqm)" and "type_group" columns.
    The result has one row per input row and five float32 columns:
    log(floor area) followed by one 0/1 indicator per entry of
    ``NON_REF_TYPES`` ("Detached" is the omitted reference level).

    Sector fixed effects do the heavy lifting — additional property features
    (EPC, rooms, age) add no predictive value after sector demeaning.
    """
    floor_area = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
    # Areas below 1 sqm are clamped to 1 so the log is never negative.
    log_area = np.log(np.maximum(floor_area, 1.0))

    type_group = df["type_group"].to_numpy()
    dummies = [(type_group == kind).astype(np.float32) for kind in NON_REF_TYPES]

    # column_stack turns each 1-D vector into one column of the matrix.
    return np.column_stack([log_area, *dummies])
|
|
|
|
|
|
def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
    """Compute the mean latitude/longitude of each postcode sector.

    Args:
        input_path: Parquet source handed to ``pl.scan_parquet``; it must
            provide "Postcode", "lat" and "lon" columns.

    Returns:
        Mapping of sector string -> ``(mean lat, mean lon)``.

    Rows missing the postcode or either coordinate are excluded, so both
    means of a sector are taken over the same set of rows.
    """
    print("Computing sector centroids...")
    per_sector = (
        pl.scan_parquet(input_path)
        .select("Postcode", "lat", "lon")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("lat").is_not_null(),
            # A row with only one coordinate would shift one mean but not the
            # other, and an all-null "lon" group would produce a null mean.
            pl.col("lon").is_not_null(),
        )
        .with_columns(sector_expr())
        .group_by("sector")
        .agg(pl.col("lat").mean(), pl.col("lon").mean())
        .collect()
    )
    centroids = {
        row["sector"]: (row["lat"], row["lon"])
        for row in per_sector.iter_rows(named=True)
    }
    print(f" {len(centroids):,} sector centroids")
    return centroids
|