"""Shared utilities for price index, price estimate, and renovation premium scripts.""" import numpy as np import polars as pl CURRENT_YEAR = 2025 TERRACE_TYPES = [ "Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace", "Terraced", ] FLAT_TYPES = ["Flats/Maisonettes", "Flat", "Maisonette"] TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats", "Bungalow"] SHRINKAGE_K = 50 def type_group_expr(): """Polars expression: Property type -> type_group.""" return ( pl.when(pl.col("Property type").is_in(TERRACE_TYPES)) .then(pl.lit("Terraced")) .when(pl.col("Property type").is_in(FLAT_TYPES)) .then(pl.lit("Flats")) .when(pl.col("Property type") == "Bungalow") .then(pl.lit("Bungalow")) .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])) .then(pl.col("Property type")) .otherwise(pl.lit(None)) .alias("type_group") ) def sector_expr(): """Polars expression: Postcode -> sector (drop last 2 chars, strip).""" return ( pl.col("Postcode") .str.slice(0, pl.col("Postcode").str.len_chars() - 2) .str.strip_chars() .alias("sector") ) def hierarchy_keys(sector: str) -> tuple[str, str]: """Return (district, area) for a sector string.""" district = sector.rsplit(" ", 1)[0] if " " in sector else sector area = "" for ch in district: if ch.isalpha(): area += ch else: break return district, area AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010] AGE_LABELS = [ "pre-1900", "1900-1929", "1930-1949", "1950-1966", "1967-1982", "1983-1999", "2000-2009", "2010+", ] HEDONIC_COLUMNS = [ "Last known price", "Date of last transaction", "Property type", "Total floor area (sqm)", "Postcode", ] def age_band_expr(): """Polars expression: Construction age (UInt16 year) → age band string.""" expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None)) for i, brk in enumerate(AGE_BREAKS): expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i])) return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band") NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats", "Bungalow"] def build_hedonic_features(df: pl.DataFrame) -> np.ndarray: """Build hedonic feature matrix from a DataFrame with type_group column. Columns (5 total): log(floor_area), 4 type dummies (ref: Detached). Sector fixed effects do the heavy lifting — additional property features (EPC, rooms, age) add no predictive value after sector demeaning. """ fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32) log_fa = np.log(np.maximum(fa, 1.0)).reshape(-1, 1) tg = df["type_group"].to_numpy() parts = [log_fa] for t in NON_REF_TYPES: parts.append((tg == t).astype(np.float32).reshape(-1, 1)) return np.hstack(parts) def extract_centroids(input_path) -> dict[str, tuple[float, float]]: """Compute mean lat/lon per postcode sector.""" print("Computing sector centroids...") df = ( pl.scan_parquet(input_path) .select("Postcode", "lat", "lon") .filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null()) .with_columns(sector_expr()) .group_by("sector") .agg(pl.col("lat").mean(), pl.col("lon").mean()) .collect() ) centroids = {} for row in df.iter_rows(named=True): centroids[row["sector"]] = (row["lat"], row["lon"]) print(f" {len(centroids):,} sector centroids") return centroids