"""Shared utilities for price estimation modules.""" from datetime import date import numpy as np import polars as pl CURRENT_YEAR = 2026 # Latest COMPLETE calendar year. The current year's transactions are only # partially reported (Land Registry lags ~2-3 months), so a sector's thin # partial-year repeat-sale set produces wild index betas (e.g. +334% in a # single sector). The index is SOLVED only on complete years (<= this) and # forward-filled/trend-extrapolated to CURRENT_YEAR, so current-value # projections follow the established trend instead of a partial-year spike. LATEST_COMPLETE_YEAR = CURRENT_YEAR - 1 _today = date.today() CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12 # Cap on log(index_ratio) to prevent wild estimates from thin sectors MAX_LOG_ADJUSTMENT = 3.0 # ~20x max price change TERRACE_TYPES = [ "Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace", "Terraced", ] FLAT_TYPES = ["Flats/Maisonettes"] TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"] SHRINKAGE_K = 50 # Temporal regularization for the repeat-sales index: a second-difference # (curvature) penalty lambda * sum((beta_t - 2*beta_{t-1} + beta_{t-2})^2) added # to the IRLS solve. A mild penalty damps single-year index spikes (which would # otherwise distort the estimate of any property whose last sale landed on a # noisy year) without flattening genuine multi-year trends. TEMPORAL_SMOOTHNESS_LAMBDA = 0.05 # Per-year support scaling for the temporal smoothness penalty. A flat lambda # is too weak for years with very few repeat-sale pairs: a sector can have # hundreds of pairs overall (so cell-level n/(n+k) shrinkage barely moves it) # yet have individual years estimated from 1-2 pairs, producing 2-7x # single-year index spikes. Each curvature row is therefore scaled by the # local pair support of its year triple: # lambda_eff = lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s) # where s is the minimum cross-year pair count among the triple's years. # Well-supported years (s >> SMOOTHNESS_SUPPORT_PAIRS) keep lambda_eff ~ # lambda0 (current behaviour); a year identified by a single pair gets # ~41x lambda0, pulling its beta strongly toward the local trend through its # neighbours. Same-year pairs cancel in the design and are not counted. SMOOTHNESS_SUPPORT_PAIRS = 40 def type_group_expr(): """Polars expression: Property type -> type_group.""" return ( pl.when(pl.col("Property type").is_in(TERRACE_TYPES)) .then(pl.lit("Terraced")) .when(pl.col("Property type").is_in(FLAT_TYPES)) .then(pl.lit("Flats")) .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])) .then(pl.col("Property type")) .otherwise(pl.lit(None)) .alias("type_group") ) def sector_expr(): """Polars expression: Postcode -> sector (drop last 2 chars, strip).""" return ( pl.col("Postcode") .str.slice(0, pl.col("Postcode").str.len_chars() - 2) .str.strip_chars() .alias("sector") ) def hierarchy_keys(sector: str) -> tuple[str, str]: """Return (district, area) for a sector string.""" district = sector.rsplit(" ", 1)[0] if " " in sector else sector area = "" for ch in district: if ch.isalpha(): area += ch else: break return district, area NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats"] def build_hedonic_features(df: pl.DataFrame) -> np.ndarray: """Build hedonic feature matrix: log(floor_area) + 4 type dummies (ref: Detached).""" fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32) log_fa = np.log(np.maximum(fa, 1.0)).reshape(-1, 1) tg = df["type_group"].to_numpy() parts = [log_fa] for t in NON_REF_TYPES: parts.append((tg == t).astype(np.float32).reshape(-1, 1)) return np.hstack(parts) def interpolate_log_index( index: pl.DataFrame, df: pl.DataFrame, sector_col: str, type_col: str, frac_year_col: str, output_alias: str, ) -> pl.DataFrame: """Join and interpolate log_index at fractional years. For frac_year 2019.75: joins index at year=2019 and year=2020, then linearly interpolates: 0.25*idx_2019 + 0.75*idx_2020. Falls back to floor or ceil when the other is missing. """ floor_col = f"_{output_alias}_floor" ceil_col = f"_{output_alias}_ceil" floor_year = f"_{output_alias}_floor_year" ceil_year = f"_{output_alias}_ceil_year" frac_col = f"_{output_alias}_frac" df = df.with_columns( pl.col(frac_year_col).floor().cast(pl.Int32).alias(floor_year), pl.col(frac_year_col).ceil().cast(pl.Int32).alias(ceil_year), (pl.col(frac_year_col) - pl.col(frac_year_col).floor()).alias(frac_col), ) df = join_type_stratified_index( df, index, sector_col, type_col, floor_year, floor_col ) df = join_type_stratified_index( df, index, sector_col, type_col, ceil_year, ceil_col ) # Interpolate: (1-frac)*floor + frac*ceil, with fallbacks df = df.with_columns( pl.when(pl.col(floor_col).is_not_null() & pl.col(ceil_col).is_not_null()) .then( (1.0 - pl.col(frac_col)) * pl.col(floor_col) + pl.col(frac_col) * pl.col(ceil_col) ) .when(pl.col(floor_col).is_not_null()) .then(pl.col(floor_col)) .when(pl.col(ceil_col).is_not_null()) .then(pl.col(ceil_col)) .otherwise(pl.lit(None)) .alias(output_alias), ).drop(floor_col, ceil_col, floor_year, ceil_year, frac_col) return df def extract_centroids(input_path) -> dict[str, tuple[float, float]]: """Compute mean lat/lon per postcode sector.""" print("Computing sector centroids...") df = ( pl.scan_parquet(input_path) .select("Postcode", "lat", "lon") .filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null()) .with_columns(sector_expr()) .group_by("sector") .agg(pl.col("lat").mean(), pl.col("lon").mean()) .collect() ) centroids = {} for row in df.iter_rows(named=True): centroids[row["sector"]] = (row["lat"], row["lon"]) print(f" {len(centroids):,} sector centroids") return centroids def join_type_stratified_index( df: pl.DataFrame, index: pl.DataFrame, sector_col: str, type_col: str, year_col: str, output_alias: str, ) -> pl.DataFrame: """Join price index with typed->All fallback. Returns df with `output_alias` column.""" idx_typed = index.filter(pl.col("type_group") != "All") idx_all = index.filter(pl.col("type_group") == "All") _typed = f"_{output_alias}_typed" _all = f"_{output_alias}_all" df = df.join( idx_typed.select( "sector", "type_group", "year", pl.col("log_index").alias(_typed) ), left_on=[sector_col, type_col, year_col], right_on=["sector", "type_group", "year"], how="left", ).join( idx_all.select("sector", "year", pl.col("log_index").alias(_all)), left_on=[sector_col, year_col], right_on=["sector", "year"], how="left", ) df = df.with_columns( pl.col(_typed).fill_null(pl.col(_all)).alias(output_alias), ).drop(_typed, _all) return df