Changes

2026-02-14 12:53:29 +00:00 · 2026-02-14 12:53:29 +00:00 · 128b3191e7
commit 128b3191e7
parent 3a3f899ea2
68 changed files with 28060 additions and 1152 deletions
--- a/pipeline/transform/price_index.py
+++ b/pipeline/transform/price_index.py
@@ -19,66 +19,38 @@ from scipy.sparse.linalg import lsqr
 from scipy.spatial import KDTree
 from tqdm import tqdm

+from pipeline.transform._price_utils import (
+    CURRENT_YEAR,
+    SHRINKAGE_K,
+    TYPE_GROUPS,
+    build_hedonic_features,
+    extract_centroids,
+    hierarchy_keys,
+    sector_expr,
+    type_group_expr,
+)
+
 # --- Constants ---
 MIN_PAIRS = 5
-SHRINKAGE_K = 50
 OUTLIER_THRESHOLD = 3.0  # hard pre-filter; Huber handles the rest
 HUBER_K = 1.345
 IRLS_ITERATIONS = 5
 SPATIAL_NEIGHBORS = 5
 SPATIAL_BLEND_K = 30
-CURRENT_YEAR = 2025
-
-TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
-TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
-AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
-AGE_LABELS = ["pre-1900", "1900-1929", "1930-1949", "1950-1966", "1967-1982", "1983-1999", "2000-2009", "2010+"]
-
-
-def type_group_expr():
-    """Polars expression: Property type → type_group."""
-    return (
-        pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
-        .when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
-        .when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
-        .otherwise(pl.lit(None))
-        .alias("type_group")
-    )
-
-
-def age_band_expr():
-    """Polars expression: Construction age (UInt16 year) → age band string."""
-    expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None))
-    for i, brk in enumerate(AGE_BREAKS):
-        expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i]))
-    return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
-
-
-def sector_expr():
-    """Polars expression: Postcode → sector (drop last 2 chars, strip)."""
-    return pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector")
-
-
-def hierarchy_keys(sector: str) -> tuple[str, str]:
-    """Return (district, area) for a sector string."""
-    district = sector.rsplit(" ", 1)[0] if " " in sector else sector
-    area = ""
-    for ch in district:
-        if ch.isalpha():
-            area += ch
-        else:
-            break
-    return district, area


 # --- Pair extraction ---

+
 def extract_pairs(input_path: Path) -> pl.DataFrame:
    print("Extracting repeat-sale pairs...")
    df = (
        pl.scan_parquet(input_path)
        .select("Postcode", "historical_prices", "Property type")
-        .filter(pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2)
+        .filter(
+            pl.col("Postcode").is_not_null(),
+            pl.col("historical_prices").list.len() >= 2,
+        )
        .with_columns(sector_expr(), type_group_expr())
        .collect()
    )
@@ -87,7 +59,9 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
    pairs = (
        df.lazy()
        .with_columns(
-            pl.col("historical_prices").list.slice(0, pl.col("historical_prices").list.len() - 1).alias("from_txn"),
+            pl.col("historical_prices")
+            .list.slice(0, pl.col("historical_prices").list.len() - 1)
+            .alias("from_txn"),
            pl.col("historical_prices").list.slice(1).alias("to_txn"),
        )
        .explode("from_txn", "to_txn")
@@ -98,10 +72,18 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
            pl.col("to_txn").struct.field("price").alias("price2"),
        )
        .select("sector", "type_group", "year1", "price1", "year2", "price2")
-        .filter(pl.col("price1") > 0, pl.col("price2") > 0, pl.col("year2") > pl.col("year1"))
+        .filter(
+            pl.col("price1") > 0,
+            pl.col("price2") > 0,
+            pl.col("year2") > pl.col("year1"),
+        )
        .with_columns(
-            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)).log().alias("log_ratio"),
-            (1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias("weight"),
+            (pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
+            .log()
+            .alias("log_ratio"),
+            (1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
+                "weight"
+            ),
        )
        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
        .collect()
@@ -118,31 +100,14 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
    return pairs


-# --- Sector centroids ---
-
-def extract_centroids(input_path: Path) -> dict[str, tuple[float, float]]:
-    print("Computing sector centroids...")
-    df = (
-        pl.scan_parquet(input_path)
-        .select("Postcode", "lat", "lon")
-        .filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
-        .with_columns(sector_expr())
-        .group_by("sector")
-        .agg(pl.col("lat").mean(), pl.col("lon").mean())
-        .collect()
-    )
-    centroids = {}
-    for row in df.iter_rows(named=True):
-        centroids[row["sector"]] = (row["lat"], row["lon"])
-    print(f"  {len(centroids):,} sector centroids")
-    return centroids
-
-
 # --- Robust IRLS solver ---

+
 def solve_robust_index(
-    years1: np.ndarray, years2: np.ndarray,
-    log_ratios: np.ndarray, base_weights: np.ndarray,
+    years1: np.ndarray,
+    years2: np.ndarray,
+    log_ratios: np.ndarray,
+    base_weights: np.ndarray,
 ) -> dict[int, float]:
    """IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
    n = len(years1)
@@ -205,11 +170,16 @@ def solve_robust_index(
 def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
    """Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
    groups = pairs.group_by(group_col).agg(
-        pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"),
+        pl.col("year1"),
+        pl.col("year2"),
+        pl.col("log_ratio"),
+        pl.col("weight"),
    )
    indices = {}
    n_pairs = {}
-    for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f"    {group_col}"):
+    for row in tqdm(
+        groups.iter_rows(named=True), total=len(groups), desc=f"    {group_col}"
+    ):
        key = row[group_col]
        y1 = np.array(row["year1"], dtype=np.int32)
        y2 = np.array(row["year2"], dtype=np.int32)
@@ -224,28 +194,28 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):

 # --- Hedonic model ---

-def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dict[int, float]:
+
+def compute_hedonic_index(
+    input_path: Path, min_year: int, max_year: int
+) -> dict[int, float]:
    """Two-step hedonic index: regress log(price) on features, average residual by year."""
    print("Computing hedonic index...")
    df = (
        pl.scan_parquet(input_path)
        .select(
-            "Last known price", "Date of last transaction", "Property type",
-            "Total floor area (sqm)", "Current energy rating",
-            "Number of bedrooms & living rooms", "Construction age",
+            "Last known price",
+            "Date of last transaction",
+            "Property type",
+            "Total floor area (sqm)",
        )
        .filter(
            pl.col("Last known price").is_not_null(),
            pl.col("Total floor area (sqm)").is_not_null(),
            pl.col("Total floor area (sqm)") > 0,
-            pl.col("Current energy rating").is_in(["A", "B", "C", "D", "E", "F", "G"]),
-            pl.col("Number of bedrooms & living rooms").is_not_null(),
-            pl.col("Construction age").is_not_null(),
        )
        .with_columns(
            pl.col("Date of last transaction").dt.year().alias("sale_year"),
            type_group_expr(),
-            age_band_expr(),
        )
        .filter(
            pl.col("type_group").is_not_null(),
@@ -261,29 +231,9 @@ def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dic
    log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
    sale_years = df["sale_year"].to_numpy()

-    # Build feature matrix
-    parts = []
-    # log(floor_area)
-    fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
-    parts.append(np.log(np.maximum(fa, 1.0)).reshape(-1, 1))
-    # Type dummies (ref: Detached)
-    tg = df["type_group"].to_numpy()
-    for t in ["Terraced", "Semi-Detached", "Flats"]:
-        parts.append((tg == t).astype(np.float32).reshape(-1, 1))
-    # EPC dummies (ref: D)
-    epc = df["Current energy rating"].to_numpy()
-    for r in ["A", "B", "C", "E", "F", "G"]:
-        parts.append((epc == r).astype(np.float32).reshape(-1, 1))
-    # Rooms
-    parts.append(df["Number of bedrooms & living rooms"].to_numpy().astype(np.float32).reshape(-1, 1))
-    # Age band dummies (ref: pre-1900)
-    ab = df["age_band"].to_numpy()
-    for band in AGE_LABELS[1:]:
-        parts.append((ab == band).astype(np.float32).reshape(-1, 1))
-    # Intercept
-    parts.append(np.ones((len(df), 1), dtype=np.float32))
-
-    F = np.hstack(parts)
+    # Build feature matrix (18 hedonic features + intercept)
+    X = build_hedonic_features(df)
+    F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
    print(f"  Feature matrix: {F.shape[0]:,} × {F.shape[1]}")

    # Step 1: regress log(price) on features → quality score
@@ -303,12 +253,15 @@ def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dic
    for y in hedonic:
        hedonic[y] -= base

-    print(f"  Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}")
+    print(
+        f"  Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
+    )
    return hedonic


 # --- Shrinkage ---

+
 def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
    w = n_pairs / (n_pairs + k)
    result = {}
@@ -320,9 +273,18 @@ def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) ->


 def apply_shrinkage(
-    sector_idx, sector_n, district_idx, district_n,
-    area_idx, area_n, national_idx, national_n,
-    hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
+    sector_idx,
+    sector_n,
+    district_idx,
+    district_n,
+    area_idx,
+    area_n,
+    national_idx,
+    national_n,
+    hedonic_idx,
+    all_sectors,
+    sector_to_dist,
+    dist_to_area,
 ):
    """Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
    # National → hedonic
@@ -361,8 +323,11 @@ def apply_shrinkage(

 # --- Spatial smoothing ---

+
 def spatial_smooth(
-    sector_indices: dict, centroids: dict, n_pairs_map: dict,
+    sector_indices: dict,
+    centroids: dict,
+    n_pairs_map: dict,
 ) -> dict:
    """Blend sparse sector indices with K nearest neighbors."""
    # Build coordinate arrays for sectors with centroids
@@ -420,6 +385,7 @@ def spatial_smooth(

 # --- Forward fill ---

+
 def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
    filled = {}
    last = 0.0
@@ -432,8 +398,11 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:

 # --- Main ---

+
 def main():
-    parser = argparse.ArgumentParser(description="Build improved repeat-sales price index")
+    parser = argparse.ArgumentParser(
+        description="Build improved repeat-sales price index"
+    )
    parser.add_argument("--input", type=Path, required=True)
    parser.add_argument("--output", type=Path, required=True)
    args = parser.parse_args()
@@ -474,8 +443,10 @@ def main():
        # National
        np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
        national_idx = solve_robust_index(
-            np_arrs["year1"].to_numpy(), np_arrs["year2"].to_numpy(),
-            np_arrs["log_ratio"].to_numpy(), np_arrs["weight"].to_numpy(),
+            np_arrs["year1"].to_numpy(),
+            np_arrs["year2"].to_numpy(),
+            np_arrs["log_ratio"].to_numpy(),
+            np_arrs["weight"].to_numpy(),
        )
        national_n = len(typed)
        print(f"  National: {len(national_idx)} years")
@@ -485,14 +456,25 @@ def main():
        area_idx, area_n = compute_indices_for_level(typed, "area")
        district_idx, district_n = compute_indices_for_level(typed, "district")
        sector_idx, sector_n = compute_indices_for_level(typed, "sector")
-        print(f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors")
+        print(
+            f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
+        )

        # Shrinkage
        print("  Applying shrinkage...")
        sector_shrunk = apply_shrinkage(
-            sector_idx, sector_n, district_idx, district_n,
-            area_idx, area_n, national_idx, national_n,
-            hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
+            sector_idx,
+            sector_n,
+            district_idx,
+            district_n,
+            area_idx,
+            area_n,
+            national_idx,
+            national_n,
+            hedonic_idx,
+            all_sectors,
+            sector_to_dist,
+            dist_to_area,
        )

        # Spatial smoothing
@@ -519,15 +501,22 @@ def main():

    result = pl.DataFrame(
        rows,
-        schema={"sector": pl.String, "type_group": pl.String, "year": pl.Int32,
-                "log_index": pl.Float64, "n_pairs": pl.Int64},
+        schema={
+            "sector": pl.String,
+            "type_group": pl.String,
+            "year": pl.Int32,
+            "log_index": pl.Float64,
+            "n_pairs": pl.Int64,
+        },
        orient="row",
    ).sort("type_group", "sector", "year")

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
-    print(f"  {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows")
+    print(
+        f"  {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows"
+    )


 if __name__ == "__main__":