Changes
This commit is contained in:
parent
6c90cf3c0f
commit
5b68c8da04
14 changed files with 687 additions and 551 deletions
|
|
@ -3,6 +3,7 @@
|
|||
Test set: properties with 2+ sales where the last sale is 2022-2025.
|
||||
Uses the second-to-last sale as input, predicts the last sale price.
|
||||
Compares index-based prediction against a naive baseline (raw input price).
|
||||
Uses type-stratified index when available, falling back to "All" type.
|
||||
|
||||
Output: backtest_results.parquet with predictions vs actuals.
|
||||
"""
|
||||
|
|
@ -15,6 +16,17 @@ import polars as pl
|
|||
|
||||
CURRENT_YEAR = 2025
|
||||
TEST_YEAR_MIN = 2022
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
return (
|
||||
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
|
||||
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
|
||||
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("type_group")
|
||||
)
|
||||
|
||||
|
||||
def extract_test_set(input_path: Path) -> pl.DataFrame:
|
||||
|
|
@ -22,13 +34,14 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
print("Loading test set...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices")
|
||||
.select("Postcode", "historical_prices", "Property type")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
type_group_expr(),
|
||||
# Last sale (ground truth)
|
||||
pl.col("historical_prices").list.last().struct.field("year").alias("actual_year"),
|
||||
pl.col("historical_prices").list.last().struct.field("price").alias("actual_price"),
|
||||
|
|
@ -49,21 +62,60 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
|
||||
|
||||
def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Index-based prediction: adjust input price by sector index change."""
|
||||
# Join index at input year
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join index at actual year
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
"""Index-based prediction with type-stratified fallback."""
|
||||
has_type_group = "type_group" in index.columns
|
||||
|
||||
if has_type_group:
|
||||
idx_typed = index.filter(pl.col("type_group") != "All")
|
||||
idx_all = index.filter(pl.col("type_group") == "All")
|
||||
|
||||
# Join type-specific index at input year
|
||||
test = test.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_in_typed")),
|
||||
left_on=["sector", "type_group", "input_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at input year
|
||||
test = test.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li_in_all")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join type-specific index at actual year
|
||||
test = test.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("li_act_typed")),
|
||||
left_on=["sector", "type_group", "actual_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at actual year
|
||||
test = test.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("li_act_all")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
pl.col("li_in_typed").fill_null(pl.col("li_in_all")).alias("log_index_input"),
|
||||
pl.col("li_act_typed").fill_null(pl.col("li_act_all")).alias("log_index_actual"),
|
||||
)
|
||||
else:
|
||||
# Unstratified index
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_input")),
|
||||
left_on=["sector", "input_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
test = test.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_actual")),
|
||||
left_on=["sector", "actual_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
test = test.with_columns(
|
||||
(
|
||||
|
|
@ -75,7 +127,6 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
|
||||
|
||||
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
||||
"""Compute error metrics."""
|
||||
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0)
|
||||
actual = actual[valid]
|
||||
predicted = predicted[valid]
|
||||
|
|
@ -95,7 +146,6 @@ def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
|
|||
|
||||
|
||||
def print_metrics_table(metrics_by_stage: dict):
|
||||
"""Print a comparison table of metrics."""
|
||||
print("\n" + "=" * 55)
|
||||
print("BACKTEST RESULTS")
|
||||
print("=" * 55)
|
||||
|
|
@ -103,7 +153,6 @@ def print_metrics_table(metrics_by_stage: dict):
|
|||
metric_names = ["MdAPE (%)", "% within 10%", "% within 20%", "% within 30%", "MAE (£)", "Mean signed error (£)", "n"]
|
||||
stages = list(metrics_by_stage.keys())
|
||||
|
||||
# Header
|
||||
header = f"{'Metric':<25s}"
|
||||
for stage in stages:
|
||||
header += f" {stage:>14s}"
|
||||
|
|
@ -133,7 +182,12 @@ def main():
|
|||
args = parser.parse_args()
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
has_type_group = "type_group" in index.columns
|
||||
if has_type_group:
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups")
|
||||
else:
|
||||
print(f"Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
|
||||
test = extract_test_set(args.input)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,10 @@
|
|||
"""Apply repeat-sales price index to estimate current property prices.
|
||||
"""Augment wide.parquet with an estimated current price column.
|
||||
|
||||
Joins the precomputed price index (from price_index.py) with each property's
|
||||
last known sale to produce an inflation-adjusted current price estimate.
|
||||
Joins the precomputed repeat-sales price index (from price_index.py) with each
|
||||
property's last known sale to produce an inflation-adjusted current price estimate.
|
||||
Uses type-stratified index when available, falling back to "All" type.
|
||||
|
||||
Output: estimated_prices.parquet with per-property estimates.
|
||||
Modifies wide.parquet in-place, adding the "Estimated current price" column.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -12,78 +13,133 @@ from pathlib import Path
|
|||
import polars as pl
|
||||
|
||||
CURRENT_YEAR = 2025
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
return (
|
||||
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
|
||||
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
|
||||
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("type_group")
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Estimate current property prices")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
|
||||
parser = argparse.ArgumentParser(description="Augment wide.parquet with estimated current prices")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet (modified in-place)")
|
||||
parser.add_argument("--index", type=Path, required=True, help="Path to price_index.parquet")
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output estimated_prices.parquet")
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Loading property data...")
|
||||
df = (
|
||||
pl.scan_parquet(args.input)
|
||||
.select("Postcode", "Address per Property Register", "Last known price", "Date of last transaction")
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Postcode").is_not_null(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||||
)
|
||||
.collect()
|
||||
print("Loading wide.parquet...")
|
||||
df = pl.read_parquet(args.input)
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns")
|
||||
|
||||
# Drop existing estimated price column if re-running
|
||||
if "Estimated current price" in df.columns:
|
||||
df = df.drop("Estimated current price")
|
||||
|
||||
# Derive helper columns for the join
|
||||
has_price = (
|
||||
pl.col("Last known price").is_not_null()
|
||||
& pl.col("Postcode").is_not_null()
|
||||
& pl.col("Date of last transaction").is_not_null()
|
||||
)
|
||||
|
||||
df = df.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("_sector"),
|
||||
pl.col("Date of last transaction").dt.year().alias("_sale_year"),
|
||||
type_group_expr().alias("_type_group"),
|
||||
)
|
||||
print(f" {len(df):,} properties with known price and postcode")
|
||||
|
||||
index = pl.read_parquet(args.index)
|
||||
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors")
|
||||
has_type_group = "type_group" in index.columns
|
||||
if has_type_group:
|
||||
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
|
||||
f"{index['type_group'].n_unique()} type groups")
|
||||
else:
|
||||
print(f" Price index: {len(index):,} rows, {index['sector'].n_unique():,} sectors (unstratified)")
|
||||
|
||||
print("\nApplying repeat-sales index...")
|
||||
|
||||
# Join index at sale year
|
||||
df = df.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("log_index_sale")),
|
||||
left_on=["sector", "sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
if has_type_group:
|
||||
idx_typed = index.filter(pl.col("type_group") != "All")
|
||||
idx_all = index.filter(pl.col("type_group") == "All")
|
||||
|
||||
# Join index at current year
|
||||
index_current = (
|
||||
index.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", pl.col("log_index").alias("log_index_current"))
|
||||
)
|
||||
df = df.join(index_current, on="sector", how="left")
|
||||
|
||||
# Compute estimate; fall back to raw price when no index available
|
||||
df = df.with_columns(
|
||||
(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
* (pl.col("log_index_current") - pl.col("log_index_sale")).exp()
|
||||
# Join type-specific index at sale year
|
||||
df = df.join(
|
||||
idx_typed.select("sector", "type_group", "year", pl.col("log_index").alias("log_idx_sale_typed")),
|
||||
left_on=["_sector", "_type_group", "_sale_year"],
|
||||
right_on=["sector", "type_group", "year"],
|
||||
how="left",
|
||||
)
|
||||
.fill_null(pl.col("Last known price").cast(pl.Float64))
|
||||
.alias("estimated_price"),
|
||||
# Join "All" index at sale year
|
||||
df = df.join(
|
||||
idx_all.select("sector", "year", pl.col("log_index").alias("log_idx_sale_all")),
|
||||
left_on=["_sector", "_sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
# Join type-specific index at current year
|
||||
df = df.join(
|
||||
idx_typed.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", "type_group", pl.col("log_index").alias("log_idx_cur_typed")),
|
||||
left_on=["_sector", "_type_group"],
|
||||
right_on=["sector", "type_group"],
|
||||
how="left",
|
||||
)
|
||||
# Join "All" index at current year
|
||||
df = df.join(
|
||||
idx_all.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", pl.col("log_index").alias("log_idx_cur_all")),
|
||||
left_on="_sector",
|
||||
right_on="sector",
|
||||
how="left",
|
||||
)
|
||||
|
||||
df = df.with_columns(
|
||||
pl.col("log_idx_sale_typed").fill_null(pl.col("log_idx_sale_all")).alias("_log_index_sale"),
|
||||
pl.col("log_idx_cur_typed").fill_null(pl.col("log_idx_cur_all")).alias("_log_index_current"),
|
||||
)
|
||||
else:
|
||||
df = df.join(
|
||||
index.select("sector", "year", pl.col("log_index").alias("_log_index_sale")),
|
||||
left_on=["_sector", "_sale_year"],
|
||||
right_on=["sector", "year"],
|
||||
how="left",
|
||||
)
|
||||
index_current = (
|
||||
index.filter(pl.col("year") == CURRENT_YEAR)
|
||||
.select("sector", pl.col("log_index").alias("_log_index_current"))
|
||||
)
|
||||
df = df.join(index_current, left_on="_sector", right_on="sector", how="left")
|
||||
|
||||
# Compute estimate — only for rows with a known price
|
||||
df = df.with_columns(
|
||||
pl.when(has_price)
|
||||
.then(
|
||||
pl.col("Last known price").cast(pl.Float64)
|
||||
* (pl.col("_log_index_current") - pl.col("_log_index_sale")).exp()
|
||||
)
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
n_adjusted = df.filter(pl.col("log_index_sale").is_not_null()).height
|
||||
print(f" {n_adjusted:,} properties adjusted by index ({n_adjusted / len(df) * 100:.1f}%)")
|
||||
n_adjusted = df.filter(
|
||||
has_price & pl.col("_log_index_sale").is_not_null()
|
||||
).height
|
||||
n_with_price = df.filter(has_price).height
|
||||
print(f" {n_adjusted:,} of {n_with_price:,} properties adjusted by index ({n_adjusted / max(n_with_price, 1) * 100:.1f}%)")
|
||||
|
||||
# Select output columns
|
||||
output = df.select(
|
||||
"Postcode",
|
||||
"Address per Property Register",
|
||||
pl.col("Last known price").alias("last_price"),
|
||||
"sale_year",
|
||||
"sector",
|
||||
"estimated_price",
|
||||
)
|
||||
# Drop all temporary columns
|
||||
temp_cols = [c for c in df.columns if c.startswith("_") or c.startswith("log_idx_")]
|
||||
df = df.drop(temp_cols)
|
||||
|
||||
output.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {len(output):,} rows")
|
||||
df.write_parquet(args.input)
|
||||
size_mb = args.input.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.input} ({size_mb:.1f} MB)")
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns (including 'Estimated current price')")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
"""Stage 1: Repeat-Sales Price Index
|
||||
"""Repeat-Sales Price Index (improved)
|
||||
|
||||
Builds a hierarchical Case-Shiller repeat-sales price index from historical
|
||||
transaction data. Solves WLS regression per postcode sector, district, area,
|
||||
and nationally, then applies Bayesian shrinkage toward parent geographies.
|
||||
Builds a hierarchical repeat-sales price index with:
|
||||
1. Stratification by property type (Detached/Semi-Detached/Terraced/Flats)
|
||||
2. Robust regression (IRLS with Huber weights) instead of hard outlier cutoff
|
||||
3. National hedonic time-dummy model as ultimate shrinkage fallback
|
||||
4. Spatial smoothing for sparse sectors via KD-tree nearest neighbors
|
||||
|
||||
Output: price_index.parquet with columns: sector, year, log_index, n_pairs
|
||||
Output: price_index.parquet — sector × type_group × year → log_index
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -14,31 +16,74 @@ import numpy as np
|
|||
import polars as pl
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.sparse.linalg import lsqr
|
||||
from scipy.spatial import KDTree
|
||||
from tqdm import tqdm
|
||||
|
||||
MIN_PAIRS = 5 # minimum pairs to compute an index
|
||||
SHRINKAGE_K = 50 # shrinkage parameter: higher = more shrinkage toward parent
|
||||
OUTLIER_THRESHOLD = 2.5 # |log_ratio| > this → drop (>12x price change)
|
||||
# --- Constants ---
|
||||
MIN_PAIRS = 5
|
||||
SHRINKAGE_K = 50
|
||||
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
|
||||
HUBER_K = 1.345
|
||||
IRLS_ITERATIONS = 5
|
||||
SPATIAL_NEIGHBORS = 5
|
||||
SPATIAL_BLEND_K = 30
|
||||
CURRENT_YEAR = 2025
|
||||
|
||||
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
|
||||
AGE_LABELS = ["pre-1900", "1900-1929", "1930-1949", "1950-1966", "1967-1982", "1983-1999", "2000-2009", "2010+"]
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
"""Polars expression: Property type → type_group."""
|
||||
return (
|
||||
pl.when(pl.col("Property type").is_in(TERRACE_TYPES)).then(pl.lit("Terraced"))
|
||||
.when(pl.col("Property type") == "Flats/Maisonettes").then(pl.lit("Flats"))
|
||||
.when(pl.col("Property type").is_in(["Detached", "Semi-Detached"])).then(pl.col("Property type"))
|
||||
.otherwise(pl.lit(None))
|
||||
.alias("type_group")
|
||||
)
|
||||
|
||||
|
||||
def age_band_expr():
|
||||
"""Polars expression: Construction age (UInt16 year) → age band string."""
|
||||
expr = pl.when(pl.col("Construction age").is_null()).then(pl.lit(None))
|
||||
for i, brk in enumerate(AGE_BREAKS):
|
||||
expr = expr.when(pl.col("Construction age") < brk).then(pl.lit(AGE_LABELS[i]))
|
||||
return expr.otherwise(pl.lit(AGE_LABELS[-1])).alias("age_band")
|
||||
|
||||
|
||||
def sector_expr():
|
||||
"""Polars expression: Postcode → sector (drop last 2 chars, strip)."""
|
||||
return pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector")
|
||||
|
||||
|
||||
def hierarchy_keys(sector: str) -> tuple[str, str]:
|
||||
"""Return (district, area) for a sector string."""
|
||||
district = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
area = ""
|
||||
for ch in district:
|
||||
if ch.isalpha():
|
||||
area += ch
|
||||
else:
|
||||
break
|
||||
return district, area
|
||||
|
||||
|
||||
# --- Pair extraction ---
|
||||
|
||||
def extract_pairs(input_path: Path) -> pl.DataFrame:
|
||||
"""Extract consecutive sale pairs from historical_prices."""
|
||||
print("Loading historical prices...")
|
||||
print("Extracting repeat-sale pairs...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices")
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Postcode").str.slice(0, pl.col("Postcode").str.len_chars() - 2).str.strip_chars().alias("sector"),
|
||||
)
|
||||
.select("Postcode", "historical_prices", "Property type")
|
||||
.filter(pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2)
|
||||
.with_columns(sector_expr(), type_group_expr())
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} properties with 2+ transactions")
|
||||
|
||||
print("Extracting consecutive pairs...")
|
||||
pairs = (
|
||||
df.lazy()
|
||||
.with_columns(
|
||||
|
|
@ -52,12 +97,8 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
|
|||
pl.col("to_txn").struct.field("year").alias("year2"),
|
||||
pl.col("to_txn").struct.field("price").alias("price2"),
|
||||
)
|
||||
.select("sector", "year1", "price1", "year2", "price2")
|
||||
.filter(
|
||||
pl.col("price1") > 0,
|
||||
pl.col("price2") > 0,
|
||||
pl.col("year2") > pl.col("year1"),
|
||||
)
|
||||
.select("sector", "type_group", "year1", "price1", "year2", "price2")
|
||||
.filter(pl.col("price1") > 0, pl.col("price2") > 0, pl.col("year2") > pl.col("year1"))
|
||||
.with_columns(
|
||||
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)).log().alias("log_ratio"),
|
||||
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias("weight"),
|
||||
|
|
@ -65,207 +106,428 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
|
|||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(pairs):,} consecutive pairs extracted")
|
||||
|
||||
# Add hierarchy columns
|
||||
pairs = pairs.with_columns(
|
||||
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
|
||||
).with_columns(
|
||||
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
|
||||
)
|
||||
|
||||
print(f" {len(pairs):,} pairs extracted")
|
||||
return pairs
|
||||
|
||||
|
||||
def solve_wls_index(years1: np.ndarray, years2: np.ndarray, log_ratios: np.ndarray, weights: np.ndarray) -> dict[int, float]:
|
||||
"""Solve WLS repeat-sales regression for a set of pairs.
|
||||
# --- Sector centroids ---
|
||||
|
||||
Model: log(P2/P1) = beta[year2] - beta[year1], weighted by 1/sqrt(gap).
|
||||
Pin beta[min_year] = 0.
|
||||
Returns dict mapping year -> log_index (cumulative).
|
||||
"""
|
||||
if len(years1) < MIN_PAIRS:
|
||||
def extract_centroids(input_path: Path) -> dict[str, tuple[float, float]]:
|
||||
print("Computing sector centroids...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "lat", "lon")
|
||||
.filter(pl.col("Postcode").is_not_null(), pl.col("lat").is_not_null())
|
||||
.with_columns(sector_expr())
|
||||
.group_by("sector")
|
||||
.agg(pl.col("lat").mean(), pl.col("lon").mean())
|
||||
.collect()
|
||||
)
|
||||
centroids = {}
|
||||
for row in df.iter_rows(named=True):
|
||||
centroids[row["sector"]] = (row["lat"], row["lon"])
|
||||
print(f" {len(centroids):,} sector centroids")
|
||||
return centroids
|
||||
|
||||
|
||||
# --- Robust IRLS solver ---
|
||||
|
||||
def solve_robust_index(
|
||||
years1: np.ndarray, years2: np.ndarray,
|
||||
log_ratios: np.ndarray, base_weights: np.ndarray,
|
||||
) -> dict[int, float]:
|
||||
"""IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
|
||||
n = len(years1)
|
||||
if n < MIN_PAIRS:
|
||||
return {}
|
||||
|
||||
all_years = np.union1d(years1, years2)
|
||||
min_year = int(all_years.min())
|
||||
# Map years to column indices, skipping min_year (pinned to 0)
|
||||
|
||||
col = 0
|
||||
year_to_col = {}
|
||||
for y in all_years:
|
||||
if int(y) != min_year:
|
||||
year_to_col[int(y)] = col
|
||||
iy = int(y)
|
||||
if iy != min_year:
|
||||
year_to_col[iy] = col
|
||||
col += 1
|
||||
n_cols = len(year_to_col)
|
||||
if n_cols == 0:
|
||||
return {}
|
||||
|
||||
n_rows = len(years1)
|
||||
row_idx = []
|
||||
col_idx = []
|
||||
data = []
|
||||
# Vectorized column index mapping
|
||||
col2 = np.full(n, -1, dtype=np.int32)
|
||||
col1 = np.full(n, -1, dtype=np.int32)
|
||||
for year, c in year_to_col.items():
|
||||
col2[years2 == year] = c
|
||||
col1[years1 == year] = c
|
||||
|
||||
for i in range(n_rows):
|
||||
y1, y2 = int(years1[i]), int(years2[i])
|
||||
if y2 in year_to_col:
|
||||
row_idx.append(i)
|
||||
col_idx.append(year_to_col[y2])
|
||||
data.append(weights[i])
|
||||
if y1 in year_to_col:
|
||||
row_idx.append(i)
|
||||
col_idx.append(year_to_col[y1])
|
||||
data.append(-weights[i])
|
||||
# Sparse matrix structure (fixed across iterations)
|
||||
mask2 = col2 >= 0
|
||||
mask1 = col1 >= 0
|
||||
rows_arr = np.concatenate([np.where(mask2)[0], np.where(mask1)[0]])
|
||||
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
|
||||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
A = csc_matrix((data, (row_idx, col_idx)), shape=(n_rows, n_cols))
|
||||
b = log_ratios * weights
|
||||
weights = base_weights.copy()
|
||||
|
||||
result = lsqr(A, b, atol=1e-10, btol=1e-10)
|
||||
betas = result[0]
|
||||
for _ in range(IRLS_ITERATIONS):
|
||||
data = signs_arr * weights[rows_arr]
|
||||
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
|
||||
b = log_ratios * weights
|
||||
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
|
||||
|
||||
# Residuals
|
||||
predicted = np.zeros(n)
|
||||
predicted[mask2] += betas[col2[mask2]]
|
||||
predicted[mask1] -= betas[col1[mask1]]
|
||||
residuals = log_ratios - predicted
|
||||
|
||||
# Huber reweighting
|
||||
abs_r = np.abs(residuals)
|
||||
huber_w = np.where(abs_r <= HUBER_K, 1.0, HUBER_K / np.maximum(abs_r, 1e-10))
|
||||
weights = base_weights * huber_w
|
||||
|
||||
index = {min_year: 0.0}
|
||||
for year, col in year_to_col.items():
|
||||
index[year] = float(betas[col])
|
||||
for year, c in year_to_col.items():
|
||||
index[year] = float(betas[c])
|
||||
return index
|
||||
|
||||
|
||||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str) -> dict[str, dict[int, float]]:
|
||||
"""Compute raw indices for each geographic group."""
|
||||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
||||
"""Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
|
||||
groups = pairs.group_by(group_col).agg(
|
||||
pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"),
|
||||
)
|
||||
|
||||
indices = {}
|
||||
n_pairs_map = {}
|
||||
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" Solving {group_col}"):
|
||||
n_pairs = {}
|
||||
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"):
|
||||
key = row[group_col]
|
||||
y1 = np.array(row["year1"], dtype=np.int32)
|
||||
y2 = np.array(row["year2"], dtype=np.int32)
|
||||
lr = np.array(row["log_ratio"], dtype=np.float64)
|
||||
w = np.array(row["weight"], dtype=np.float64)
|
||||
idx = solve_wls_index(y1, y2, lr, w)
|
||||
idx = solve_robust_index(y1, y2, lr, w)
|
||||
if idx:
|
||||
indices[key] = idx
|
||||
n_pairs_map[key] = len(y1)
|
||||
return indices, n_pairs_map
|
||||
n_pairs[key] = len(y1)
|
||||
return indices, n_pairs
|
||||
|
||||
|
||||
def shrink_index(raw: dict[int, float], parent: dict[int, float], n_pairs: int) -> dict[int, float]:
|
||||
"""Bayesian shrinkage toward parent index."""
|
||||
w = n_pairs / (n_pairs + SHRINKAGE_K)
|
||||
# --- Hedonic model ---
|
||||
|
||||
def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dict[int, float]:
|
||||
"""Two-step hedonic index: regress log(price) on features, average residual by year."""
|
||||
print("Computing hedonic index...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select(
|
||||
"Last known price", "Date of last transaction", "Property type",
|
||||
"Total floor area (sqm)", "Current energy rating",
|
||||
"Number of bedrooms & living rooms", "Construction age",
|
||||
)
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Total floor area (sqm)").is_not_null(),
|
||||
pl.col("Total floor area (sqm)") > 0,
|
||||
pl.col("Current energy rating").is_in(["A", "B", "C", "D", "E", "F", "G"]),
|
||||
pl.col("Number of bedrooms & living rooms").is_not_null(),
|
||||
pl.col("Construction age").is_not_null(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||||
type_group_expr(),
|
||||
age_band_expr(),
|
||||
)
|
||||
.filter(
|
||||
pl.col("type_group").is_not_null(),
|
||||
pl.col("sale_year").is_not_null(),
|
||||
pl.col("sale_year") >= min_year,
|
||||
pl.col("sale_year") <= max_year,
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
print(f" {len(df):,} complete cases for hedonic model")
|
||||
|
||||
# Target
|
||||
log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
|
||||
sale_years = df["sale_year"].to_numpy()
|
||||
|
||||
# Build feature matrix
|
||||
parts = []
|
||||
# log(floor_area)
|
||||
fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
|
||||
parts.append(np.log(np.maximum(fa, 1.0)).reshape(-1, 1))
|
||||
# Type dummies (ref: Detached)
|
||||
tg = df["type_group"].to_numpy()
|
||||
for t in ["Terraced", "Semi-Detached", "Flats"]:
|
||||
parts.append((tg == t).astype(np.float32).reshape(-1, 1))
|
||||
# EPC dummies (ref: D)
|
||||
epc = df["Current energy rating"].to_numpy()
|
||||
for r in ["A", "B", "C", "E", "F", "G"]:
|
||||
parts.append((epc == r).astype(np.float32).reshape(-1, 1))
|
||||
# Rooms
|
||||
parts.append(df["Number of bedrooms & living rooms"].to_numpy().astype(np.float32).reshape(-1, 1))
|
||||
# Age band dummies (ref: pre-1900)
|
||||
ab = df["age_band"].to_numpy()
|
||||
for band in AGE_LABELS[1:]:
|
||||
parts.append((ab == band).astype(np.float32).reshape(-1, 1))
|
||||
# Intercept
|
||||
parts.append(np.ones((len(df), 1), dtype=np.float32))
|
||||
|
||||
F = np.hstack(parts)
|
||||
print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")
|
||||
|
||||
# Step 1: regress log(price) on features → quality score
|
||||
betas = np.linalg.lstsq(F.astype(np.float64), log_price, rcond=None)[0]
|
||||
quality_score = F.astype(np.float64) @ betas
|
||||
residuals = log_price - quality_score
|
||||
|
||||
# Step 2: average residual by year = hedonic index
|
||||
hedonic = {}
|
||||
for y in range(min_year, max_year + 1):
|
||||
mask = sale_years == y
|
||||
if mask.sum() > 0:
|
||||
hedonic[y] = float(np.mean(residuals[mask]))
|
||||
|
||||
# Normalize: min_year = 0
|
||||
base = hedonic.get(min_year, 0.0)
|
||||
for y in hedonic:
|
||||
hedonic[y] -= base
|
||||
|
||||
print(f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}")
|
||||
return hedonic
|
||||
|
||||
|
||||
# --- Shrinkage ---
|
||||
|
||||
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
|
||||
w = n_pairs / (n_pairs + k)
|
||||
result = {}
|
||||
all_years = set(raw.keys()) | set(parent.keys())
|
||||
for y in all_years:
|
||||
raw_val = raw.get(y, parent.get(y, 0.0))
|
||||
parent_val = parent.get(y, raw.get(y, 0.0))
|
||||
result[y] = w * raw_val + (1 - w) * parent_val
|
||||
for y in set(raw) | set(parent):
|
||||
r = raw.get(y, parent.get(y, 0.0))
|
||||
p = parent.get(y, raw.get(y, 0.0))
|
||||
result[y] = w * r + (1 - w) * p
|
||||
return result
|
||||
|
||||
|
||||
def forward_fill_index(index: dict[int, float], min_year: int, max_year: int) -> dict[int, float]:
|
||||
"""Forward-fill missing years so index is continuous."""
|
||||
def apply_shrinkage(
|
||||
sector_idx, sector_n, district_idx, district_n,
|
||||
area_idx, area_n, national_idx, national_n,
|
||||
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
|
||||
):
|
||||
"""Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
|
||||
# National → hedonic
|
||||
national_shrunk = shrink_index(national_idx, hedonic_idx, national_n)
|
||||
|
||||
# Area → national
|
||||
area_shrunk = {}
|
||||
for area, idx in area_idx.items():
|
||||
area_shrunk[area] = shrink_index(idx, national_shrunk, area_n[area])
|
||||
|
||||
# District → area
|
||||
district_shrunk = {}
|
||||
for dist, idx in district_idx.items():
|
||||
a = dist_to_area.get(dist, "")
|
||||
parent = area_shrunk.get(a, national_shrunk)
|
||||
district_shrunk[dist] = shrink_index(idx, parent, district_n[dist])
|
||||
|
||||
# Sector → district
|
||||
sector_shrunk = {}
|
||||
for sec, idx in sector_idx.items():
|
||||
d = sector_to_dist.get(sec, "")
|
||||
parent = district_shrunk.get(d, national_shrunk)
|
||||
sector_shrunk[sec] = shrink_index(idx, parent, sector_n[sec])
|
||||
|
||||
# Fill sectors without their own index
|
||||
for sec in all_sectors:
|
||||
if sec not in sector_shrunk:
|
||||
d = sector_to_dist.get(sec, "")
|
||||
a = dist_to_area.get(d, "")
|
||||
sector_shrunk[sec] = district_shrunk.get(
|
||||
d, area_shrunk.get(a, national_shrunk)
|
||||
)
|
||||
|
||||
return sector_shrunk
|
||||
|
||||
|
||||
# --- Spatial smoothing ---
|
||||
|
||||
def spatial_smooth(
|
||||
sector_indices: dict, centroids: dict, n_pairs_map: dict,
|
||||
) -> dict:
|
||||
"""Blend sparse sector indices with K nearest neighbors."""
|
||||
# Build coordinate arrays for sectors with centroids
|
||||
sectors_with_coords = [s for s in sector_indices if s in centroids]
|
||||
if len(sectors_with_coords) < SPATIAL_NEIGHBORS + 1:
|
||||
return sector_indices
|
||||
|
||||
coords = np.array([centroids[s] for s in sectors_with_coords])
|
||||
# Scale longitude by cos(mean_lat) for approximate Euclidean distance
|
||||
mean_lat = np.mean(coords[:, 0])
|
||||
scale = np.cos(np.radians(mean_lat))
|
||||
scaled_coords = np.column_stack([coords[:, 0], coords[:, 1] * scale])
|
||||
tree = KDTree(scaled_coords)
|
||||
|
||||
result = dict(sector_indices)
|
||||
for i, sec in enumerate(sectors_with_coords):
|
||||
n = n_pairs_map.get(sec, 0)
|
||||
self_w = n / (n + SPATIAL_BLEND_K)
|
||||
if self_w > 0.95:
|
||||
continue # enough data, skip smoothing
|
||||
|
||||
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
|
||||
# Skip self (index 0, distance ~0)
|
||||
neighbor_dists = dists[1:]
|
||||
neighbor_idxs = idxs[1:]
|
||||
|
||||
inv_dists = []
|
||||
neighbor_indices = []
|
||||
for d, j in zip(neighbor_dists, neighbor_idxs):
|
||||
ns = sectors_with_coords[j]
|
||||
if d > 0 and ns in sector_indices:
|
||||
inv_dists.append(1.0 / d)
|
||||
neighbor_indices.append(sector_indices[ns])
|
||||
|
||||
if not neighbor_indices:
|
||||
continue
|
||||
|
||||
total_inv = sum(inv_dists)
|
||||
nbr_w = 1.0 - self_w
|
||||
ws = [iw / total_inv * nbr_w for iw in inv_dists]
|
||||
|
||||
blended = {}
|
||||
all_years = set(sector_indices[sec])
|
||||
for ni in neighbor_indices:
|
||||
all_years |= set(ni)
|
||||
for y in all_years:
|
||||
val = self_w * sector_indices[sec].get(y, 0.0)
|
||||
for ni, w in zip(neighbor_indices, ws):
|
||||
val += w * ni.get(y, 0.0)
|
||||
blended[y] = val
|
||||
result[sec] = blended
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# --- Forward fill ---
|
||||
|
||||
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
||||
filled = {}
|
||||
last_val = 0.0
|
||||
last = 0.0
|
||||
for y in range(min_year, max_year + 1):
|
||||
if y in index:
|
||||
last_val = index[y]
|
||||
filled[y] = last_val
|
||||
last = index[y]
|
||||
filled[y] = last
|
||||
return filled
|
||||
|
||||
|
||||
# --- Main ---
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Build repeat-sales price index")
|
||||
parser.add_argument("--input", type=Path, required=True, help="Path to wide.parquet")
|
||||
parser.add_argument("--output", type=Path, required=True, help="Output price_index.parquet")
|
||||
parser = argparse.ArgumentParser(description="Build improved repeat-sales price index")
|
||||
parser.add_argument("--input", type=Path, required=True)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
pairs = extract_pairs(args.input)
|
||||
centroids = extract_centroids(args.input)
|
||||
|
||||
# Derive geographic hierarchy columns
|
||||
pairs = pairs.with_columns(
|
||||
# district = sector minus trailing digit(s), e.g. "SW1A 1" -> "SW1A"
|
||||
pl.col("sector").str.replace(r"\s+\d+$", "").alias("district"),
|
||||
).with_columns(
|
||||
# area = leading letters only, e.g. "SW1A" -> "SW"
|
||||
pl.col("district").str.replace(r"\d.*$", "").alias("area"),
|
||||
)
|
||||
|
||||
# Solve indices at each level
|
||||
print("\nComputing national index...")
|
||||
pairs_np = pairs.select("year1", "year2", "log_ratio", "weight")
|
||||
national_idx = solve_wls_index(
|
||||
pairs_np["year1"].to_numpy(),
|
||||
pairs_np["year2"].to_numpy(),
|
||||
pairs_np["log_ratio"].to_numpy(),
|
||||
pairs_np["weight"].to_numpy(),
|
||||
)
|
||||
print(f" National index: {len(national_idx)} years")
|
||||
|
||||
print("\nComputing area indices...")
|
||||
area_indices, area_pairs = compute_indices_for_level(pairs, "area")
|
||||
print(f" {len(area_indices)} areas with indices")
|
||||
|
||||
print("\nComputing district indices...")
|
||||
district_indices, district_pairs = compute_indices_for_level(pairs, "district")
|
||||
print(f" {len(district_indices)} districts with indices")
|
||||
|
||||
print("\nComputing sector indices...")
|
||||
sector_indices, sector_pairs = compute_indices_for_level(pairs, "sector")
|
||||
print(f" {len(sector_indices)} sectors with indices")
|
||||
|
||||
# Shrink area -> national
|
||||
print("\nApplying hierarchical shrinkage...")
|
||||
for area, idx in tqdm(area_indices.items(), desc=" Area shrinkage"):
|
||||
area_indices[area] = shrink_index(idx, national_idx, area_pairs[area])
|
||||
|
||||
# Shrink district -> area
|
||||
for dist, idx in tqdm(district_indices.items(), desc=" District shrinkage"):
|
||||
area = dist.replace(r"\d.*$", "")
|
||||
# Extract area from district (leading letters)
|
||||
area_key = ""
|
||||
for ch in dist:
|
||||
if ch.isalpha():
|
||||
area_key += ch
|
||||
else:
|
||||
break
|
||||
parent = area_indices.get(area_key, national_idx)
|
||||
district_indices[dist] = shrink_index(idx, parent, district_pairs[dist])
|
||||
|
||||
# Shrink sector -> district
|
||||
for sector, idx in tqdm(sector_indices.items(), desc=" Sector shrinkage"):
|
||||
# District = sector minus trailing space+digit
|
||||
dist_key = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
parent = district_indices.get(dist_key, national_idx)
|
||||
sector_indices[sector] = shrink_index(idx, parent, sector_pairs[sector])
|
||||
|
||||
# For sectors without enough data, fall back to district/area/national
|
||||
all_sectors = pairs["sector"].unique().to_list()
|
||||
min_year = int(pairs["year1"].min())
|
||||
max_year = max(int(pairs["year2"].max()), 2025)
|
||||
max_year = max(int(pairs["year2"].max()), CURRENT_YEAR)
|
||||
|
||||
print(f"\nFilling gaps and forward-filling ({min_year}-{max_year})...")
|
||||
hedonic_idx = compute_hedonic_index(args.input, min_year, max_year)
|
||||
|
||||
# Precompute hierarchy
|
||||
all_sectors = pairs["sector"].unique().to_list()
|
||||
sector_to_dist = {}
|
||||
dist_to_area = {}
|
||||
for s in all_sectors:
|
||||
d, a = hierarchy_keys(s)
|
||||
sector_to_dist[s] = d
|
||||
dist_to_area[d] = a
|
||||
|
||||
# Process each type group + "All"
|
||||
all_type_groups = ["All"] + TYPE_GROUPS
|
||||
final = {} # {type_group: {sector: {year: log_index}}}
|
||||
final_n = {} # {type_group: {sector: n_pairs}}
|
||||
|
||||
for tg in all_type_groups:
|
||||
print(f"\n--- {tg} ---")
|
||||
typed = pairs if tg == "All" else pairs.filter(pl.col("type_group") == tg)
|
||||
if len(typed) < MIN_PAIRS:
|
||||
print(f" Skipping (only {len(typed)} pairs)")
|
||||
final[tg] = {s: dict(hedonic_idx) for s in all_sectors}
|
||||
final_n[tg] = {s: 0 for s in all_sectors}
|
||||
continue
|
||||
|
||||
print(f" {len(typed):,} pairs")
|
||||
|
||||
# National
|
||||
np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
|
||||
national_idx = solve_robust_index(
|
||||
np_arrs["year1"].to_numpy(), np_arrs["year2"].to_numpy(),
|
||||
np_arrs["log_ratio"].to_numpy(), np_arrs["weight"].to_numpy(),
|
||||
)
|
||||
national_n = len(typed)
|
||||
print(f" National: {len(national_idx)} years")
|
||||
|
||||
# Area, district, sector
|
||||
print(" Computing per-level indices:")
|
||||
area_idx, area_n = compute_indices_for_level(typed, "area")
|
||||
district_idx, district_n = compute_indices_for_level(typed, "district")
|
||||
sector_idx, sector_n = compute_indices_for_level(typed, "sector")
|
||||
print(f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors")
|
||||
|
||||
# Shrinkage
|
||||
print(" Applying shrinkage...")
|
||||
sector_shrunk = apply_shrinkage(
|
||||
sector_idx, sector_n, district_idx, district_n,
|
||||
area_idx, area_n, national_idx, national_n,
|
||||
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
|
||||
)
|
||||
|
||||
# Spatial smoothing
|
||||
print(" Spatial smoothing...")
|
||||
sector_smoothed = spatial_smooth(sector_shrunk, centroids, sector_n)
|
||||
|
||||
# Forward fill
|
||||
for sec in all_sectors:
|
||||
sector_smoothed[sec] = forward_fill(
|
||||
sector_smoothed.get(sec, hedonic_idx), min_year, max_year
|
||||
)
|
||||
|
||||
final[tg] = sector_smoothed
|
||||
final_n[tg] = sector_n
|
||||
|
||||
# Assemble output
|
||||
print("\nAssembling output...")
|
||||
rows = []
|
||||
for sector in tqdm(all_sectors, desc=" Forward-fill"):
|
||||
if sector in sector_indices:
|
||||
idx = sector_indices[sector]
|
||||
else:
|
||||
# Fall back to district, area, national
|
||||
dist_key = sector.rsplit(" ", 1)[0] if " " in sector else sector
|
||||
area_key = ""
|
||||
for ch in dist_key:
|
||||
if ch.isalpha():
|
||||
area_key += ch
|
||||
else:
|
||||
break
|
||||
idx = district_indices.get(dist_key, area_indices.get(area_key, national_idx))
|
||||
|
||||
n = sector_pairs.get(sector, 0)
|
||||
filled = forward_fill_index(idx, min_year, max_year)
|
||||
for year, log_idx in filled.items():
|
||||
rows.append((sector, year, log_idx, n))
|
||||
for tg in all_type_groups:
|
||||
for sec in all_sectors:
|
||||
n = final_n[tg].get(sec, 0)
|
||||
for year, log_idx in final[tg][sec].items():
|
||||
rows.append((sec, tg, year, log_idx, n))
|
||||
|
||||
result = pl.DataFrame(
|
||||
rows,
|
||||
schema={"sector": pl.String, "year": pl.Int32, "log_index": pl.Float64, "n_pairs": pl.Int64},
|
||||
schema={"sector": pl.String, "type_group": pl.String, "year": pl.Int32,
|
||||
"log_index": pl.Float64, "n_pairs": pl.Int64},
|
||||
orient="row",
|
||||
)
|
||||
).sort("type_group", "sector", "year")
|
||||
|
||||
result = result.sort("sector", "year")
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {result['sector'].n_unique():,} sectors × {max_year - min_year + 1} years = {len(result):,} rows")
|
||||
print(f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.poi_counts import _count_pois_per_postcode
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
||||
SCHOOL_GROUPS = {
|
||||
"good_primary": ["good_primary"],
|
||||
|
|
@ -60,7 +60,7 @@ def main():
|
|||
# Load all postcodes for proximity counting
|
||||
postcodes = arcgis.rename({"lng": "lon"})
|
||||
|
||||
result = _count_pois_per_postcode(
|
||||
result = count_pois_per_postcode(
|
||||
postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue