Changes
This commit is contained in:
parent
3a3f899ea2
commit
128b3191e7
68 changed files with 28060 additions and 1152 deletions
|
|
@ -19,66 +19,38 @@ from scipy.sparse.linalg import lsqr
|
|||
from scipy.spatial import KDTree
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.transform._price_utils import (
|
||||
CURRENT_YEAR,
|
||||
SHRINKAGE_K,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
extract_centroids,
|
||||
hierarchy_keys,
|
||||
sector_expr,
|
||||
type_group_expr,
|
||||
)
|
||||
|
||||
# --- Constants ---
|
||||
MIN_PAIRS = 5
|
||||
SHRINKAGE_K = 50
|
||||
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
|
||||
HUBER_K = 1.345
|
||||
IRLS_ITERATIONS = 5
|
||||
SPATIAL_NEIGHBORS = 5
|
||||
SPATIAL_BLEND_K = 30
|
||||
CURRENT_YEAR = 2025
|
||||
|
||||
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
|
||||
TERRACE_TYPES = ["Mid-Terrace", "End-Terrace", "Enclosed Mid-Terrace", "Enclosed End-Terrace"]
|
||||
AGE_BREAKS = [1900, 1930, 1950, 1967, 1983, 2000, 2010]
|
||||
AGE_LABELS = ["pre-1900", "1900-1929", "1930-1949", "1950-1966", "1967-1982", "1983-1999", "2000-2009", "2010+"]
|
||||
|
||||
|
||||
def type_group_expr():
    """Build the polars expression that maps "Property type" to "type_group".

    Every value listed in TERRACE_TYPES becomes "Terraced",
    "Flats/Maisonettes" becomes "Flats", and "Detached" / "Semi-Detached"
    pass through unchanged. Any other property type yields null.
    """
    prop_type = pl.col("Property type")

    is_terrace = prop_type.is_in(TERRACE_TYPES)
    is_flat = prop_type == "Flats/Maisonettes"
    keeps_own_name = prop_type.is_in(["Detached", "Semi-Detached"])

    # Branch order matches the original chain: terraces, flats, pass-through.
    grouped = (
        pl.when(is_terrace)
        .then(pl.lit("Terraced"))
        .when(is_flat)
        .then(pl.lit("Flats"))
        .when(keeps_own_name)
        .then(prop_type)
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")
|
||||
|
||||
|
||||
def age_band_expr():
    """Build the polars expression that maps "Construction age" to "age_band".

    A null construction age stays null. Otherwise each entry of AGE_BREAKS
    is an exclusive upper bound: the first break the value is strictly below
    selects the label at the same position in AGE_LABELS. Values at or above
    the last break receive the final label.
    """
    construction_year = pl.col("Construction age")

    # Nulls are handled first so they never fall through to a labelled band.
    banded = pl.when(construction_year.is_null()).then(pl.lit(None))

    # AGE_LABELS has one more entry than AGE_BREAKS; zip pairs each break
    # with its label and leaves the last label for the fallback branch.
    for upper_bound, label in zip(AGE_BREAKS, AGE_LABELS):
        banded = banded.when(construction_year < upper_bound).then(pl.lit(label))

    open_ended_label = AGE_LABELS[-1]
    return banded.otherwise(pl.lit(open_ended_label)).alias("age_band")
|
||||
|
||||
|
||||
def sector_expr():
    """Build the polars expression that derives "sector" from "Postcode".

    The sector is the postcode with its last two characters removed and any
    surrounding whitespace stripped.
    """
    postcode = pl.col("Postcode")
    kept_chars = postcode.str.len_chars() - 2  # everything except the final two characters
    trimmed = postcode.str.slice(0, kept_chars).str.strip_chars()
    return trimmed.alias("sector")
|
||||
|
||||
|
||||
def hierarchy_keys(sector: str) -> tuple[str, str]:
    """Return ``(district, area)`` for a sector string.

    The district is everything before the last space in ``sector`` (or the
    whole string when it contains no space). The area is the leading run of
    alphabetic characters of the district.

    Args:
        sector: Sector string such as ``"SW1A 1"``.

    Returns:
        A ``(district, area)`` tuple, e.g. ``("SW1A", "SW")``. An empty
        sector yields ``("", "")``.
    """
    # rsplit returns the input unchanged as its only element when no space is
    # present, so no separate "contains a space" check is required.
    district = sector.rsplit(" ", 1)[0]

    # Index of the first non-alphabetic character; the whole district when
    # every character is alphabetic (or the district is empty).
    prefix_len = next(
        (pos for pos, ch in enumerate(district) if not ch.isalpha()),
        len(district),
    )
    return district, district[:prefix_len]
|
||||
|
||||
|
||||
# --- Pair extraction ---
|
||||
|
||||
|
||||
def extract_pairs(input_path: Path) -> pl.DataFrame:
|
||||
print("Extracting repeat-sale pairs...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select("Postcode", "historical_prices", "Property type")
|
||||
.filter(pl.col("Postcode").is_not_null(), pl.col("historical_prices").list.len() >= 2)
|
||||
.filter(
|
||||
pl.col("Postcode").is_not_null(),
|
||||
pl.col("historical_prices").list.len() >= 2,
|
||||
)
|
||||
.with_columns(sector_expr(), type_group_expr())
|
||||
.collect()
|
||||
)
|
||||
|
|
@ -87,7 +59,9 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
|
|||
pairs = (
|
||||
df.lazy()
|
||||
.with_columns(
|
||||
pl.col("historical_prices").list.slice(0, pl.col("historical_prices").list.len() - 1).alias("from_txn"),
|
||||
pl.col("historical_prices")
|
||||
.list.slice(0, pl.col("historical_prices").list.len() - 1)
|
||||
.alias("from_txn"),
|
||||
pl.col("historical_prices").list.slice(1).alias("to_txn"),
|
||||
)
|
||||
.explode("from_txn", "to_txn")
|
||||
|
|
@ -98,10 +72,18 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
|
|||
pl.col("to_txn").struct.field("price").alias("price2"),
|
||||
)
|
||||
.select("sector", "type_group", "year1", "price1", "year2", "price2")
|
||||
.filter(pl.col("price1") > 0, pl.col("price2") > 0, pl.col("year2") > pl.col("year1"))
|
||||
.filter(
|
||||
pl.col("price1") > 0,
|
||||
pl.col("price2") > 0,
|
||||
pl.col("year2") > pl.col("year1"),
|
||||
)
|
||||
.with_columns(
|
||||
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64)).log().alias("log_ratio"),
|
||||
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias("weight"),
|
||||
(pl.col("price2").cast(pl.Float64) / pl.col("price1").cast(pl.Float64))
|
||||
.log()
|
||||
.alias("log_ratio"),
|
||||
(1.0 / (pl.col("year2") - pl.col("year1")).cast(pl.Float64).sqrt()).alias(
|
||||
"weight"
|
||||
),
|
||||
)
|
||||
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
|
||||
.collect()
|
||||
|
|
@ -118,31 +100,14 @@ def extract_pairs(input_path: Path) -> pl.DataFrame:
|
|||
return pairs
|
||||
|
||||
|
||||
# --- Sector centroids ---
|
||||
|
||||
def extract_centroids(input_path: Path) -> dict[str, tuple[float, float]]:
    """Compute the mean (lat, lon) of every postcode sector in a parquet file.

    Rows need a non-null postcode and both coordinates to contribute.
    Requiring ``lon`` as well as ``lat`` keeps the two means computed over
    the same rows and prevents a sector whose longitudes are all missing
    from ending up with a ``None`` coordinate.

    Args:
        input_path: Parquet file with "Postcode", "lat" and "lon" columns.

    Returns:
        Mapping of sector string to its ``(mean_lat, mean_lon)`` centroid.
    """
    print("Computing sector centroids...")
    per_sector = (
        pl.scan_parquet(input_path)
        .select("Postcode", "lat", "lon")
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("lat").is_not_null(),
            pl.col("lon").is_not_null(),
        )
        .with_columns(sector_expr())
        .group_by("sector")
        .agg(pl.col("lat").mean(), pl.col("lon").mean())
        .collect()
    )
    centroids = {
        row["sector"]: (row["lat"], row["lon"])
        for row in per_sector.iter_rows(named=True)
    }
    print(f"  {len(centroids):,} sector centroids")
    return centroids
|
||||
|
||||
|
||||
# --- Robust IRLS solver ---
|
||||
|
||||
|
||||
def solve_robust_index(
|
||||
years1: np.ndarray, years2: np.ndarray,
|
||||
log_ratios: np.ndarray, base_weights: np.ndarray,
|
||||
years1: np.ndarray,
|
||||
years2: np.ndarray,
|
||||
log_ratios: np.ndarray,
|
||||
base_weights: np.ndarray,
|
||||
) -> dict[int, float]:
|
||||
"""IRLS Huber M-estimation for the Case-Shiller repeat-sales model."""
|
||||
n = len(years1)
|
||||
|
|
@ -205,11 +170,16 @@ def solve_robust_index(
|
|||
def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
||||
"""Solve robust indices for each group. Returns (indices, n_pairs) dicts."""
|
||||
groups = pairs.group_by(group_col).agg(
|
||||
pl.col("year1"), pl.col("year2"), pl.col("log_ratio"), pl.col("weight"),
|
||||
pl.col("year1"),
|
||||
pl.col("year2"),
|
||||
pl.col("log_ratio"),
|
||||
pl.col("weight"),
|
||||
)
|
||||
indices = {}
|
||||
n_pairs = {}
|
||||
for row in tqdm(groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"):
|
||||
for row in tqdm(
|
||||
groups.iter_rows(named=True), total=len(groups), desc=f" {group_col}"
|
||||
):
|
||||
key = row[group_col]
|
||||
y1 = np.array(row["year1"], dtype=np.int32)
|
||||
y2 = np.array(row["year2"], dtype=np.int32)
|
||||
|
|
@ -224,28 +194,28 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
|||
|
||||
# --- Hedonic model ---
|
||||
|
||||
def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dict[int, float]:
|
||||
|
||||
def compute_hedonic_index(
|
||||
input_path: Path, min_year: int, max_year: int
|
||||
) -> dict[int, float]:
|
||||
"""Two-step hedonic index: regress log(price) on features, average residual by year."""
|
||||
print("Computing hedonic index...")
|
||||
df = (
|
||||
pl.scan_parquet(input_path)
|
||||
.select(
|
||||
"Last known price", "Date of last transaction", "Property type",
|
||||
"Total floor area (sqm)", "Current energy rating",
|
||||
"Number of bedrooms & living rooms", "Construction age",
|
||||
"Last known price",
|
||||
"Date of last transaction",
|
||||
"Property type",
|
||||
"Total floor area (sqm)",
|
||||
)
|
||||
.filter(
|
||||
pl.col("Last known price").is_not_null(),
|
||||
pl.col("Total floor area (sqm)").is_not_null(),
|
||||
pl.col("Total floor area (sqm)") > 0,
|
||||
pl.col("Current energy rating").is_in(["A", "B", "C", "D", "E", "F", "G"]),
|
||||
pl.col("Number of bedrooms & living rooms").is_not_null(),
|
||||
pl.col("Construction age").is_not_null(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("Date of last transaction").dt.year().alias("sale_year"),
|
||||
type_group_expr(),
|
||||
age_band_expr(),
|
||||
)
|
||||
.filter(
|
||||
pl.col("type_group").is_not_null(),
|
||||
|
|
@ -261,29 +231,9 @@ def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dic
|
|||
log_price = np.log(df["Last known price"].to_numpy().astype(np.float64))
|
||||
sale_years = df["sale_year"].to_numpy()
|
||||
|
||||
# Build feature matrix
|
||||
parts = []
|
||||
# log(floor_area)
|
||||
fa = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
|
||||
parts.append(np.log(np.maximum(fa, 1.0)).reshape(-1, 1))
|
||||
# Type dummies (ref: Detached)
|
||||
tg = df["type_group"].to_numpy()
|
||||
for t in ["Terraced", "Semi-Detached", "Flats"]:
|
||||
parts.append((tg == t).astype(np.float32).reshape(-1, 1))
|
||||
# EPC dummies (ref: D)
|
||||
epc = df["Current energy rating"].to_numpy()
|
||||
for r in ["A", "B", "C", "E", "F", "G"]:
|
||||
parts.append((epc == r).astype(np.float32).reshape(-1, 1))
|
||||
# Rooms
|
||||
parts.append(df["Number of bedrooms & living rooms"].to_numpy().astype(np.float32).reshape(-1, 1))
|
||||
# Age band dummies (ref: pre-1900)
|
||||
ab = df["age_band"].to_numpy()
|
||||
for band in AGE_LABELS[1:]:
|
||||
parts.append((ab == band).astype(np.float32).reshape(-1, 1))
|
||||
# Intercept
|
||||
parts.append(np.ones((len(df), 1), dtype=np.float32))
|
||||
|
||||
F = np.hstack(parts)
|
||||
# Build feature matrix (18 hedonic features + intercept)
|
||||
X = build_hedonic_features(df)
|
||||
F = np.hstack([X, np.ones((len(df), 1), dtype=np.float32)])
|
||||
print(f" Feature matrix: {F.shape[0]:,} × {F.shape[1]}")
|
||||
|
||||
# Step 1: regress log(price) on features → quality score
|
||||
|
|
@ -303,12 +253,15 @@ def compute_hedonic_index(input_path: Path, min_year: int, max_year: int) -> dic
|
|||
for y in hedonic:
|
||||
hedonic[y] -= base
|
||||
|
||||
print(f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}")
|
||||
print(
|
||||
f" Hedonic index: {len(hedonic)} years, range {min(hedonic.values()):.3f} to {max(hedonic.values()):.3f}"
|
||||
)
|
||||
return hedonic
|
||||
|
||||
|
||||
# --- Shrinkage ---
|
||||
|
||||
|
||||
def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) -> dict:
|
||||
w = n_pairs / (n_pairs + k)
|
||||
result = {}
|
||||
|
|
@ -320,9 +273,18 @@ def shrink_index(raw: dict, parent: dict, n_pairs: int, k: int = SHRINKAGE_K) ->
|
|||
|
||||
|
||||
def apply_shrinkage(
|
||||
sector_idx, sector_n, district_idx, district_n,
|
||||
area_idx, area_n, national_idx, national_n,
|
||||
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
|
||||
sector_idx,
|
||||
sector_n,
|
||||
district_idx,
|
||||
district_n,
|
||||
area_idx,
|
||||
area_n,
|
||||
national_idx,
|
||||
national_n,
|
||||
hedonic_idx,
|
||||
all_sectors,
|
||||
sector_to_dist,
|
||||
dist_to_area,
|
||||
):
|
||||
"""Top-down hierarchical shrinkage: national→hedonic, area→national, etc."""
|
||||
# National → hedonic
|
||||
|
|
@ -361,8 +323,11 @@ def apply_shrinkage(
|
|||
|
||||
# --- Spatial smoothing ---
|
||||
|
||||
|
||||
def spatial_smooth(
|
||||
sector_indices: dict, centroids: dict, n_pairs_map: dict,
|
||||
sector_indices: dict,
|
||||
centroids: dict,
|
||||
n_pairs_map: dict,
|
||||
) -> dict:
|
||||
"""Blend sparse sector indices with K nearest neighbors."""
|
||||
# Build coordinate arrays for sectors with centroids
|
||||
|
|
@ -420,6 +385,7 @@ def spatial_smooth(
|
|||
|
||||
# --- Forward fill ---
|
||||
|
||||
|
||||
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
||||
filled = {}
|
||||
last = 0.0
|
||||
|
|
@ -432,8 +398,11 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
|
|||
|
||||
# --- Main ---
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Build improved repeat-sales price index")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Build improved repeat-sales price index"
|
||||
)
|
||||
parser.add_argument("--input", type=Path, required=True)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
args = parser.parse_args()
|
||||
|
|
@ -474,8 +443,10 @@ def main():
|
|||
# National
|
||||
np_arrs = typed.select("year1", "year2", "log_ratio", "weight")
|
||||
national_idx = solve_robust_index(
|
||||
np_arrs["year1"].to_numpy(), np_arrs["year2"].to_numpy(),
|
||||
np_arrs["log_ratio"].to_numpy(), np_arrs["weight"].to_numpy(),
|
||||
np_arrs["year1"].to_numpy(),
|
||||
np_arrs["year2"].to_numpy(),
|
||||
np_arrs["log_ratio"].to_numpy(),
|
||||
np_arrs["weight"].to_numpy(),
|
||||
)
|
||||
national_n = len(typed)
|
||||
print(f" National: {len(national_idx)} years")
|
||||
|
|
@ -485,14 +456,25 @@ def main():
|
|||
area_idx, area_n = compute_indices_for_level(typed, "area")
|
||||
district_idx, district_n = compute_indices_for_level(typed, "district")
|
||||
sector_idx, sector_n = compute_indices_for_level(typed, "sector")
|
||||
print(f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors")
|
||||
print(
|
||||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||||
)
|
||||
|
||||
# Shrinkage
|
||||
print(" Applying shrinkage...")
|
||||
sector_shrunk = apply_shrinkage(
|
||||
sector_idx, sector_n, district_idx, district_n,
|
||||
area_idx, area_n, national_idx, national_n,
|
||||
hedonic_idx, all_sectors, sector_to_dist, dist_to_area,
|
||||
sector_idx,
|
||||
sector_n,
|
||||
district_idx,
|
||||
district_n,
|
||||
area_idx,
|
||||
area_n,
|
||||
national_idx,
|
||||
national_n,
|
||||
hedonic_idx,
|
||||
all_sectors,
|
||||
sector_to_dist,
|
||||
dist_to_area,
|
||||
)
|
||||
|
||||
# Spatial smoothing
|
||||
|
|
@ -519,15 +501,22 @@ def main():
|
|||
|
||||
result = pl.DataFrame(
|
||||
rows,
|
||||
schema={"sector": pl.String, "type_group": pl.String, "year": pl.Int32,
|
||||
"log_index": pl.Float64, "n_pairs": pl.Int64},
|
||||
schema={
|
||||
"sector": pl.String,
|
||||
"type_group": pl.String,
|
||||
"year": pl.Int32,
|
||||
"log_index": pl.Float64,
|
||||
"n_pairs": pl.Int64,
|
||||
},
|
||||
orient="row",
|
||||
).sort("type_group", "sector", "year")
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"\nWrote {args.output} ({size_mb:.1f} MB)")
|
||||
print(f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows")
|
||||
print(
|
||||
f" {result['sector'].n_unique():,} sectors × {len(all_type_groups)} types × {max_year - min_year + 1} years = {len(result):,} rows"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue