This commit is contained in:
Andras Schmelczer 2026-03-15 21:22:28 +00:00
parent 479ef92236
commit c38d654ac7
44 changed files with 2526 additions and 701 deletions

View file

@ -94,11 +94,18 @@ def _build(
# Remap terminated postcodes to nearest active successor
postcode_mapping = build_postcode_mapping(arcgis_path)
wide = wide.join(
postcode_mapping.lazy(), left_on="postcode", right_on="old_postcode", how="left"
).with_columns(
pl.coalesce("new_postcode", "postcode").alias("postcode"),
).drop("new_postcode")
wide = (
wide.join(
postcode_mapping.lazy(),
left_on="postcode",
right_on="old_postcode",
how="left",
)
.with_columns(
pl.coalesce("new_postcode", "postcode").alias("postcode"),
)
.drop("new_postcode")
)
arcgis = (
pl.scan_parquet(arcgis_path)
@ -252,16 +259,18 @@ def _build(
.otherwise(pl.col("pp_property_type"))
# Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
# collapse terrace sub-types, and fold rare types into "Other"
.replace({
"Flat": "Flats/Maisonettes",
"Maisonette": "Flats/Maisonettes",
"End-Terrace": "Terraced",
"Mid-Terrace": "Terraced",
"Enclosed End-Terrace": "Terraced",
"Enclosed Mid-Terrace": "Terraced",
"Bungalow": "Other",
"Park home": "Other",
})
.replace(
{
"Flat": "Flats/Maisonettes",
"Maisonette": "Flats/Maisonettes",
"End-Terrace": "Terraced",
"Mid-Terrace": "Terraced",
"Enclosed End-Terrace": "Terraced",
"Enclosed Mid-Terrace": "Terraced",
"Bungalow": "Other",
"Park home": "Other",
}
)
.alias("property_type")
)
@ -426,10 +435,16 @@ def main():
help="Census 2021 population by LSOA parquet file",
)
parser.add_argument(
"--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
"--output-postcodes",
type=Path,
required=True,
help="Output postcode parquet file path",
)
parser.add_argument(
"--output-properties", type=Path, required=True, help="Output properties parquet file path"
"--output-properties",
type=Path,
required=True,
help="Output properties parquet file path",
)
args = parser.parse_args()

View file

@ -454,9 +454,7 @@ class TestFillHoles:
hole1 = [(10, 10), (20, 10), (20, 20), (10, 20), (10, 10)]
outer2 = [(60, 60), (110, 60), (110, 110), (60, 110), (60, 60)]
hole2 = [(70, 70), (80, 70), (80, 80), (70, 80), (70, 70)]
mp = MultiPolygon(
[Polygon(outer1, [hole1]), Polygon(outer2, [hole2])]
)
mp = MultiPolygon([Polygon(outer1, [hole1]), Polygon(outer2, [hole2])])
result = _fill_holes(mp)
assert result.geom_type == "MultiPolygon"
for p in result.geoms:

View file

@ -112,7 +112,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
valid = (
np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
)
actual = actual[valid]
predicted = predicted[valid]
@ -176,7 +178,10 @@ def main():
"--input", type=Path, required=True, help="Path to properties.parquet"
)
parser.add_argument(
"--postcodes", type=Path, required=True, help="Path to postcode.parquet (for lat/lon)"
"--postcodes",
type=Path,
required=True,
help="Path to postcode.parquet (for lat/lon)",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output backtest_results.parquet"
@ -185,7 +190,9 @@ def main():
# Build index from pre-test data only (temporal holdout)
print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
index = build_index(args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes)
index = build_index(
args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes
)
print(
f"\nHoldout index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
f"{index['type_group'].n_unique()} type groups"
@ -233,7 +240,9 @@ def main():
knn_est = knn_psm * fa * temporal_adj
n_knn = int((np.isfinite(knn_est) & (knn_est > 0)).sum())
print(f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)")
print(
f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
)
# Blend: (1-w)*index + w*kNN where both available
index_est = test["predicted"].to_numpy().astype(np.float64)

View file

@ -107,9 +107,7 @@ def main():
pl.when(has_price)
.then(
pl.col("Last known price").cast(pl.Float64)
* (
pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp")
)
* (pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp"))
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
.exp()
)

View file

@ -105,9 +105,7 @@ def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFram
.alias("log_ratio"),
(
1.0
/ (pl.col("frac_year2") - pl.col("frac_year1"))
.cast(pl.Float64)
.sqrt()
/ (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
).alias("weight"),
)
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
@ -453,8 +451,12 @@ def main():
description="Build improved repeat-sales price index"
)
parser.add_argument("--input", type=Path, required=True)
parser.add_argument("--postcodes", type=Path, required=True,
help="Path to postcode.parquet (for lat/lon centroids)")
parser.add_argument(
"--postcodes",
type=Path,
required=True,
help="Path to postcode.parquet (for lat/lon centroids)",
)
parser.add_argument("--output", type=Path, required=True)
args = parser.parse_args()

View file

@ -43,48 +43,39 @@ def build_knn_pool(
"""
print("Building kNN pool...")
lf = pl.scan_parquet(source) if isinstance(source, Path) else source
query = (
lf
.select(
"Postcode",
"Property type",
"lat",
"lon",
"Total floor area (sqm)",
"Last known price",
"Date of last transaction",
)
.filter(
pl.col("lat").is_not_null(),
pl.col("lon").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Last known price").is_not_null(),
pl.col("Last known price") > 0,
pl.col("Postcode").is_not_null(),
pl.col("Date of last transaction").is_not_null(),
)
query = lf.select(
"Postcode",
"Property type",
"lat",
"lon",
"Total floor area (sqm)",
"Last known price",
"Date of last transaction",
).filter(
pl.col("lat").is_not_null(),
pl.col("lon").is_not_null(),
pl.col("Total floor area (sqm)").is_not_null(),
pl.col("Total floor area (sqm)") > 0,
pl.col("Last known price").is_not_null(),
pl.col("Last known price") > 0,
pl.col("Postcode").is_not_null(),
pl.col("Date of last transaction").is_not_null(),
)
if max_sale_year is not None:
query = query.filter(
pl.col("Date of last transaction").dt.year() < max_sale_year
)
pool = (
query.with_columns(
sector_expr(),
type_group_expr(),
(
pl.col("Date of last transaction").dt.year().cast(pl.Float64)
+ (
pl.col("Date of last transaction").dt.month().cast(pl.Float64)
- 1.0
)
/ 12.0
).alias("_sale_fy"),
pl.lit(ref_frac_year).alias("_ref_fy"),
).collect()
)
pool = query.with_columns(
sector_expr(),
type_group_expr(),
(
pl.col("Date of last transaction").dt.year().cast(pl.Float64)
+ (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0)
/ 12.0
).alias("_sale_fy"),
pl.lit(ref_frac_year).alias("_ref_fy"),
).collect()
pool = pool.filter(pl.col("type_group").is_not_null())
print(f" {len(pool):,} pool properties with lat/lon, floor area, price")

View file

@ -1085,7 +1085,9 @@ def transform(
if cat not in all_set:
mapped_but_absent.append(cat)
if mapped_but_absent:
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
print(
f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}"
)
# Drop unwanted categories
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))