From 806213ce3c381b88e98a3f67df7394a82706e097 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sun, 1 Feb 2026 11:05:46 +0000 Subject: [PATCH] Add actual construction age --- pipeline/transform/join_epc_pp.py | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pipeline/transform/join_epc_pp.py b/pipeline/transform/join_epc_pp.py index aa42b4e..630813a 100644 --- a/pipeline/transform/join_epc_pp.py +++ b/pipeline/transform/join_epc_pp.py @@ -4,6 +4,8 @@ from pathlib import Path from ..utils import fuzzy_join_on_postcode +MIN_FLOOR_AREA_M2 = 10 + pl.Config.set_tbl_cols(-1) @@ -69,6 +71,7 @@ def main(): "locality", "town_city", pl.col("duration").replace(duration_map), + "old_new", ) .filter(pl.col("pp_property_type") != "Other") .with_columns( @@ -89,6 +92,8 @@ def main(): pl.col("duration").last(), pl.col("price").last().alias("latest_price"), pl.col("date_of_transfer").last(), + pl.col("date_of_transfer").first().alias("first_transfer_date"), + pl.col("old_new").first(), ) ).filter(pl.col("pp_address").is_not_null()) @@ -116,6 +121,33 @@ def main(): print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)") print(f"Unmatched: {total - matched.height}") + matched = matched.filter(pl.col("TOTAL_FLOOR_AREA") >= MIN_FLOOR_AREA_M2) + + # For new-builds (old_new == "Y"), use the first transaction date year as + # the exact construction date; otherwise fall back to the EPC age band. + epc_band_year = ( + pl.col("CONSTRUCTION_AGE_BAND") + .str.replace("England and Wales: ", "") + .str.replace(" onwards", "") + .str.extract(r"(\d{4})", 1) + .cast(pl.UInt16, strict=False) + ) + transfer_year = pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False) + is_new_build = pl.col("old_new") == "Y" + + matched = matched.with_columns( + pl.when(is_new_build & transfer_year.is_not_null()) + .then(transfer_year) + .otherwise(epc_band_year) + .alias("CONSTRUCTION_AGE_BAND"), + pl.when(is_new_build & transfer_year.is_not_null()) + .then(pl.lit(0, dtype=pl.UInt8)) + .when(epc_band_year.is_not_null()) + .then(pl.lit(1, dtype=pl.UInt8)) + .otherwise(pl.lit(None, dtype=pl.UInt8)) + .alias("is_construction_date_approximate"), + ).drop("old_new", "first_transfer_date") + matched = matched.rename({col: col.lower() for col in joined.columns}) print(matched.head())