diff --git a/pipeline/transform/join_epc_pp.py b/pipeline/transform/join_epc_pp.py
index a2364dd..7c73e96 100644
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@@ -4,8 +4,6 @@ from pathlib import Path
 
 from ..utils import fuzzy_join_on_postcode
 
-MIN_FLOOR_AREA_M2 = 10
-
 pl.Config.set_tbl_cols(-1)
 
 
@@ -121,8 +119,6 @@ def main():
     print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
     print(f"Unmatched: {total - matched.height}")
 
-    matched = matched.filter(pl.col("TOTAL_FLOOR_AREA") >= MIN_FLOOR_AREA_M2)
-
     # For new-builds (old_new == "Y"), use the first transaction date year as
     # the exact construction date; otherwise fall back to the EPC age band.
     epc_band_year = (
@@ -137,7 +133,7 @@ def main():
     )
 
     is_new_build = pl.col("old_new") == "Y"
-    matched = matched.with_columns(
+    joined = joined.with_columns(
         pl.when(is_new_build & transfer_year.is_not_null())
         .then(transfer_year)
         .otherwise(epc_band_year)
@@ -150,10 +146,10 @@ def main():
         .alias("is_construction_date_approximate"),
     ).drop("old_new", "first_transfer_date")
 
-    matched = matched.rename({col: col.lower() for col in joined.columns})
+    joined = joined.rename({col: col.lower() for col in joined.columns})
 
-    print(matched.head())
-    matched.write_parquet(args.output)
+    print(joined.head())
+    joined.write_parquet(args.output)
     print(f"Wrote {args.output}")
 
 