Outer join EPC

This commit is contained in:
Andras Schmelczer 2026-02-07 09:49:42 +00:00
parent 6268dbda4d
commit 609dd5278c

View file

@@ -4,8 +4,6 @@ from pathlib import Path
from ..utils import fuzzy_join_on_postcode from ..utils import fuzzy_join_on_postcode
MIN_FLOOR_AREA_M2 = 10
pl.Config.set_tbl_cols(-1) pl.Config.set_tbl_cols(-1)
@@ -121,8 +119,6 @@ def main():
print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)") print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
print(f"Unmatched: {total - matched.height}") print(f"Unmatched: {total - matched.height}")
matched = matched.filter(pl.col("TOTAL_FLOOR_AREA") >= MIN_FLOOR_AREA_M2)
# For new-builds (old_new == "Y"), use the first transaction date year as # For new-builds (old_new == "Y"), use the first transaction date year as
# the exact construction date; otherwise fall back to the EPC age band. # the exact construction date; otherwise fall back to the EPC age band.
epc_band_year = ( epc_band_year = (
@@ -137,7 +133,7 @@ def main():
) )
is_new_build = pl.col("old_new") == "Y" is_new_build = pl.col("old_new") == "Y"
matched = matched.with_columns( joined = joined.with_columns(
pl.when(is_new_build & transfer_year.is_not_null()) pl.when(is_new_build & transfer_year.is_not_null())
.then(transfer_year) .then(transfer_year)
.otherwise(epc_band_year) .otherwise(epc_band_year)
@@ -150,10 +146,10 @@ def main():
.alias("is_construction_date_approximate"), .alias("is_construction_date_approximate"),
).drop("old_new", "first_transfer_date") ).drop("old_new", "first_transfer_date")
matched = matched.rename({col: col.lower() for col in joined.columns}) joined = joined.rename({col: col.lower() for col in joined.columns})
print(matched.head()) print(joined.head())
matched.write_parquet(args.output) joined.write_parquet(args.output)
print(f"Wrote {args.output}") print(f"Wrote {args.output}")