Can't even keep track anymore
This commit is contained in:
parent
dccc1e439d
commit
3a3f899ea2
50 changed files with 1144 additions and 560 deletions
|
|
@ -6,6 +6,8 @@ from ..utils import fuzzy_join_on_postcode
|
|||
|
||||
pl.Config.set_tbl_cols(-1)
|
||||
|
||||
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
|
||||
|
|
@ -20,7 +22,7 @@ def main():
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
epc = (
|
||||
epc_base = (
|
||||
pl.scan_csv(args.epc)
|
||||
.select(
|
||||
pl.col("ADDRESS").alias("epc_address"),
|
||||
|
|
@ -42,11 +44,90 @@ def main():
|
|||
.otherwise(pl.col("NUMBER_HABITABLE_ROOMS"))
|
||||
.alias("NUMBER_HABITABLE_ROOMS"),
|
||||
)
|
||||
.sort("INSPECTION_DATE", descending=True)
|
||||
)
|
||||
|
||||
# Dedup fork: keep latest certificate per property (existing logic)
|
||||
epc = (
|
||||
epc_base.sort("INSPECTION_DATE", descending=True)
|
||||
.group_by("epc_address", "POSTCODE")
|
||||
.first()
|
||||
)
|
||||
|
||||
# Events fork: detect renovation events between consecutive certificates
|
||||
# Collect eagerly here: .over() window functions are not supported by the streaming
|
||||
# engine, which the sink_parquet call at fuzzy_join.py:50 requires.
|
||||
events = (
|
||||
epc_base.sort("INSPECTION_DATE")
|
||||
.with_columns(
|
||||
pl.col("CURRENT_ENERGY_RATING")
|
||||
.replace_strict(RATING_RANK, default=None, return_dtype=pl.Int32)
|
||||
.alias("_rating_rank"),
|
||||
)
|
||||
.with_columns(
|
||||
pl.col("NUMBER_HABITABLE_ROOMS")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.alias("_prev_rooms"),
|
||||
pl.col("TOTAL_FLOOR_AREA")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.alias("_prev_area"),
|
||||
pl.col("_rating_rank")
|
||||
.shift(1)
|
||||
.over("epc_address", "POSTCODE")
|
||||
.alias("_prev_rating_rank"),
|
||||
)
|
||||
.with_columns(
|
||||
pl.when(
|
||||
pl.col("NUMBER_HABITABLE_ROOMS").is_not_null()
|
||||
& pl.col("_prev_rooms").is_not_null()
|
||||
& (pl.col("NUMBER_HABITABLE_ROOMS") != pl.col("_prev_rooms"))
|
||||
)
|
||||
.then(pl.lit("Remodeling"))
|
||||
.when(
|
||||
pl.col("TOTAL_FLOOR_AREA").is_not_null()
|
||||
& pl.col("_prev_area").is_not_null()
|
||||
& (pl.col("TOTAL_FLOOR_AREA") > pl.col("_prev_area"))
|
||||
)
|
||||
.then(pl.lit("Extension"))
|
||||
.when(
|
||||
pl.col("_rating_rank").is_not_null()
|
||||
& pl.col("_prev_rating_rank").is_not_null()
|
||||
& (pl.col("_rating_rank") < pl.col("_prev_rating_rank"))
|
||||
)
|
||||
.then(pl.lit("Renovation"))
|
||||
.otherwise(pl.lit(None, dtype=pl.String))
|
||||
.alias("_event"),
|
||||
)
|
||||
.filter(pl.col("_event").is_not_null())
|
||||
.with_columns(
|
||||
pl.col("INSPECTION_DATE")
|
||||
.cast(pl.String)
|
||||
.str.slice(0, 4)
|
||||
.cast(pl.Int32)
|
||||
.alias("_event_year"),
|
||||
)
|
||||
.group_by("epc_address", "POSTCODE")
|
||||
.agg(
|
||||
pl.struct(
|
||||
pl.col("_event_year").alias("year"),
|
||||
pl.col("_event").alias("event"),
|
||||
).alias("renovation_history"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
|
||||
event_counts = events["renovation_history"].explode().struct.field("event").value_counts()
|
||||
print(f"Renovation events: {events.height} properties with events")
|
||||
print(event_counts)
|
||||
|
||||
# Left-join events back onto dedup EPC
|
||||
epc = epc.join(
|
||||
events.lazy(),
|
||||
on=["epc_address", "POSTCODE"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
print("EPC dataset")
|
||||
print(epc.head().collect())
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue