Outer join

This commit is contained in:
Andras Schmelczer 2026-02-07 15:31:03 +00:00
parent d4d79f0d99
commit adaebfbd2a

View file

@ -44,7 +44,16 @@ def _build_wide(
geosure_path: Path,
) -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
wide = pl.scan_parquet(epc_pp_path)
wide = (
pl.scan_parquet(epc_pp_path)
.filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
)
.filter(
pl.col("latest_price").is_null() | (pl.col("latest_price") >= MIN_PRICE)
)
)
arcgis = pl.scan_parquet(arcgis_path).select(
pl.col("pcds").alias("postcode"),
@ -53,7 +62,7 @@ def _build_wide(
"lsoa21",
"oa21",
)
wide = wide.join(arcgis, on="postcode", how="inner")
wide = wide.join(arcgis, on="postcode", how="full", coalesce=True)
wide = _join_journey_times(wide, journey_times_bank_path, "Bank")
wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia")
@ -149,9 +158,7 @@ def _build_wide(
)
wide = (
wide.filter(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
.filter(pl.col("latest_price") >= MIN_PRICE)
.with_columns(
wide.with_columns(
pl.when(pl.col("duration") == "U")
.then(None)
.otherwise(pl.col("duration"))