Outer join

This commit is contained in:
Andras Schmelczer 2026-02-07 15:31:03 +00:00
parent d4d79f0d99
commit adaebfbd2a

View file

@@ -44,7 +44,16 @@ def _build_wide(
geosure_path: Path, geosure_path: Path,
) -> pl.DataFrame: ) -> pl.DataFrame:
"""Build the wide dataframe by joining epc_pp with all auxiliary data.""" """Build the wide dataframe by joining epc_pp with all auxiliary data."""
wide = pl.scan_parquet(epc_pp_path) wide = (
pl.scan_parquet(epc_pp_path)
.filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
)
.filter(
pl.col("latest_price").is_null() | (pl.col("latest_price") >= MIN_PRICE)
)
)
arcgis = pl.scan_parquet(arcgis_path).select( arcgis = pl.scan_parquet(arcgis_path).select(
pl.col("pcds").alias("postcode"), pl.col("pcds").alias("postcode"),
@@ -53,7 +62,7 @@ def _build_wide(
"lsoa21", "lsoa21",
"oa21", "oa21",
) )
wide = wide.join(arcgis, on="postcode", how="inner") wide = wide.join(arcgis, on="postcode", how="full", coalesce=True)
wide = _join_journey_times(wide, journey_times_bank_path, "Bank") wide = _join_journey_times(wide, journey_times_bank_path, "Bank")
wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia") wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia")
@@ -149,9 +158,7 @@ def _build_wide(
) )
wide = ( wide = (
wide.filter(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2) wide.with_columns(
.filter(pl.col("latest_price") >= MIN_PRICE)
.with_columns(
pl.when(pl.col("duration") == "U") pl.when(pl.col("duration") == "U")
.then(None) .then(None)
.otherwise(pl.col("duration")) .otherwise(pl.col("duration"))