improve

2026-05-31 20:20:41 +01:00 · 2026-05-31 20:20:41 +01:00 · e8345cbdc1
commit e8345cbdc1
parent 8688b7475e
40 changed files with 1980 additions and 904 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -116,6 +116,66 @@ TREE_DENSITY_FEATURE = "Street tree density percentile"
 _POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
    r"^Tree canopy density percentile within \d+m$"
 )
+_FINAL_DROP_COLUMNS = [
+    "inspection_date",
+    "_bedrooms",
+    "LSOA name (2021)",
+    "Local Authority District code (2024)",
+    "Local Authority District name (2024)",
+    "Wider Barriers Sub-domain Score",
+    "Geographical Barriers Sub-domain Score",
+    "Adult Skills Sub-domain Score",
+    "Children and Young People Sub-domain Score",
+    "Crime Score",
+    "Living Environment Score",
+    "Index of Multiple Deprivation (IMD) Score",
+    "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
+    "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
+    "Barriers to Housing and Services Score",
+    "oa21",
+    "pcon",
+    "epc_property_type",
+    "pp_property_type",
+    "built_form",
+]
+_FINAL_RENAME_COLUMNS = {
+    "date_of_transfer": "Date of last transaction",
+    "construction_age_band": "Construction year",
+    "is_construction_date_approximate": "Is construction date approximate",
+    "Income Score (rate)": "Income Score",
+    "Employment Score (rate)": "Employment Score",
+    "Indoors Sub-domain Score": "Housing Conditions Score",
+    "Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
+    "pp_address": "Address per Property Register",
+    "epc_address": "Address per EPC",
+    "postcode": "Postcode",
+    "duration": "Leasehold/Freehold",
+    "current_energy_rating": "Current energy rating",
+    "potential_energy_rating": "Potential energy rating",
+    "total_floor_area": "Total floor area (sqm)",
+    "property_type": "Property type",
+    "restaurants_2km": "Number of restaurants within 2km",
+    "groceries_2km": "Number of grocery shops and supermarkets within 2km",
+    "latest_price": "Last known price",
+    "number_habitable_rooms": "Number of bedrooms & living rooms",
+    "noise_lden_db": "Noise (dB)",
+    "good_primary_5km": "Good+ primary schools within 5km",
+    "good_secondary_5km": "Good+ secondary schools within 5km",
+    "good_primary_2km": "Good+ primary schools within 2km",
+    "good_secondary_2km": "Good+ secondary schools within 2km",
+    "outstanding_primary_5km": "Outstanding primary schools within 5km",
+    "outstanding_secondary_5km": "Outstanding secondary schools within 5km",
+    "outstanding_primary_2km": "Outstanding primary schools within 2km",
+    "outstanding_secondary_2km": "Outstanding secondary schools within 2km",
+    "max_download_speed": "Max available download speed (Mbps)",
+    "serious_crime_avg_yr": "Serious crime (avg/yr)",
+    "minor_crime_avg_yr": "Minor crime (avg/yr)",
+    "mean_monthly_rent": "Estimated monthly rent",
+    "floor_height": "Interior height (m)",
+    "was_council_house": "Former council house",
+    "median_age": "Median age",
+    "turnout_pct": "Voter turnout (%)",
+}
 _RENT_SOURCE_UNAVAILABLE_LADS = {
    # ONS PIPR does not publish LAD-level private-rent estimates for these
    # small authorities. Keep rent null there, but fail on any other LAD miss.
@ -707,6 +767,181 @@ def _validate_property_postcodes(df: pl.DataFrame) -> None:
    )


+def _active_english_postcode_area(arcgis_raw: pl.LazyFrame) -> pl.LazyFrame:
+    """Return the supported postcode universe with geography join keys."""
+    return (
+        arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
+        .filter(pl.col("doterm").is_null())
+        .select(
+            pl.col("pcds").alias("postcode"),
+            "lat",
+            pl.col("long").alias("lon"),
+            "ctry25cd",
+            pl.col("lsoa21cd").alias("lsoa21"),
+            pl.col("oa21cd").alias("oa21"),
+            pl.col("pcon24cd").alias("pcon"),
+        )
+        .drop_nulls(["postcode"])
+        .unique(["postcode"])
+    )
+
+
+def _remap_terminated_postcodes(
+    wide: pl.LazyFrame, postcode_mapping: pl.LazyFrame
+) -> pl.LazyFrame:
+    return (
+        wide.join(
+            postcode_mapping,
+            left_on="postcode",
+            right_on="old_postcode",
+            how="left",
+        )
+        .with_columns(
+            pl.coalesce("new_postcode", "postcode").alias("postcode"),
+        )
+        .drop("new_postcode")
+    )
+
+
+def _filter_to_active_english_postcodes(
+    wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
+) -> pl.LazyFrame:
+    return wide.join(active_postcodes, on="postcode", how="semi")
+
+
+def _join_area_side_tables(
+    base: pl.LazyFrame,
+    *,
+    iod: pl.LazyFrame,
+    ethnicity: pl.LazyFrame,
+    crime: pl.LazyFrame,
+    median_age: pl.LazyFrame,
+    election: pl.LazyFrame,
+    poi_counts: pl.LazyFrame,
+    noise: pl.LazyFrame,
+    school_proximity: pl.LazyFrame,
+    conservation_areas: pl.LazyFrame,
+    tree_density: pl.LazyFrame | None,
+    broadband: pl.LazyFrame,
+) -> pl.LazyFrame:
+    base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
+    base = base.join(
+        ethnicity,
+        left_on="Local Authority District code (2024)",
+        right_on="Geography_code",
+        how="left",
+    )
+
+    # Crime is counted spatially per postcode (incidents within 50m of the
+    # postcode boundary), so it joins on postcode rather than LSOA.
+    base = base.join(crime, on="postcode", how="left")
+    base = base.with_columns(
+        pl.sum_horizontal(
+            "Violence and sexual offences (avg/yr)",
+            "Robbery (avg/yr)",
+            "Burglary (avg/yr)",
+            "Possession of weapons (avg/yr)",
+        ).alias("serious_crime_avg_yr"),
+        pl.sum_horizontal(
+            "Anti-social behaviour (avg/yr)",
+            "Criminal damage and arson (avg/yr)",
+            "Shoplifting (avg/yr)",
+            "Bicycle theft (avg/yr)",
+            "Theft from the person (avg/yr)",
+            "Other theft (avg/yr)",
+            "Vehicle crime (avg/yr)",
+            "Public order (avg/yr)",
+            "Drugs (avg/yr)",
+            "Other crime (avg/yr)",
+        ).alias("minor_crime_avg_yr"),
+    )
+
+    base = base.join(median_age, on="lsoa21", how="left")
+    base = base.join(election, on="pcon", how="left")
+    base = base.join(poi_counts, on="postcode", how="left")
+    base = base.join(noise, on="postcode", how="left")
+    base = base.join(school_proximity, on="postcode", how="left")
+    base = base.join(conservation_areas, on="postcode", how="left").with_columns(
+        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
+    )
+    if tree_density is not None:
+        base = base.join(tree_density, on="postcode", how="left")
+    return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
+
+
+def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
+    return frame.drop(_FINAL_DROP_COLUMNS, strict=False).rename(
+        _FINAL_RENAME_COLUMNS, strict=False
+    )
+
+
+def _area_columns_from(columns: list[str]) -> list[str]:
+    return [
+        c for c in columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
+    ]
+
+
+def _property_columns_from(columns: list[str]) -> list[str]:
+    return [
+        c
+        for c in columns
+        if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
+        or c == "Postcode"
+    ]
+
+
+def _validate_postcode_feature_output(
+    postcode_df: pl.DataFrame, expected_postcode_count: int
+) -> None:
+    required = {"Postcode", "lat", "lon", "ctry25cd"}
+    missing = sorted(required - set(postcode_df.columns))
+    if missing:
+        raise ValueError(f"Postcode feature output missing columns: {missing}")
+
+    unique_count = postcode_df["Postcode"].n_unique()
+    if (
+        postcode_df.height != expected_postcode_count
+        or unique_count != expected_postcode_count
+    ):
+        raise ValueError(
+            "Postcode feature output no longer matches the active England "
+            "postcode universe: "
+            f"rows={postcode_df.height:,}, unique={unique_count:,}, "
+            f"expected={expected_postcode_count:,}"
+        )
+
+    invalid = postcode_df.filter(
+        pl.col("Postcode").is_null()
+        | (pl.col("Postcode").cast(pl.Utf8).str.strip_chars() == "")
+        | pl.col("lat").is_null()
+        | pl.col("lon").is_null()
+        | pl.col("ctry25cd").is_null()
+        | (pl.col("ctry25cd") != "E92000001")
+    )
+    if invalid.height > 0:
+        sample = (
+            invalid.select("Postcode", "ctry25cd", "lat", "lon").head(10).to_dicts()
+        )
+        raise ValueError(
+            "Postcode feature output contains unsupported or ungeocoded rows: "
+            f"{invalid.height} rows. Sample: {sample}"
+        )
+
+
+def _split_normal_outputs(
+    df: pl.DataFrame,
+    postcode_features: pl.DataFrame,
+    *,
+    expected_postcode_count: int,
+) -> tuple[pl.DataFrame, pl.DataFrame]:
+    postcode_df = postcode_features.select(
+        _area_columns_from(postcode_features.columns)
+    )
+    _validate_postcode_feature_output(postcode_df, expected_postcode_count)
+    properties_df = df.select(_property_columns_from(df.columns))
+    return postcode_df, properties_df
+
+
 # Map listings-parquet source columns to the `_actual_*` overlay columns
 # carried alongside the wide frame through the postcode-keyed joins. After the
 # rest of the pipeline finalises, listing rows pick their canonical dashboard
@ -927,9 +1162,7 @@ def _best_listing_match(
    return best, float(best_score), "address", best_field


-def _load_listings_for_merge(
-    listings_path: Path, arcgis_path: Path
-) -> pl.DataFrame:
+def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
    """Read the listings parquet and prepare it for the wide-frame merge.

    Output is keyed by `_listing_idx` and carries:
@ -1032,7 +1265,11 @@ def _load_direct_epc_candidates(
        "_direct_epc_outcode": pl.Utf8,
        "_direct_epc_canonical_property_type": pl.Utf8,
        "_direct_epc_uprn": pl.Utf8,
-        **{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
+        **{
+            column: dtype
+            for column, dtype in _DIRECT_EPC_COLUMNS
+            if column.startswith("_direct_")
+        },
    }
    if not listing_outcodes:
        return pl.DataFrame(schema=schema)
@ -1089,9 +1326,7 @@ def _load_direct_epc_candidates(
            pl.col("epc_address").alias("_direct_epc_address"),
            pl.col("uprn").alias("_direct_epc_uprn"),
            pl.col("total_floor_area").alias("_direct_total_floor_area"),
-            pl.col("number_habitable_rooms").alias(
-                "_direct_number_habitable_rooms"
-            ),
+            pl.col("number_habitable_rooms").alias("_direct_number_habitable_rooms"),
            pl.col("floor_height").alias("_direct_floor_height"),
            pl.col("_direct_was_council_house").fill_null("No"),
        )
@ -1141,9 +1376,7 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
    )


-def _optional_lazy_col(
-    schema: pl.Schema, column: str, dtype: pl.DataType
-) -> pl.Expr:
+def _optional_lazy_col(schema: pl.Schema, column: str, dtype: pl.DataType) -> pl.Expr:
    if column in schema:
        return pl.col(column).cast(dtype, strict=False).alias(column)
    return pl.lit(None, dtype=dtype).alias(column)
@ -1640,27 +1873,18 @@ def _build(
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
    )

-    # Remap terminated postcodes to nearest active successor
+    # Remap terminated postcodes to nearest active successor before filtering to
+    # the supported active-English postcode universe. Historical properties from
+    # terminated English postcodes are retained under their successor postcode.
    postcode_mapping = build_postcode_mapping(arcgis_path)
-    wide = (
-        wide.join(
-            postcode_mapping.lazy(),
-            left_on="postcode",
-            right_on="old_postcode",
-            how="left",
-        )
-        .with_columns(
-            pl.coalesce("new_postcode", "postcode").alias("postcode"),
-        )
-        .drop("new_postcode")
-    )
-
+    wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
    arcgis_raw = pl.scan_parquet(arcgis_path)
-    postcode_country = arcgis_raw.select(
-        pl.col("pcds").alias("postcode"),
-        pl.col("ctry25cd"),
-    ).unique(["postcode"])
-    wide = wide.join(postcode_country, on="postcode", how="left")
+    arcgis = _active_english_postcode_area(arcgis_raw)
+    active_postcodes = arcgis.select("postcode").unique()
+    active_postcode_count = (
+        active_postcodes.select(pl.len()).collect(engine="streaming").item()
+    )
+    wide = _filter_to_active_english_postcodes(wide, active_postcodes)

    if listed_buildings_path is not None:
        active_postcodes_for_listed = (
@ -1691,92 +1915,25 @@ def _build(
            arcgis_path,
            epc_path=actual_listings_epc_path,
        )
+        wide = _filter_to_active_english_postcodes(wide, active_postcodes)

    wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))

-    arcgis = (
-        arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")  # England only
-        .filter(pl.col("doterm").is_null())  # Active postcodes only
-        # NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
-        # Alias them back to the short canonical names used across the
-        # pipeline so downstream joins don't need to know about NSPL's
-        # versioning scheme.
-        .select(
-            pl.col("pcds").alias("postcode"),
-            "lat",
-            pl.col("long").alias("lon"),
-            pl.col("lsoa21cd").alias("lsoa21"),
-            pl.col("oa21cd").alias("oa21"),
-            pl.col("pcon24cd").alias("pcon"),
-        )
-    )
+    # NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
+    # `_active_english_postcode_area` aliases them back to the short canonical
+    # names used across the pipeline so downstream joins don't need to know
+    # about NSPL's versioning scheme.
    wide = wide.join(arcgis, on="postcode", how="left")
+    postcode_area = arcgis

    iod = pl.scan_parquet(iod_path).with_columns(
        *(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
    )
-    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
-
    ethnicity = pl.scan_parquet(ethnicity_path)
-    wide = wide.join(
-        ethnicity,
-        left_on="Local Authority District code (2024)",
-        right_on="Geography_code",
-        how="left",
-    )
-
-    # Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
-    wide = wide.with_columns(
-        (pl.col("number_habitable_rooms") - 1)
-        .clip(0, 4)
-        .cast(pl.UInt8)
-        .alias("_bedrooms"),
-    )
-    rental = pl.scan_parquet(rental_prices_path).select(
-        "area_code", "bedrooms", "mean_monthly_rent"
-    )
-    wide = wide.join(
-        rental,
-        left_on=["Local Authority District code (2024)", "_bedrooms"],
-        right_on=["area_code", "bedrooms"],
-        how="left",
-    )
-
-    # Crime is counted spatially per postcode (incidents within 50m of the
-    # postcode boundary), so it joins on postcode rather than LSOA.
    crime = pl.scan_parquet(crime_path)
-    wide = wide.join(crime, on="postcode", how="left")
-
-    wide = wide.with_columns(
-        pl.sum_horizontal(
-            "Violence and sexual offences (avg/yr)",
-            "Robbery (avg/yr)",
-            "Burglary (avg/yr)",
-            "Possession of weapons (avg/yr)",
-        ).alias("serious_crime_avg_yr"),
-        pl.sum_horizontal(
-            "Anti-social behaviour (avg/yr)",
-            "Criminal damage and arson (avg/yr)",
-            "Shoplifting (avg/yr)",
-            "Bicycle theft (avg/yr)",
-            "Theft from the person (avg/yr)",
-            "Other theft (avg/yr)",
-            "Vehicle crime (avg/yr)",
-            "Public order (avg/yr)",
-            "Drugs (avg/yr)",
-            "Other crime (avg/yr)",
-        ).alias("minor_crime_avg_yr"),
-    )
-
    median_age = pl.scan_parquet(median_age_path)
-    wide = wide.join(median_age, on="lsoa21", how="left")
-
    election = pl.scan_parquet(election_results_path)
-    wide = wide.join(election, on="pcon", how="left")
-
    poi_counts = pl.scan_parquet(poi_proximity_path)
-    wide = wide.join(poi_counts, on="postcode", how="left")
-
    noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
    noise = (
        pl.scan_parquet(noise_path)
@ -1789,21 +1946,13 @@ def _build(
        )
        .select("postcode", "noise_lden_db")
    )
-    wide = wide.join(noise, on="postcode", how="left")
-
    school_proximity = pl.scan_parquet(school_proximity_path)
-    wide = wide.join(school_proximity, on="postcode", how="left")
-
    conservation_areas = _conservation_area_by_postcode(
        arcgis.select("postcode", "lat", "lon"), conservation_areas_path
    )
-    wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
-        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
-    )
-
+    tree_density = None
    if tree_density_postcodes_path is not None:
        tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
-        wide = wide.join(tree_density, on="postcode", how="left")

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages.  Tiers: Gigabit ≥1000, UFBB ≥300,
@ -1828,7 +1977,38 @@ def _build(
        .agg(pl.col("max_download_speed").max())
        .with_columns(pl.col("max_download_speed").cast(pl.Utf8))
    )
-    wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
+    area_side_tables = {
+        "iod": iod,
+        "ethnicity": ethnicity,
+        "crime": crime,
+        "median_age": median_age,
+        "election": election,
+        "poi_counts": poi_counts,
+        "noise": noise,
+        "school_proximity": school_proximity,
+        "conservation_areas": conservation_areas,
+        "tree_density": tree_density,
+        "broadband": broadband,
+    }
+    wide = _join_area_side_tables(wide, **area_side_tables)
+    postcode_area = _join_area_side_tables(postcode_area, **area_side_tables)
+
+    # Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
+    wide = wide.with_columns(
+        (pl.col("number_habitable_rooms") - 1)
+        .clip(0, 4)
+        .cast(pl.UInt8)
+        .alias("_bedrooms"),
+    )
+    rental = pl.scan_parquet(rental_prices_path).select(
+        "area_code", "bedrooms", "mean_monthly_rent"
+    )
+    wide = wide.join(
+        rental,
+        left_on=["Local Authority District code (2024)", "_bedrooms"],
+        right_on=["area_code", "bedrooms"],
+        how="left",
+    )

    # Derive property_type: prefer EPC data, fall back to price-paid.
    # For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
@ -1862,112 +2042,40 @@ def _build(
        .alias("property_type")
    )

-    wide = (
-        wide.with_columns(
-            pl.when(pl.col("duration") == "U")
-            .then(None)
-            .otherwise(pl.col("duration"))
-            .alias("duration"),
-            pl.when(pl.col("current_energy_rating") == "INVALID!")
-            .then(None)
-            .otherwise(pl.col("current_energy_rating"))
-            .alias("current_energy_rating"),
-        )
-        .with_columns(
-            (pl.col("latest_price") / pl.col("total_floor_area"))
-            .round(0)
-            .cast(pl.Int32)
-            .alias("Price per sqm"),
-        )
-        .drop(
-            "inspection_date",
-            "_bedrooms",
-            "LSOA name (2021)",
-            "Local Authority District code (2024)",
-            "Local Authority District name (2024)",
-            "Wider Barriers Sub-domain Score",
-            "Geographical Barriers Sub-domain Score",
-            "Adult Skills Sub-domain Score",
-            "Children and Young People Sub-domain Score",
-            "Crime Score",
-            "Living Environment Score",
-            "Index of Multiple Deprivation (IMD) Score",
-            "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
-            "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
-            "Barriers to Housing and Services Score",
-            "oa21",
-            "pcon",
-            "epc_property_type",
-            "pp_property_type",
-            "built_form",
-        )
-        .rename(
-            {
-                "date_of_transfer": "Date of last transaction",
-                "construction_age_band": "Construction year",
-                "is_construction_date_approximate": "Is construction date approximate",
-                "Income Score (rate)": "Income Score",
-                "Employment Score (rate)": "Employment Score",
-                "Indoors Sub-domain Score": "Housing Conditions Score",
-                "Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
-                "pp_address": "Address per Property Register",
-                "epc_address": "Address per EPC",
-                "postcode": "Postcode",
-                "duration": "Leasehold/Freehold",
-                "current_energy_rating": "Current energy rating",
-                "potential_energy_rating": "Potential energy rating",
-                "total_floor_area": "Total floor area (sqm)",
-                "property_type": "Property type",
-                "restaurants_2km": "Number of restaurants within 2km",
-                "groceries_2km": "Number of grocery shops and supermarkets within 2km",
-                "latest_price": "Last known price",
-                "number_habitable_rooms": "Number of bedrooms & living rooms",
-                "noise_lden_db": "Noise (dB)",
-                "good_primary_5km": "Good+ primary schools within 5km",
-                "good_secondary_5km": "Good+ secondary schools within 5km",
-                "good_primary_2km": "Good+ primary schools within 2km",
-                "good_secondary_2km": "Good+ secondary schools within 2km",
-                "outstanding_primary_5km": "Outstanding primary schools within 5km",
-                "outstanding_secondary_5km": "Outstanding secondary schools within 5km",
-                "outstanding_primary_2km": "Outstanding primary schools within 2km",
-                "outstanding_secondary_2km": "Outstanding secondary schools within 2km",
-                "max_download_speed": "Max available download speed (Mbps)",
-                "serious_crime_avg_yr": "Serious crime (avg/yr)",
-                "minor_crime_avg_yr": "Minor crime (avg/yr)",
-                "mean_monthly_rent": "Estimated monthly rent",
-                "floor_height": "Interior height (m)",
-                "was_council_house": "Former council house",
-                "median_age": "Median age",
-                "turnout_pct": "Voter turnout (%)",
-            }
-        )
+    wide = wide.with_columns(
+        pl.when(pl.col("duration") == "U")
+        .then(None)
+        .otherwise(pl.col("duration"))
+        .alias("duration"),
+        pl.when(pl.col("current_energy_rating") == "INVALID!")
+        .then(None)
+        .otherwise(pl.col("current_energy_rating"))
+        .alias("current_energy_rating"),
+    ).with_columns(
+        (pl.col("latest_price") / pl.col("total_floor_area"))
+        .round(0)
+        .cast(pl.Int32)
+        .alias("Price per sqm"),
    )
+    wide = _finalize_merged_columns(wide)
+    postcode_area = _finalize_merged_columns(postcode_area)

    print("Collecting with streaming engine...")
-    df = wide.collect(engine="streaming")

    if mode == "listings":
+        df = wide.collect(engine="streaming")
        enriched_listings = _finalize_listings(df)
        _validate_property_postcodes(enriched_listings)
        print(f"Enriched listings rows: {enriched_listings.height}")
        return _BuildResult(listings=enriched_listings)

+    df, postcode_features = pl.collect_all([wide, postcode_area], engine="streaming")
    _validate_property_postcodes(df)

-    # Split into postcode-level and property-level dataframes
-    area_cols = [
-        c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
-    ]
-    postcode_df = df.select(area_cols).group_by("Postcode").first()
+    postcode_df, properties_df = _split_normal_outputs(
+        df, postcode_features, expected_postcode_count=active_postcode_count
+    )
    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
-
-    property_cols = [
-        c
-        for c in df.columns
-        if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
-        or c == "Postcode"
-    ]
-    properties_df = df.select(property_cols)
    print(f"Property rows: {properties_df.height}")

    return _BuildResult(postcode=postcode_df, properties=properties_df)