# perfect-postcode/pipeline/transform/merge.py

import argparse

import polars as pl
from pathlib import Path

from pipeline.utils.postcode_mapping import build_postcode_mapping

# Floor-area threshold used by _build: rows whose total_floor_area is not
# strictly greater than this value are dropped, while rows with a null
# total_floor_area are kept. Units are square metres per the constant's name.
MIN_FLOOR_AREA_M2 = 10


def _join_journey_times(
    wide: pl.LazyFrame,
    journey_times_path: Path,
    destination_name: str,
) -> pl.LazyFrame:
    """Left-join one destination's journey times onto ``wide`` by postcode.

    The public-transport and cycling minute columns from the journey-time
    file are renamed to destination-specific display names before joining.
    Rows are sorted by public-transport time (nulls last) and then reduced to
    the first row per postcode, so the join adds at most one row per postcode.
    """
    transit_label = f"Public transport to {destination_name} (mins)"
    cycling_label = f"Cycling to {destination_name} (mins)"

    renamed = pl.scan_parquet(journey_times_path).select(
        "postcode",
        pl.col("public_transport_quick_minutes").alias(transit_label),
        pl.col("cycling_minutes").alias(cycling_label),
    )

    # Sort first so that taking the first row per postcode is intended to keep
    # the entry with the quickest public-transport time.
    ordered = renamed.sort(transit_label, nulls_last=True)
    one_per_postcode = ordered.group_by("postcode").first()

    return wide.join(one_per_postcode, on="postcode", how="left")


# Columns that belong to the postcode-level output of _build. The names are the
# final (post-rename) display names. After collecting, _build selects whichever
# of these are present into the postcode frame (one row per "Postcode"); every
# other column, plus "Postcode" itself, goes to the property frame.
_AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
    # Transport
    "Public transport to Bank (mins)",
    "Cycling to Bank (mins)",
    "Public transport to Fitzrovia (mins)",
    "Cycling to Fitzrovia (mins)",
    # Deprivation
    "Income Score (rate)",
    "Employment Score (rate)",
    "Education, Skills and Training Score",
    "Health Deprivation and Disability Score",
    "Living Environment Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
    # Ethnicity
    "% Asian",
    "% Black",
    "% Mixed",
    "% White",
    "% Other",
    # Crime
    "Anti-social behaviour (avg/yr)",
    "Violence and sexual offences (avg/yr)",
    "Criminal damage and arson (avg/yr)",
    "Burglary (avg/yr)",
    "Vehicle crime (avg/yr)",
    "Robbery (avg/yr)",
    "Other theft (avg/yr)",
    "Shoplifting (avg/yr)",
    "Drugs (avg/yr)",
    "Possession of weapons (avg/yr)",
    "Public order (avg/yr)",
    "Bicycle theft (avg/yr)",
    "Theft from the person (avg/yr)",
    "Other crime (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
    # Amenities
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
    "Number of parks within 2km",
    "Number of public transport stations within 2km",
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
    # GeoSure
    "Environmental risk",
    "Collapsible deposits risk",
    "Compressible ground risk",
    "Landslide risk",
    "Running sand risk",
    "Shrink-swell risk",
    "Soluble rocks risk",
]


def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
    iod_path: Path,
    poi_proximity_path: Path,
    journey_times_bank_path: Path,
    journey_times_fitzrovia_path: Path,
    ethnicity_path: Path,
    crime_path: Path,
    noise_path: Path,
    school_proximity_path: Path,
    broadband_path: Path,
    geosure_path: Path,
    rental_prices_path: Path,
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Build postcode and properties dataframes from epc_pp + auxiliary data.

    Lazily scans the EPC/price-paid parquet, left-joins every auxiliary
    dataset onto it, derives a few columns, renames columns to their display
    names, collects with the streaming engine, and finally splits the result
    by column into a postcode-level frame and a property-level frame.

    Every path is scanned unconditionally, so all of them must point at a
    readable parquet file.

    Args:
        epc_pp_path: Base parquet file that all other data is joined onto.
        arcgis_path: Postcode directory parquet; used for the terminated
            postcode remapping and for lat/lon and LSOA/OA codes.
        iod_path: Index of Deprivation parquet, joined on LSOA 2021 code.
        poi_proximity_path: POI counts parquet, joined on postcode.
        journey_times_bank_path: Journey times to Bank, joined on postcode.
        journey_times_fitzrovia_path: Journey times to Fitzrovia, joined on
            postcode.
        ethnicity_path: Ethnicity parquet, joined on local authority code.
        crime_path: Crime parquet, joined on LSOA code.
        noise_path: Noise parquet with road/rail/airport columns, joined on
            postcode.
        school_proximity_path: School counts parquet, joined on postcode.
        broadband_path: Broadband availability parquet, joined on postcode.
        geosure_path: GeoSure parquet, joined on postcode.
        rental_prices_path: Rental prices parquet, joined on local authority
            code and derived bedroom count.

    Returns (postcode_df, properties_df).
    """
    # Keep rows with an unknown floor area; drop rows whose recorded area is
    # not strictly above the threshold.
    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
    )

    # Remap terminated postcodes to nearest active successor
    # Rows without a mapping keep their original postcode via the coalesce;
    # the helper column new_postcode is dropped afterwards.
    postcode_mapping = build_postcode_mapping(arcgis_path)
    wide = wide.join(
        postcode_mapping.lazy(), left_on="postcode", right_on="old_postcode", how="left"
    ).with_columns(
        pl.coalesce("new_postcode", "postcode").alias("postcode"),
    ).drop("new_postcode")

    # Coordinates and statistical-geography codes per postcode. Because this is
    # a left join, postcodes filtered out here get nulls for these columns.
    arcgis = (
        pl.scan_parquet(arcgis_path)
        .filter(pl.col("ctry") == "E92000001")  # England only
        .filter(pl.col("doterm").is_null())  # Active postcodes only
        .select(
            pl.col("pcds").alias("postcode"),
            "lat",
            pl.col("long").alias("lon"),
            "lsoa21",
            "oa21",
        )
    )
    wide = wide.join(arcgis, on="postcode", how="left")

    wide = _join_journey_times(wide, journey_times_bank_path, "Bank")
    wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia")

    # Deprivation data is keyed by LSOA, so it needs lsoa21 from the arcgis join.
    # NOTE(review): "Local Authority District code (2024)", used as a join key
    # further down, presumably comes from this file — confirm.
    iod = pl.scan_parquet(iod_path)
    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

    # Invert deprivation scores so that higher values = less deprived (better)
    iod_score_cols = [
        "Education, Skills and Training Score",
        "Income Score (rate)",
        "Employment Score (rate)",
        "Health Deprivation and Disability Score",
        "Living Environment Score",
        "Indoors Sub-domain Score",
        "Outdoors Sub-domain Score",
    ]
    # Each score is replaced by (column maximum - score); the maximum is taken
    # over the joined frame at this point in the pipeline.
    wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))

    ethnicity = pl.scan_parquet(ethnicity_path)
    wide = wide.join(
        ethnicity,
        left_on="Local Authority District code (2024)",
        right_on="Geography_code",
        how="left",
    )

    # Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
    # _bedrooms is a temporary join key only; it is dropped before output.
    wide = wide.with_columns(
        (pl.col("number_habitable_rooms") - 1)
        .clip(0, 4)
        .cast(pl.UInt8)
        .alias("_bedrooms"),
    )
    rental = pl.scan_parquet(rental_prices_path)
    wide = wide.join(
        rental,
        left_on=["Local Authority District code (2024)", "_bedrooms"],
        right_on=["area_code", "bedrooms"],
        how="left",
    )

    crime = pl.scan_parquet(crime_path)
    wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")

    # Row-wise totals of the per-category crime averages, grouped into a
    # "serious" and a "minor" aggregate.
    wide = wide.with_columns(
        pl.sum_horizontal(
            "Violence and sexual offences (avg/yr)",
            "Robbery (avg/yr)",
            "Burglary (avg/yr)",
            "Possession of weapons (avg/yr)",
        ).alias("serious_crime_avg_yr"),
        pl.sum_horizontal(
            "Anti-social behaviour (avg/yr)",
            "Criminal damage and arson (avg/yr)",
            "Shoplifting (avg/yr)",
            "Bicycle theft (avg/yr)",
            "Theft from the person (avg/yr)",
            "Other theft (avg/yr)",
            "Vehicle crime (avg/yr)",
            "Public order (avg/yr)",
            "Drugs (avg/yr)",
            "Other crime (avg/yr)",
        ).alias("minor_crime_avg_yr"),
    )

    poi_counts = pl.scan_parquet(poi_proximity_path)
    wide = wide.join(poi_counts, on="postcode", how="left")

    # Collapse the three noise sources into one value per postcode: the
    # loudest of road, rail and airport noise.
    noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
    noise = (
        pl.scan_parquet(noise_path)
        .with_columns(
            # NaN → null so max_horizontal ignores missing instead of propagating NaN
            *[pl.col(c).fill_nan(None) for c in noise_cols],
        )
        .with_columns(
            pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
        )
        .select("postcode", "noise_lden_db")
    )
    wide = wide.join(noise, on="postcode", how="left")

    school_proximity = pl.scan_parquet(school_proximity_path)
    wide = wide.join(school_proximity, on="postcode", how="left")

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages.  Tiers: Gigabit ≥1000, UFBB ≥300,
    # UFBB(100) ≥100, SFBB ≥30 Mbps.
    # Rows matching none of the tiers get 10. If a postcode appears more than
    # once, the highest tier across its rows is kept.
    broadband = (
        pl.scan_parquet(broadband_path)
        .select(
            pl.col("postcode_space").alias("bb_postcode"),
            pl.when(pl.col("Gigabit availability (% premises)") > 0)
            .then(1000)
            .when(pl.col("UFBB availability (% premises)") > 0)
            .then(300)
            .when(pl.col("UFBB (100Mbit/s) availability (% premises)") > 0)
            .then(100)
            .when(pl.col("SFBB availability (% premises)") > 0)
            .then(30)
            .otherwise(10)
            .cast(pl.UInt16)
            .alias("max_download_speed"),
        )
        .group_by("bb_postcode")
        .agg(pl.col("max_download_speed").max())
    )
    wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")

    geosure = pl.scan_parquet(geosure_path)
    wide = wide.join(geosure, on="postcode", how="left")

    # Derive property_type: prefer EPC data, fall back to price-paid.
    # For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
    # Precedence:
    #   1. EPC house with a usable built_form -> built_form
    #   2. EPC house without a usable built_form -> price-paid type
    #   3. any other EPC type -> EPC type
    #   4. no EPC type -> price-paid type
    bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = pl.col("epc_property_type").is_not_null()
    is_house = pl.col("epc_property_type") == "House"
    wide = wide.with_columns(
        pl.when(has_epc & is_house & ~bad_built_form)
        .then(pl.col("built_form"))
        .when(has_epc & is_house)
        .then(pl.col("pp_property_type"))
        .when(has_epc)
        .then(pl.col("epc_property_type"))
        .otherwise(pl.col("pp_property_type"))
        # Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes"
        .replace({"Flat": "Flats/Maisonettes", "Maisonette": "Flats/Maisonettes"})
        .alias("property_type")
    )

    # Final clean-up: null out placeholder values, add price per square metre,
    # drop helper/unused columns, then rename to user-facing display names.
    wide = (
        wide.with_columns(
            # "U" in duration and "INVALID!" in the energy rating are treated
            # as missing values.
            pl.when(pl.col("duration") == "U")
            .then(None)
            .otherwise(pl.col("duration"))
            .alias("duration"),
            pl.when(pl.col("current_energy_rating") == "INVALID!")
            .then(None)
            .otherwise(pl.col("current_energy_rating"))
            .alias("current_energy_rating"),
        )
        .with_columns(
            # Price divided by floor area, rounded to a whole number.
            (pl.col("latest_price") / pl.col("total_floor_area"))
            .round(0)
            .cast(pl.Int32)
            .alias("Price per sqm"),
        )
        .drop(
            "inspection_date",
            "_bedrooms",
            "LSOA name (2021)",
            "Local Authority District code (2024)",
            "Local Authority District name (2024)",
            "Wider Barriers Sub-domain Score",
            "Geographical Barriers Sub-domain Score",
            "Adult Skills Sub-domain Score",
            "Children and Young People Sub-domain Score",
            "Crime Score",
            "Index of Multiple Deprivation (IMD) Score",
            "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
            "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
            "Barriers to Housing and Services Score",
            "lsoa21",
            "oa21",
            "epc_property_type",
            "pp_property_type",
            "built_form",
        )
        # NOTE(review): "Leashold/Freehold" below looks like a typo for
        # "Leasehold/Freehold". It is an output column name, so confirm that no
        # downstream consumer relies on the current spelling before changing it.
        .rename(
            {
                "date_of_transfer": "Date of last transaction",
                "construction_age_band": "Construction age",
                "is_construction_date_approximate": "Is construction date approximate",
                "pp_address": "Address per Property Register",
                "epc_address": "Address per EPC",
                "postcode": "Postcode",
                "duration": "Leashold/Freehold",
                "current_energy_rating": "Current energy rating",
                "potential_energy_rating": "Potential energy rating",
                "total_floor_area": "Total floor area (sqm)",
                "property_type": "Property type",
                "restaurants_2km": "Number of restaurants within 2km",
                "groceries_2km": "Number of grocery shops and supermarkets within 2km",
                "parks_2km": "Number of parks within 2km",
                "public_transport_2km": "Number of public transport stations within 2km",
                "latest_price": "Last known price",
                "number_habitable_rooms": "Number of bedrooms & living rooms",
                "noise_lden_db": "Noise (dB)",
                "good_primary_5km": "Good+ primary schools within 5km",
                "good_secondary_5km": "Good+ secondary schools within 5km",
                "max_download_speed": "Max available download speed (Mbps)",
                "serious_crime_avg_yr": "Serious crime (avg/yr)",
                "minor_crime_avg_yr": "Minor crime (avg/yr)",
                "environmental_risk": "Environmental risk",
                "collapsible_deposits_risk": "Collapsible deposits risk",
                "compressible_ground_risk": "Compressible ground risk",
                "landslide_risk": "Landslide risk",
                "running_sand_risk": "Running sand risk",
                "shrink_swell_risk": "Shrink-swell risk",
                "soluble_rocks_risk": "Soluble rocks risk",
                "median_monthly_rent": "Estimated monthly rent",
                "floor_height": "Interior height (m)",
            }
        )
    )

    # Everything above only built a lazy query plan; this is where it executes.
    print("Collecting with streaming engine...")
    df = wide.collect(engine="streaming")

    # Split into postcode-level and property-level dataframes
    # Postcode frame: the _AREA_COLUMNS that exist in df, reduced to the first
    # row seen for each Postcode.
    area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
    postcode_df = df.select(area_cols).group_by("Postcode").first()
    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")

    # Property frame: all remaining columns, plus Postcode so the two frames
    # share a key.
    property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
    properties_df = df.select(property_cols)
    print(f"Property rows: {properties_df.height}")

    return postcode_df, properties_df


def main():
    """Parse CLI arguments, run the merge, and write both parquet outputs.

    Every path argument is required. ``_build`` scans each input file
    unconditionally, so a missing input is rejected by argparse up front
    (exit status 2 with a usage message) instead of reaching ``_build`` as
    ``None``.

    For each output the column list, row count, destination path and file
    size are printed.
    """
    parser = argparse.ArgumentParser(
        description="Build wide property dataframe with all joins"
    )

    # (flag, help text) for every path argument, in --help display order.
    path_arguments = (
        ("--epc-pp", "EPC-Price Paid joined parquet file"),
        ("--arcgis", "ArcGIS postcode data parquet file"),
        ("--iod", "Index of Deprivation parquet file"),
        ("--poi-proximity", "POI proximity counts parquet file"),
        ("--journey-times-bank", "Journey times to Bank parquet file"),
        ("--journey-times-fitzrovia", "Journey times to Fitzrovia parquet file"),
        ("--ethnicity", "Ethnicity by local authority parquet file"),
        ("--crime", "Crime by LSOA parquet file"),
        ("--noise", "Noise (road, rail, airport) by postcode parquet file"),
        ("--school-proximity", "School proximity counts parquet file"),
        ("--broadband", "Broadband performance by output area parquet file"),
        ("--geosure", "GeoSure ground stability parquet file"),
        ("--rental-prices", "ONS rental prices by LA and bedroom count parquet file"),
        ("--output-postcodes", "Output postcode parquet file path"),
        ("--output-properties", "Output properties parquet file path"),
    )
    for flag, help_text in path_arguments:
        parser.add_argument(flag, type=Path, required=True, help=help_text)

    args = parser.parse_args()

    postcode_df, properties_df = _build(
        epc_pp_path=args.epc_pp,
        arcgis_path=args.arcgis,
        iod_path=args.iod,
        poi_proximity_path=args.poi_proximity,
        journey_times_bank_path=args.journey_times_bank,
        journey_times_fitzrovia_path=args.journey_times_fitzrovia,
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,
        school_proximity_path=args.school_proximity,
        broadband_path=args.broadband,
        geosure_path=args.geosure,
        rental_prices_path=args.rental_prices,
    )

    # Same report-then-write sequence for both outputs, postcodes first.
    outputs = (
        ("Postcode", postcode_df, args.output_postcodes),
        ("Property", properties_df, args.output_properties),
    )
    for label, frame, out_path in outputs:
        print(f"\n{label} columns: {frame.columns}")
        print(f"{label} rows: {frame.height}")
        frame.write_parquet(out_path)
        size_mb = out_path.stat().st_size / (1024 * 1024)
        print(f"Wrote {out_path} ({size_mb:.1f} MB)")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()