# perfect-postcode/pipeline/transform/merge.py

import argparse
import re
import tempfile
from dataclasses import dataclass
from typing import Literal

import numpy as np
import polars as pl
from pathlib import Path

import pyogrio
from pyproj import Transformer
from scipy.spatial import cKDTree
from shapely import from_wkb, points
from shapely.geometry.base import BaseGeometry
from shapely.strtree import STRtree
from thefuzz import fuzz

from pipeline.local_temp import local_tmp_dir
from pipeline.transform.join_epc_pp import _scan_epc_certificates
from pipeline.utils.fuzzy_join import (
    normalize_address_key,
    normalize_postcode_key,
)
from pipeline.utils.postcode_mapping import build_postcode_mapping

# Floor-area threshold in square metres.
# NOTE(review): not referenced in the visible part of this file — presumably a
# lower bound for plausible floor areas; confirm at the use site.
MIN_FLOOR_AREA_M2 = 10
# Output column names for the conservation-area and listed-building flags.
CONSERVATION_AREA_FEATURE = "Within conservation area"
LISTED_BUILDING_FEATURE = "Listed building"
# Defaults for assigning listed-building points to nearby postcodes: the search
# radius in metres and how many nearest postcodes each point may be offered to.
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
# Default minimum fuzzy token-set score for an address to match a listed name.
LISTED_BUILDING_MIN_MATCH_SCORE = 95
# `dataset` value that identifies conservation-area records in planning data.
PLANNING_DATA_CONSERVATION_AREA_DATASET = "conservation-area"

# Indices of Deprivation (IoD) score column names.
# NOTE(review): not referenced in the visible part of this file — going by the
# name, these are the scores re-expressed as percentiles; confirm at the use site.
_IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
    "Income Score (rate)",
    "Employment Score (rate)",
    "Health Deprivation and Disability Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
]


# Postcode-level column names, grouped by theme.
# NOTE(review): the consumer of this list is outside the visible part of the
# file — presumably the columns selected for the area-level output; confirm.
_AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
    # Runtime provenance for deciding whether missing coordinates are skippable.
    "ctry25cd",
    # Keyed lookup for postcode-level side tables (e.g. crime time series).
    "lsoa21",
    # Deprivation
    "Income Score",
    "Employment Score",
    "Education, Skills and Training Score",
    "Health Deprivation and Disability Score",
    "Housing Conditions Score",
    "Air Quality and Road Safety Score",
    # Ethnicity
    "% South Asian",
    "% East Asian",
    "% Black",
    "% Mixed",
    "% White",
    "% Other",
    # Crime
    "Anti-social behaviour (avg/yr)",
    "Violence and sexual offences (avg/yr)",
    "Criminal damage and arson (avg/yr)",
    "Burglary (avg/yr)",
    "Vehicle crime (avg/yr)",
    "Robbery (avg/yr)",
    "Other theft (avg/yr)",
    "Shoplifting (avg/yr)",
    "Drugs (avg/yr)",
    "Possession of weapons (avg/yr)",
    "Public order (avg/yr)",
    "Bicycle theft (avg/yr)",
    "Theft from the person (avg/yr)",
    "Other crime (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
    "Serious crime per 1k residents (avg/yr)",
    "Minor crime per 1k residents (avg/yr)",
    # Amenities
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
    CONSERVATION_AREA_FEATURE,
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
    "Good+ primary schools within 2km",
    "Good+ secondary schools within 2km",
    "Outstanding primary schools within 5km",
    "Outstanding secondary schools within 5km",
    "Outstanding primary schools within 2km",
    "Outstanding secondary schools within 2km",
    # Demographics
    "Median age",
    # Politics
    "Voter turnout (%)",
    "% Labour",
    "% Conservative",
    "% Liberal Democrat",
    "% Reform UK",
    "% Green",
    "% Other parties",
]


# Column-name patterns for per-amenity POI metrics whose amenity label is not
# fixed in this file (checked by `_is_dynamic_poi_metric_column`).
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
# Canonical output column name for postcode tree density.
TREE_DENSITY_FEATURE = "Street tree density percentile"
# Fallback source-column pattern accepted by `_tree_density_by_postcode` when
# the canonical column is absent from the input.
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
    r"^Tree canopy density percentile within \d+m$"
)
_RENT_SOURCE_UNAVAILABLE_LADS = {
    # ONS PIPR does not publish LAD-level private-rent estimates for these
    # small authorities. Keep rent null there, but fail on any other LAD miss.
    "E06000053": "Isles of Scilly",
    "E09000001": "City of London",
}
# One run of digits; used to pull numbers out of address and listed-name keys.
_NUMBER_RE = re.compile(r"\d+")
# Filler words ignored when judging whether a numberless listed-building name
# has enough substantive tokens to match on (see `_is_matchable_listed_name`).
_LISTED_NAME_STOP_WORDS = {
    "A",
    "AN",
    "AND",
    "AT",
    "BY",
    "IN",
    "OF",
    "ON",
    "THE",
    "TO",
    "WITH",
}


def _is_dynamic_poi_metric_column(column: str) -> bool:
    """Tell whether *column* is named like a dynamic POI distance or count metric."""
    poi_patterns = (_DYNAMIC_POI_DISTANCE_RE, _DYNAMIC_POI_COUNT_RE)
    return any(pattern.match(column) for pattern in poi_patterns)


def _numbers_compatible(left: str, right: str) -> bool:
    """Check that the digit runs in two strings do not contradict each other.

    Two strings without any numbers are compatible. If only one side carries
    numbers they are not. Otherwise the side with fewer distinct numbers must
    have all of them present on the other side.
    """
    left_digits = set(_NUMBER_RE.findall(left))
    right_digits = set(_NUMBER_RE.findall(right))
    # Stable sort: on equal sizes the left set stays first.
    fewer, more = sorted((left_digits, right_digits), key=len)
    if not fewer:
        return not more
    return fewer <= more


def _listed_candidate_schema() -> dict[str, pl.DataType]:
    """Return the column -> dtype schema of the postcode/listed-entry candidate frame."""
    schema: dict[str, pl.DataType] = {}
    schema["postcode"] = pl.Utf8
    schema["_listed_match_name"] = pl.Utf8
    schema["_listed_grade"] = pl.Utf8
    schema["_listed_entry"] = pl.Int64
    return schema


def _empty_listed_candidates() -> pl.DataFrame:
    """Build a zero-row candidate frame with the listed-candidate schema."""
    candidate_schema = _listed_candidate_schema()
    return pl.DataFrame(schema=candidate_schema)


def _empty_listed_property_flags() -> pl.DataFrame:
    """Build a zero-row frame shaped like the listed-building flag output."""
    flag_schema = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        LISTED_BUILDING_FEATURE: pl.Utf8,
    }
    return pl.DataFrame(schema=flag_schema)


def _is_matchable_listed_name(name_key: str | None) -> bool:
    """Decide whether a listed-building name key is specific enough to match on.

    A key qualifies if it contains a number, or if it has at least two tokens
    that are three or more characters long and are not stop words.
    """
    if not name_key:
        return False
    if _NUMBER_RE.search(name_key) is not None:
        return True
    substantive = 0
    for word in name_key.split():
        if len(word) < 3 or word in _LISTED_NAME_STOP_WORDS:
            continue
        substantive += 1
        if substantive == 2:
            return True
    return False


def _load_listed_building_points(listed_buildings_path: Path) -> pl.DataFrame:
    """Read Historic England NHLE listed-building points as an attribute frame.

    Geometry itself is not read; positions come from the ``Easting`` and
    ``Northing`` attributes. Rows with a null name or coordinate, or whose
    normalised name key is null, are dropped. The key is exposed as
    ``_listed_match_name``.

    Raises:
        ValueError: if the layer's geometry type is not a point type, or an
            expected attribute column is absent.
    """
    dtypes = {
        "ListEntry": pl.Int64,
        "Name": pl.Utf8,
        "Grade": pl.Utf8,
        "Easting": pl.Float64,
        "Northing": pl.Float64,
    }
    columns = list(dtypes)

    layer_info = pyogrio.read_info(listed_buildings_path)
    geometry_type = str(layer_info.get("geometry_type") or "")
    if "Point" not in geometry_type:
        raise ValueError(
            f"Expected listed-building point data, got geometry {geometry_type!r}"
        )

    _meta, table = pyogrio.read_arrow(
        listed_buildings_path,
        columns=columns,
        read_geometry=False,
    )
    df = pl.from_arrow(table)
    missing = sorted(set(columns) - set(df.columns))
    if missing:
        raise ValueError(
            f"{listed_buildings_path} is missing listed-building columns: {missing}"
        )

    typed = df.select([pl.col(name).cast(dtype) for name, dtype in dtypes.items()])
    keyed = typed.drop_nulls(["Name", "Easting", "Northing"]).with_columns(
        normalize_address_key(pl.col("Name")).alias("_listed_match_name")
    )
    return keyed.filter(pl.col("_listed_match_name").is_not_null())


def _postcode_listed_building_candidates(
    listed_points: pl.DataFrame,
    active_postcodes: pl.DataFrame,
    *,
    nearest_postcodes: int = LISTED_BUILDING_NEAREST_POSTCODES,
    max_distance_m: float = LISTED_BUILDING_MATCH_RADIUS_M,
) -> pl.DataFrame:
    """Pair every matchable listed-building point with its nearby postcodes.

    Each point is offered to at most ``nearest_postcodes`` postcodes whose
    (``east1m``, ``north1m``) coordinate lies within ``max_distance_m`` of the
    point's (``Easting``, ``Northing``). Points whose name key fails
    `_is_matchable_listed_name` are ignored.

    Returns:
        A frame with the `_listed_candidate_schema` columns, de-duplicated on
        postcode, name key and list entry; empty when nothing qualifies.

    Raises:
        ValueError: if either input lacks a required column.
    """
    if listed_points.is_empty() or active_postcodes.is_empty():
        return _empty_listed_candidates()

    missing = sorted({"postcode", "east1m", "north1m"} - set(active_postcodes.columns))
    if missing:
        raise ValueError(f"Active postcode data missing required columns: {missing}")

    missing = sorted(
        {"_listed_match_name", "Grade", "ListEntry", "Easting", "Northing"}
        - set(listed_points.columns)
    )
    if missing:
        raise ValueError(f"Listed-building data missing required columns: {missing}")

    # Keep only rows with usable, finite coordinates on both sides.
    postcodes = active_postcodes.drop_nulls(["postcode", "east1m", "north1m"]).filter(
        pl.col("east1m").is_finite() & pl.col("north1m").is_finite()
    )
    listed = listed_points.drop_nulls(
        ["_listed_match_name", "Easting", "Northing"]
    ).filter(pl.col("Easting").is_finite() & pl.col("Northing").is_finite())
    if postcodes.is_empty() or listed.is_empty():
        return _empty_listed_candidates()

    postcode_count = postcodes.height
    postcode_xy = np.column_stack(
        [postcodes["east1m"].to_numpy(), postcodes["north1m"].to_numpy()]
    )
    listed_xy = np.column_stack(
        [listed["Easting"].to_numpy(), listed["Northing"].to_numpy()]
    )
    k = max(1, min(nearest_postcodes, postcode_count))
    distances, indices = cKDTree(postcode_xy).query(
        listed_xy,
        k=k,
        distance_upper_bound=max_distance_m,
    )
    if k == 1:
        # With k=1 the query result is one-dimensional; restore the neighbour axis.
        distances = distances[:, np.newaxis]
        indices = indices[:, np.newaxis]

    postcode_values = postcodes["postcode"].to_list()
    per_point = zip(
        listed["_listed_match_name"].to_list(),
        listed["Grade"].to_list(),
        listed["ListEntry"].to_list(),
        distances,
        indices,
    )

    rows: list[tuple[str, str, str | None, int | None]] = []
    for name_key, grade, entry, point_distances, point_indices in per_point:
        if not _is_matchable_listed_name(name_key):
            continue
        seen: set[str] = set()
        for distance, postcode_idx in zip(point_distances, point_indices):
            # Skip neighbour slots with a non-finite distance or an
            # out-of-range index (no postcode found within the radius).
            if not np.isfinite(distance) or postcode_idx >= postcode_count:
                continue
            postcode = postcode_values[int(postcode_idx)]
            if postcode in seen:
                continue
            seen.add(postcode)
            rows.append((postcode, name_key, grade, entry))

    if not rows:
        return _empty_listed_candidates()

    candidate_schema = _listed_candidate_schema()
    candidates = pl.DataFrame(rows, schema=list(candidate_schema), orient="row")
    return candidates.cast(candidate_schema).unique(
        ["postcode", "_listed_match_name", "_listed_entry"]
    )


def _matched_listed_building_flags(
    properties: pl.LazyFrame,
    listed_candidates: pl.DataFrame,
    *,
    min_score: int = LISTED_BUILDING_MIN_MATCH_SCORE,
) -> pl.DataFrame:
    """Flag properties whose address conservatively matches a listed-entry name.

    Only properties in postcodes that have listed candidates are considered.
    A property matches when either its normalised register address or its
    normalised EPC address is number-compatible with a candidate name in the
    same postcode and reaches ``min_score`` on the fuzzy token-set ratio.

    Returns:
        One row per matched (``postcode``, ``pp_address``) with the
        listed-building feature column set to ``"Yes"``; empty if none match.
    """
    if listed_candidates.is_empty():
        return _empty_listed_property_flags()

    postcodes_with_candidates = listed_candidates.select("postcode").unique().lazy()
    has_address_key = (
        pl.col("_pp_match_address").is_not_null()
        | pl.col("_epc_match_address").is_not_null()
    )
    property_candidates = (
        properties.select("postcode", "pp_address", "epc_address")
        .join(postcodes_with_candidates, on="postcode", how="semi")
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
            normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
        )
        .filter(pl.col("pp_address").is_not_null() & has_address_key)
        .collect(engine="streaming")
    )
    if property_candidates.is_empty():
        return _empty_listed_property_flags()

    # postcode -> every candidate listed name offered to that postcode
    listed_by_postcode: dict[str, list[str]] = {}
    name_pairs = listed_candidates.select("postcode", "_listed_match_name")
    for postcode, name in name_pairs.iter_rows():
        if postcode and name:
            listed_by_postcode.setdefault(postcode, []).append(name)

    matches: list[tuple[str, str, str]] = []
    for row in property_candidates.iter_rows(named=True):
        postcode = row["postcode"]
        listed_names = listed_by_postcode.get(postcode)
        if not listed_names:
            continue

        # Distinct non-empty address keys, register address first.
        raw_keys = (row.get("_pp_match_address"), row.get("_epc_match_address"))
        address_keys = list(dict.fromkeys(key for key in raw_keys if key))

        is_listed = any(
            _numbers_compatible(address_key, listed_name)
            and fuzz.token_set_ratio(address_key, listed_name) >= min_score
            for address_key in address_keys
            for listed_name in listed_names
        )
        if is_listed:
            matches.append((postcode, row["pp_address"], "Yes"))

    if not matches:
        return _empty_listed_property_flags()

    flag_schema = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        LISTED_BUILDING_FEATURE: pl.Utf8,
    }
    flags = pl.DataFrame(matches, schema=list(flag_schema), orient="row")
    return flags.cast(flag_schema).unique(["postcode", "pp_address"])


def _listed_building_flags(
    properties: pl.LazyFrame,
    active_postcodes: pl.DataFrame,
    listed_buildings_path: Path,
) -> pl.DataFrame:
    """Load listed-building points and return the matched property flags.

    Chains the load, postcode-candidate and address-matching steps, printing a
    progress line before or after each one.
    """
    print(f"Loading listed-building points from {listed_buildings_path}...")
    points_df = _load_listed_building_points(listed_buildings_path)
    print(f"Loaded {points_df.height} listed-building point records")

    candidates = _postcode_listed_building_candidates(points_df, active_postcodes)
    nearby_postcode_count = candidates["postcode"].n_unique()
    print(
        "Matching listed-building names to property addresses across "
        f"{nearby_postcode_count} nearby postcodes..."
    )

    matched_flags = _matched_listed_building_flags(properties, candidates)
    print(f"Matched {matched_flags.height} property addresses to listed-building entries")
    return matched_flags


def _normalise_crs(crs: object | None) -> str:
    return str(crs) if crs else "EPSG:4326"


def _geometry_column(metadata: dict, column_names: list[str]) -> str:
    geometry_name = metadata.get("geometry_name")
    if geometry_name:
        return str(geometry_name)
    for name in ("wkb_geometry", "geometry", "geom"):
        if name in column_names:
            return name
    return column_names[-1]


def _column_values(table, column: str, default: object = None) -> list[object]:
    """Return *column* of *table* as a Python list.

    When the table has no such column, a list of *default* repeated once per
    table row is returned instead.
    """
    if column in table.column_names:
        return table[column].combine_chunks().to_pylist()
    return [default for _ in range(table.num_rows)]


def _is_planning_conservation_area_record(dataset: object) -> bool:
    """Accept records with no dataset tag or with the conservation-area tag.

    The tag comparison ignores surrounding whitespace and letter case.
    """
    if dataset is None:
        return True
    tag = str(dataset).strip().casefold()
    return tag == PLANNING_DATA_CONSERVATION_AREA_DATASET


def _is_current_planning_record(end_date: object) -> bool:
    if end_date is None:
        return True
    if isinstance(end_date, str):
        return end_date.strip() == ""
    return False


def _load_conservation_area_geometries(
    conservation_areas_path: Path,
) -> tuple[list[BaseGeometry], str]:
    """Load current conservation-area polygons and the CRS they are stored in.

    Records are skipped, in this order of checks, when they belong to another
    dataset, carry an end date, have a missing or empty geometry, or are not a
    Polygon/MultiPolygon. A summary of skipped counts is printed if any record
    was skipped.

    Raises:
        ValueError: if no usable polygon geometry remains.
    """
    metadata, table = pyogrio.read_arrow(conservation_areas_path)
    geometry_name = _geometry_column(metadata, table.column_names)
    datasets = _column_values(table, "dataset")
    end_dates = _column_values(table, "end-date")
    decoded = from_wkb(table[geometry_name].combine_chunks().to_pylist())

    # Insertion order here is also the order used in the printed summary.
    skipped = {"other_dataset": 0, "ended": 0, "empty_geometry": 0, "non_polygon": 0}
    polygon_types = {"Polygon", "MultiPolygon"}
    geometries = []
    for dataset, end_date, geom in zip(datasets, end_dates, decoded, strict=True):
        if not _is_planning_conservation_area_record(dataset):
            skipped["other_dataset"] += 1
        elif not _is_current_planning_record(end_date):
            skipped["ended"] += 1
        elif geom is None or geom.is_empty:
            skipped["empty_geometry"] += 1
        elif geom.geom_type not in polygon_types:
            skipped["non_polygon"] += 1
        else:
            geometries.append(geom)

    if not geometries:
        raise ValueError(
            f"{conservation_areas_path} does not contain any usable polygon geometries"
        )
    if any(skipped.values()):
        summary = ", ".join(f"{reason}={count}" for reason, count in skipped.items())
        print(f"Skipped conservation-area records during load: {summary}")
    return geometries, _normalise_crs(metadata.get("crs"))


def _postcode_conservation_area_flags(
    postcodes: pl.DataFrame,
    conservation_geometries: list[BaseGeometry],
    conservation_crs: object | None,
    batch_size: int = 100_000,
) -> pl.DataFrame:
    """Flag each postcode "Yes"/"No" for lying inside a conservation area.

    Postcode points (``lon``, ``lat`` in EPSG:4326) are reprojected into the
    polygons' CRS and tested for intersection in batches of ``batch_size``.
    Postcodes without usable coordinates are reported as "No". A postcode with
    several points is "Yes" if any of them is inside.

    Raises:
        ValueError: if ``postcode``, ``lat`` or ``lon`` is missing.
    """
    missing = sorted({"postcode", "lat", "lon"} - set(postcodes.columns))
    if missing:
        raise ValueError(f"Postcode data missing required columns: {missing}")

    all_postcodes = postcodes.select("postcode").drop_nulls().unique()
    default_no = pl.lit("No").alias(CONSERVATION_AREA_FEATURE)

    valid_points = postcodes.select("postcode", "lat", "lon").drop_nulls()
    if valid_points.is_empty():
        return all_postcodes.with_columns(default_no)

    # drop_nulls leaves NaN/inf coordinates in place, so filter those separately.
    finite = np.isfinite(valid_points["lat"].to_numpy()) & np.isfinite(
        valid_points["lon"].to_numpy()
    )
    valid_points = valid_points.filter(pl.Series(finite))
    if valid_points.is_empty():
        return all_postcodes.with_columns(default_no)

    to_area_crs = Transformer.from_crs(
        "EPSG:4326", _normalise_crs(conservation_crs), always_xy=True
    )
    x, y = to_area_crs.transform(
        valid_points["lon"].to_numpy(), valid_points["lat"].to_numpy()
    )

    tree = STRtree(conservation_geometries)
    point_count = valid_points.height
    inside = np.zeros(point_count, dtype=bool)
    for start in range(0, point_count, batch_size):
        stop = min(start + batch_size, point_count)
        # First row of the result holds batch-relative indices of matched points.
        hit_points, _hit_polygons = tree.query(
            points(x[start:stop], y[start:stop]), predicate="intersects"
        )
        if hit_points.size > 0:
            inside[start + hit_points] = True

    per_postcode = (
        valid_points.select("postcode")
        .with_columns(pl.Series("_within_conservation_area", inside))
        .group_by("postcode")
        .agg(pl.col("_within_conservation_area").max())
    )
    yes_no = (
        pl.when(pl.col("_within_conservation_area"))
        .then(pl.lit("Yes"))
        .otherwise(pl.lit("No"))
        .alias(CONSERVATION_AREA_FEATURE)
    )
    matched = per_postcode.with_columns(yes_no).select(
        "postcode", CONSERVATION_AREA_FEATURE
    )

    joined = all_postcodes.join(matched, on="postcode", how="left")
    return joined.with_columns(
        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
    ).select("postcode", CONSERVATION_AREA_FEATURE)


def _conservation_area_by_postcode(
    postcodes: pl.LazyFrame,
    conservation_areas_path: Path,
) -> pl.LazyFrame:
    """Load conservation polygons and return lazy per-postcode membership flags.

    The postcode frame is collected to its ``postcode``/``lat``/``lon``
    columns before the membership test runs.
    """
    print(f"Loading conservation area polygons from {conservation_areas_path}...")
    geometries, crs = _load_conservation_area_geometries(conservation_areas_path)

    coordinate_frame = postcodes.select("postcode", "lat", "lon")
    postcode_points = coordinate_frame.collect(engine="streaming")
    print(
        "Computing conservation area membership for "
        f"{postcode_points.height} active English postcodes..."
    )

    flags = _postcode_conservation_area_flags(postcode_points, geometries, crs)
    return flags.lazy()


def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Build an expression turning an IoD score into a 0-100 less-deprived percentile.

    The lowest score maps to 100 and the highest to 0. Values in between are
    scaled from their descending average rank and rounded to one decimal.
    Nulls stay null. If there is at most one non-null value the result is 100.
    """
    score = pl.col(column)
    populated = score.count()
    rank_from_highest = score.rank("average", descending=True)
    scaled = ((rank_from_highest - 1) / (populated - 1) * 100).round(1)

    # Branch order matters: null first, then the exact extremes, then the scale.
    percentile = (
        pl.when(score.is_null())
        .then(None)
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        .when(populated > 1)
        .then(scaled)
        .otherwise(100.0)
    )
    return percentile.alias(column)


def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame:
    """Scan the tree-density parquet into a lazy ``postcode`` -> density frame.

    The density is read from the canonical tree-density column when present,
    otherwise from the single column matching the "within {radius}m" pattern.
    It is cast to Float32 and emitted under the canonical name. Rows with a
    null postcode are dropped and postcodes are de-duplicated.

    Raises:
        ValueError: if ``postcode`` is missing, or the density column cannot
            be identified unambiguously.
    """
    scan = pl.scan_parquet(tree_density_postcodes_path)
    available = set(scan.collect_schema().names())
    if "postcode" not in available:
        raise ValueError(
            f"{tree_density_postcodes_path} is missing required column: postcode"
        )

    density_column = TREE_DENSITY_FEATURE
    if density_column not in available:
        candidates = sorted(
            name
            for name in available
            if _POSTCODE_TREE_DENSITY_PERCENTILE_RE.match(name)
        )
        if len(candidates) != 1:
            raise ValueError(
                f'{tree_density_postcodes_path} must contain column "{TREE_DENSITY_FEATURE}" '
                'or exactly one "Tree canopy density percentile within {radius}m" column; '
                f"found {len(candidates)} postcode percentile columns"
            )
        (density_column,) = candidates

    density = pl.col(density_column).cast(pl.Float32).alias(TREE_DENSITY_FEATURE)
    selected = scan.select(pl.col("postcode"), density)
    return selected.drop_nulls(["postcode"]).unique(["postcode"])


def _validate_lad_source_coverage(
    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
) -> None:
    """Check that ethnicity and rent sources cover every LAD in the IoD file.

    The IoD parquet's 2024 LAD codes are the reference set. LADs listed in
    `_RENT_SOURCE_UNAVAILABLE_LADS` may be absent from the rental data; those
    are only reported with a printed notice.

    Raises:
        ValueError: if the ethnicity data misses any LAD, or the rental data
            misses a LAD outside the allowed exceptions.
    """
    code_column = "Local Authority District code (2024)"
    name_column = "Local Authority District name (2024)"
    iod_lads = (
        pl.read_parquet(iod_path, columns=[code_column, name_column])
        .rename({code_column: "lad", name_column: "lad_name"})
        .unique(["lad"])
    )

    ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"])
    ethnicity_lads = ethnicity_lads.rename({"Geography_code": "lad"})
    missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
    if missing_ethnicity.height > 0:
        raise ValueError(
            "Ethnicity data is missing 2024 LAD coverage: "
            f"{missing_ethnicity.to_dicts()}"
        )

    rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"])
    rental_lads = rental_lads.rename({"area_code": "lad"})
    missing_rent = iod_lads.join(rental_lads, on="lad", how="anti").sort("lad")

    is_known_gap = pl.col("lad").is_in(list(_RENT_SOURCE_UNAVAILABLE_LADS))
    unexpected_missing_rent = missing_rent.filter(~is_known_gap)
    if unexpected_missing_rent.height > 0:
        raise ValueError(
            "Rental data is missing 2024 LAD coverage: "
            f"{unexpected_missing_rent.to_dicts()}"
        )
    if missing_rent.height > 0:
        # Only the permitted source gaps remain at this point.
        print(
            "PIPR has no LAD-level rent estimates for source-unavailable LADs; "
            f"rent will remain null there: {missing_rent.to_dicts()}"
        )


def _validate_property_postcodes(df: pl.DataFrame) -> None:
    """Fail if any row has a null or blank ``Postcode``.

    Raises:
        ValueError: with the offending row count and a sample of up to ten
            rows, limited to whichever identifying columns are present.
    """
    postcode = pl.col("Postcode")
    is_blank = postcode.cast(pl.Utf8).str.strip_chars() == ""
    invalid = df.filter(postcode.is_null() | is_blank)
    if invalid.height > 0:
        identifying = ("Postcode", "Address per Property Register", "Last known price")
        sample_cols = [name for name in identifying if name in invalid.columns]
        sample = invalid.select(sample_cols).head(10).to_dicts()
        raise ValueError(
            "Property rows missing a postcode after merge: "
            f"{invalid.height} rows. Sample: {sample}"
        )


# Map listings-parquet source columns to the `_actual_*` overlay columns
# carried alongside the wide frame through the postcode-keyed joins. After the
# rest of the pipeline finalises, listing rows pick their canonical dashboard
# values from these overlays in `_finalize_listings`.
# Each entry is (source column, overlay column, dtype the source is cast to).
_LISTING_OVERLAY_SOURCES: tuple[tuple[str, str, pl.DataType], ...] = (
    ("Listing URL", "_actual_listing_url", pl.Utf8),
    ("Asking price", "_actual_asking_price", pl.Int64),
    ("Asking price per sqm", "_actual_asking_price_per_sqm", pl.Int32),
    ("Listing date", "_actual_listing_date", pl.Datetime("us")),
    ("Listing status", "_actual_listing_status", pl.Utf8),
    ("Listing features", "_actual_listing_features", pl.List(pl.Utf8)),
    ("Bedrooms", "_actual_bedrooms", pl.Int32),
    ("Bathrooms", "_actual_bathrooms", pl.Int32),
    ("Price qualifier", "_actual_price_qualifier", pl.Utf8),
    ("Property sub-type", "_actual_property_sub_type", pl.Utf8),
    ("lat", "_actual_lat", pl.Float64),
    ("lon", "_actual_lon", pl.Float64),
    # Seeds for the wide row that an unmatched listing produces.
    ("Total floor area (sqm)", "_actual_total_floor_area", pl.Float64),
    ("Number of bedrooms & living rooms", "_actual_number_habitable_rooms", pl.Int16),
    ("Property type", "_actual_property_type", pl.Utf8),
    ("Leasehold/Freehold", "_actual_leasehold_freehold", pl.Utf8),
)
# NOTE(review): the name suggests a non-null listing URL is what marks a row as
# a listing; the consumer is outside this part of the file — confirm there.
_LISTING_FLAG_COLUMN = "_actual_listing_url"
# Closed value sets for categorical columns.
# NOTE(review): their consumers are outside the visible part of this file.
_TENURE_VALUES = ["Freehold", "Leasehold"]
_PROPERTY_TYPE_VALUES = [
    "Detached",
    "Semi-Detached",
    "Terraced",
    "Flats/Maisonettes",
    "Other",
]
_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
# Matching thresholds. NOTE(review): none of these are used in the visible part
# of the file. Going by the names, the *_WITH_NUMBERS / *_WITHOUT_NUMBERS pairs
# are minimum scores depending on whether the address contains a number, and
# *_MIN_MARGIN is the required lead over the runner-up — confirm at the use sites.
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96
_PROPERTY_MATCH_MIN_MARGIN = 4.0
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0
# Presumably the search radius (metres) and neighbour count for looking up EPC
# candidates in nearby postcodes — TODO confirm against the direct-EPC matcher.
_DIRECT_EPC_NEARBY_RADIUS_M = 500.0
_DIRECT_EPC_NEAREST_POSTCODES = 40
# (column name, dtype) for every `_direct_*` column of the direct-EPC match
# output; `_ensure_direct_epc_columns` adds any that are missing as typed nulls
# and `_direct_epc_match_schema` builds its schema from this tuple.
_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
    ("_direct_epc_address", pl.Utf8),
    ("_direct_current_energy_rating", pl.Utf8),
    ("_direct_potential_energy_rating", pl.Utf8),
    ("_direct_total_floor_area", pl.Float64),
    ("_direct_number_habitable_rooms", pl.Int16),
    ("_direct_floor_height", pl.Float64),
    ("_direct_construction_age_band", pl.UInt16),
    ("_direct_is_construction_date_approximate", pl.UInt8),
    ("_direct_was_council_house", pl.Utf8),
    ("_direct_epc_match_status", pl.Utf8),
    ("_direct_epc_match_score", pl.Float32),
    ("_direct_epc_match_margin", pl.Float32),
)
# Raw EPC column name -> corresponding `_direct_*` column name.
# NOTE(review): the rename that uses this map is outside the visible part of
# the file.
_DIRECT_EPC_RAW_COLUMN_MAP = {
    "epc_address": "_direct_epc_address",
    "current_energy_rating": "_direct_current_energy_rating",
    "potential_energy_rating": "_direct_potential_energy_rating",
    "total_floor_area": "_direct_total_floor_area",
    "number_habitable_rooms": "_direct_number_habitable_rooms",
    "floor_height": "_direct_floor_height",
    "construction_age_band": "_direct_construction_age_band",
    "is_construction_date_approximate": "_direct_is_construction_date_approximate",
    "was_council_house": "_direct_was_council_house",
}


def _canonical_postcode_expr(column: str) -> pl.Expr:
    """Build an expression rewriting *column* as an NSPL `pcds`-style postcode.

    The value is upper-cased and stripped of everything but A-Z and 0-9. If the
    result looks like a full postcode, a single space is inserted before the
    final digit-letter-letter group (e.g. `AB1 2CD`); otherwise it is null.
    """
    upper = pl.col(column).cast(pl.Utf8).str.to_uppercase()
    compact = upper.str.replace_all(r"[^A-Z0-9]+", "").str.strip_chars()
    looks_like_postcode = compact.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")
    spaced = compact.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")
    return pl.when(looks_like_postcode).then(spaced).otherwise(None)


def _postcode_outcode_expr(column: str) -> pl.Expr:
    """Build an expression extracting the outward code from *column*'s postcode key."""
    postcode_key = normalize_postcode_key(pl.col(column))
    # Capture group 1 is everything before the trailing digit-letter-letter.
    return postcode_key.str.extract(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1)


def _canonical_epc_property_type_expr() -> pl.Expr:
    """Build an expression collapsing EPC property type and built form.

    For houses with a recorded built form, the built form is used; otherwise
    the EPC property type is used. The chosen label is then remapped: flats and
    maisonettes share one label, every terrace variant becomes "Terraced", and
    bungalows, park homes and plain "House" become "Other". Rows without an
    EPC property type yield null.
    """
    label_map = {
        "Flat": "Flats/Maisonettes",
        "Maisonette": "Flats/Maisonettes",
        "End-Terrace": "Terraced",
        "Mid-Terrace": "Terraced",
        "Enclosed End-Terrace": "Terraced",
        "Enclosed Mid-Terrace": "Terraced",
        "Bungalow": "Other",
        "Park home": "Other",
        "House": "Other",
    }
    built_form = pl.col("built_form")
    epc_type = pl.col("epc_property_type")

    bad_built_form = built_form.is_null() | built_form.is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = epc_type.is_not_null()
    is_house = epc_type == "House"

    raw_label = (
        pl.when(has_epc & is_house & ~bad_built_form)
        .then(built_form)
        .when(has_epc)
        .then(epc_type)
        .otherwise(None)
    )
    return raw_label.replace(label_map)


def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Build an expression pulling the first four-digit year out of an age band.

    The "England and Wales: " prefix and " onwards" suffix are removed first.
    The year is cast to UInt16 non-strictly.
    """
    band = pl.col(column).cast(pl.Utf8)
    trimmed = band.str.replace("England and Wales: ", "").str.replace(" onwards", "")
    first_year = trimmed.str.extract(r"(\d{4})", 1)
    return first_year.cast(pl.UInt16, strict=False)


def _ratio_bonus(
    left: float | int | None, right: float | int | None, pct: float, cap: float
) -> float:
    if left is None or right is None:
        return 0.0
    try:
        left_f = float(left)
        right_f = float(right)
    except (TypeError, ValueError):
        return 0.0
    if left_f <= 0 or right_f <= 0:
        return 0.0
    rel = abs(left_f - right_f) / max(left_f, right_f)
    if rel > pct:
        return 0.0
    return cap * (1.0 - rel / pct)


def _rooms_bonus(left: int | None, right: int | None) -> float:
    if left is None or right is None:
        return 0.0
    try:
        diff = abs(int(left) - int(right))
    except (TypeError, ValueError):
        return 0.0
    if diff == 0:
        return 4.0
    if diff == 1:
        return 2.0
    return 0.0


def _enum_bonus(
    left: str | None, right: str | None, *, exact: float, mismatch: float
) -> float:
    if not left or not right:
        return 0.0
    return exact if left == right else mismatch


def _address_score(query: str, candidate: str | None) -> int:
    """Return the better of the token-set and token-sort fuzzy scores.

    A missing or empty *candidate* scores 0.
    """
    if not candidate:
        return 0
    set_score = fuzz.token_set_ratio(query, candidate)
    sort_score = fuzz.token_sort_ratio(query, candidate)
    return set_score if set_score >= sort_score else sort_score


def _has_number(address: str | None) -> bool:
    """Tell whether *address* is non-empty and contains at least one digit run."""
    if not address:
        return False
    return _NUMBER_RE.search(address) is not None


def _load_listings_for_merge(
    listings_path: Path, arcgis_path: Path
) -> pl.DataFrame:
    """Load the listings parquet in the shape the wide-frame merge expects.

    The result has one row per listing, identified by `_listing_idx`, with:
      * `postcode` — the mapped successor postcode when the canonical postcode
        appears as an `old_postcode` in the mapping, else the canonical (NSPL
        `pcds`) form, else the raw `Postcode` value;
      * `pp_address` — the raw "Address per Property Register" value;
      * every `_actual_*` overlay column named in `_LISTING_OVERLAY_SOURCES`.
    """
    raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
    postcode_mapping = build_postcode_mapping(arcgis_path).lazy()

    # Float overlays can carry NaN (e.g. floor area). Polars keeps NaN distinct
    # from null, and the later `latest_price / total_floor_area` cast to Int32
    # fails on NaN, so float overlays have NaN turned into null here.
    float_dtypes = (pl.Float32, pl.Float64)
    overlay_exprs: list[pl.Expr] = []
    overlay_names: list[str] = []
    for src, dst, dtype in _LISTING_OVERLAY_SOURCES:
        casted = pl.col(src).cast(dtype, strict=False)
        if dtype in float_dtypes:
            casted = casted.fill_nan(None)
        overlay_exprs.append(casted.alias(dst))
        overlay_names.append(dst)

    canonical = raw.with_columns(
        _canonical_postcode_expr("Postcode").alias("_canonical_postcode"),
    )
    remapped = canonical.join(
        postcode_mapping,
        left_on="_canonical_postcode",
        right_on="old_postcode",
        how="left",
    )
    prepared = remapped.with_columns(
        pl.coalesce("new_postcode", "_canonical_postcode", "Postcode").alias(
            "postcode"
        ),
        pl.col("Address per Property Register").alias("pp_address"),
        *overlay_exprs,
    )
    return prepared.select(
        "_listing_idx",
        "postcode",
        "pp_address",
        *overlay_names,
    ).collect(engine="streaming")


def _ensure_direct_epc_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Return `df` with every `_DIRECT_EPC_COLUMNS` column present.

    Columns that are absent are added as typed all-null columns; columns that
    already exist are left untouched. When nothing is missing the input frame
    is returned unchanged.
    """
    present = set(df.columns)
    null_fillers = []
    for name, dtype in _DIRECT_EPC_COLUMNS:
        if name in present:
            continue
        null_fillers.append(pl.lit(None, dtype=dtype).alias(name))
    return df.with_columns(null_fillers) if null_fillers else df


def _direct_epc_match_schema() -> dict[str, pl.DataType]:
    """Schema of a direct-EPC match frame: `_listing_idx` then `_DIRECT_EPC_COLUMNS`."""
    schema: dict[str, pl.DataType] = {"_listing_idx": pl.UInt32}
    for name, dtype in _DIRECT_EPC_COLUMNS:
        schema[name] = dtype
    return schema


def _empty_direct_epc_matches() -> pl.DataFrame:
    """Zero-row frame carrying the direct-EPC match schema."""
    match_schema = _direct_epc_match_schema()
    return pl.DataFrame(schema=match_schema)


def _load_direct_epc_candidates(
    epc_path: Path,
    arcgis_path: Path,
    listing_outcodes: list[str],
    temp_dir: Path,
) -> pl.DataFrame:
    """Load one EPC candidate row per (address, postcode) for the given outcodes.

    Certificates are scanned via `_scan_epc_certificates`, keyed by their
    normalised address and postcode, restricted to `listing_outcodes`, and
    reduced to the certificate with the latest `inspection_date` per key.
    Each candidate also carries the postcode's `east1m`/`north1m` from the
    ArcGIS parquet and a `_direct_was_council_house` flag that is "Yes" when
    any certificate for the same key has a tenure containing "social"
    (case-insensitive), otherwise "No".

    Args:
        epc_path: EPC certificates source, forwarded to `_scan_epc_certificates`.
        arcgis_path: Parquet providing `pcds`, `east1m` and `north1m`.
        listing_outcodes: Outcodes to keep; an empty list short-circuits to an
            empty, typed frame.
        temp_dir: Scratch directory forwarded to `_scan_epc_certificates`.

    Returns:
        A collected frame with a `_direct_epc_row` index, the match keys,
        coordinates and the `_direct_*` attribute columns.
    """
    schema = {
        "_direct_epc_row": pl.UInt32,
        "_direct_epc_match_address": pl.Utf8,
        "_direct_epc_match_postcode": pl.Utf8,
        "_direct_epc_outcode": pl.Utf8,
        "_direct_epc_canonical_property_type": pl.Utf8,
        "_direct_epc_east": pl.Float64,
        "_direct_epc_north": pl.Float64,
        **{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
    }
    if not listing_outcodes:
        return pl.DataFrame(schema=schema)

    epc_base = (
        _scan_epc_certificates(epc_path, temp_dir)
        .with_columns(
            normalize_address_key(pl.col("epc_address")).alias(
                "_direct_epc_match_address"
            ),
            normalize_postcode_key(pl.col("epc_postcode")).alias(
                "_direct_epc_match_postcode"
            ),
        )
        .with_columns(
            # Outcode = everything before the trailing digit + two letters of
            # the normalised postcode key.
            pl.col("_direct_epc_match_postcode")
            .str.extract(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1)
            .alias("_direct_epc_outcode")
        )
        .filter(pl.col("_direct_epc_outcode").is_in(listing_outcodes))
        .filter(pl.col("_direct_epc_match_address").is_not_null())
        .filter(pl.col("_direct_epc_match_postcode").is_not_null())
    )

    # Computed over *all* certificates for the key, not just the latest one.
    social_tenure = (
        epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select("_direct_epc_match_address", "_direct_epc_match_postcode")
        .unique()
        .with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
    )

    arcgis = pl.scan_parquet(arcgis_path).select(
        normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"),
        pl.col("east1m").alias("_direct_epc_east"),
        pl.col("north1m").alias("_direct_epc_north"),
    )

    return (
        # `nulls_last=True`: without it a certificate with a null
        # `inspection_date` sorts first and would be picked as the "latest"
        # one ahead of every dated certificate for the same address.
        epc_base.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
        .first()
        .join(
            social_tenure,
            on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
            how="left",
        )
        .join(arcgis, on="_direct_epc_match_postcode", how="left")
        .with_columns(
            _canonical_epc_property_type_expr().alias(
                "_direct_epc_canonical_property_type"
            ),
            _construction_year_expr().alias("_direct_construction_age_band"),
            # Ratings outside `_EPC_RATING_VALUES` are treated as missing.
            pl.when(pl.col("current_energy_rating").is_in(_EPC_RATING_VALUES))
            .then(pl.col("current_energy_rating"))
            .otherwise(None)
            .alias("_direct_current_energy_rating"),
            pl.when(pl.col("potential_energy_rating").is_in(_EPC_RATING_VALUES))
            .then(pl.col("potential_energy_rating"))
            .otherwise(None)
            .alias("_direct_potential_energy_rating"),
            pl.col("epc_address").alias("_direct_epc_address"),
            pl.col("total_floor_area").alias("_direct_total_floor_area"),
            pl.col("number_habitable_rooms").alias(
                "_direct_number_habitable_rooms"
            ),
            pl.col("floor_height").alias("_direct_floor_height"),
            # Keys absent from `social_tenure` come out of the left join as null.
            pl.col("_direct_was_council_house").fill_null("No"),
        )
        .with_columns(
            # Flag is 1 whenever an age band was derived, null otherwise.
            pl.when(pl.col("_direct_construction_age_band").is_not_null())
            .then(pl.lit(1, dtype=pl.UInt8))
            .otherwise(pl.lit(None, dtype=pl.UInt8))
            .alias("_direct_is_construction_date_approximate")
        )
        .with_row_index("_direct_epc_row")
        .select(
            "_direct_epc_row",
            "_direct_epc_match_address",
            "_direct_epc_match_postcode",
            "_direct_epc_outcode",
            "_direct_epc_canonical_property_type",
            "_direct_epc_east",
            "_direct_epc_north",
            "_direct_epc_address",
            "_direct_current_energy_rating",
            "_direct_potential_energy_rating",
            "_direct_total_floor_area",
            "_direct_number_habitable_rooms",
            "_direct_floor_height",
            "_direct_construction_age_band",
            "_direct_is_construction_date_approximate",
            "_direct_was_council_house",
        )
        .collect(engine="streaming")
    )


def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
    """Add the match keys and projected coordinates used for listing matching.

    Adds `_listing_match_address`, `_listing_match_postcode`, the outcode
    extracted from the postcode key, and `_listing_east` / `_listing_north`
    obtained by transforming `_actual_lon` / `_actual_lat` from EPSG:4326 to
    EPSG:27700. An empty input gets empty Float64 coordinate columns without
    building a transformer.
    """
    outcode_expr = (
        pl.col("_listing_match_postcode")
        .str.extract(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1)
        .alias("_listing_outcode")
    )
    keyed = listings.with_columns(
        normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
        normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
    ).with_columns(outcode_expr)

    if keyed.is_empty():
        eastings, northings = [], []
    else:
        # always_xy=True: inputs are passed as (lon, lat).
        to_grid = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
        eastings, northings = to_grid.transform(
            keyed["_actual_lon"].to_numpy(), keyed["_actual_lat"].to_numpy()
        )

    return keyed.with_columns(
        pl.Series("_listing_east", eastings, dtype=pl.Float64),
        pl.Series("_listing_north", northings, dtype=pl.Float64),
    )


def _optional_lazy_col(
    schema: pl.Schema, column: str, dtype: pl.DataType
) -> pl.Expr:
    """Expression for `column` cast to `dtype`, or a typed null if it is absent.

    The cast is non-strict, so values that cannot be converted become null
    instead of raising.
    """
    if column not in schema:
        return pl.lit(None, dtype=dtype).alias(column)
    return pl.col(column).cast(dtype, strict=False).alias(column)


def _listing_property_match_schema() -> dict[str, pl.DataType]:
    """Column names and dtypes of the listing-to-property match frame."""
    fields = (
        ("_listing_idx", pl.UInt32),
        ("_matched_postcode", pl.Utf8),
        ("_matched_pp_address", pl.Utf8),
        ("_property_match_score", pl.Float32),
        ("_property_match_address_score", pl.Int32),
        ("_property_match_margin", pl.Float32),
        ("_property_match_field", pl.Utf8),
    )
    return dict(fields)


def _empty_listing_property_matches() -> pl.DataFrame:
    """Zero-row frame carrying the listing-to-property match schema."""
    match_schema = _listing_property_match_schema()
    return pl.DataFrame(schema=match_schema)


def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
    """Collect the slim set of wide-frame columns needed to match listings.

    Keeps postcode, register address and (when present in `wide`) the EPC
    address plus the attributes used for score bonuses, adds normalised match
    keys, and drops rows that have no register address, no postcode key, or
    neither a register nor an EPC address key. `_property_row` numbers rows
    before that filtering.
    """
    wide_schema = wide.collect_schema()
    optional_columns = [
        ("epc_address", pl.Utf8),
        ("pp_property_type", pl.Utf8),
        ("duration", pl.Utf8),
        ("total_floor_area", pl.Float64),
        ("number_habitable_rooms", pl.Int16),
        ("latest_price", pl.Int64),
    ]
    projected = wide.select(
        pl.col("postcode").cast(pl.Utf8).alias("postcode"),
        pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
        *[
            _optional_lazy_col(wide_schema, name, dtype)
            for name, dtype in optional_columns
        ],
    ).with_row_index("_property_row")

    keyed = projected.with_columns(
        normalize_postcode_key(pl.col("postcode")).alias("_property_match_postcode"),
        normalize_address_key(pl.col("pp_address")).alias("_property_match_address"),
        normalize_address_key(pl.col("epc_address")).alias(
            "_property_epc_match_address"
        ),
    )

    has_any_address_key = (
        pl.col("_property_match_address").is_not_null()
        | pl.col("_property_epc_match_address").is_not_null()
    )
    usable = keyed.filter(
        pl.col("pp_address").is_not_null(),
        pl.col("_property_match_postcode").is_not_null(),
        has_any_address_key,
    )
    return usable.collect(engine="streaming")


def _property_candidates_by_postcode(
    candidates: pl.DataFrame,
) -> dict[str, list[dict]]:
    """Group candidate rows (as dicts) by `_property_match_postcode`.

    Rows with a missing or empty postcode key are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_property_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped


def _best_listing_property_candidate(
    listing: dict, candidates: list[dict]
) -> dict | None:
    query = listing.get("_listing_match_address")
    if not query:
        return None

    listing_has_numbers = _has_number(query)
    scored: list[tuple[float, int, dict, str]] = []
    for candidate in candidates:
        register_address = candidate.get("_property_match_address")
        epc_address = candidate.get("_property_epc_match_address")
        register_numbers_compatible = bool(
            register_address and _numbers_compatible(query, register_address)
        )
        epc_numbers_compatible = bool(
            epc_address and _numbers_compatible(query, epc_address)
        )
        if not (register_numbers_compatible or epc_numbers_compatible):
            continue

        register_score = _address_score(query, register_address)
        epc_score = _address_score(query, epc_address)
        base_score = max(register_score, epc_score)
        if base_score == 0:
            continue

        score = float(base_score)
        score += _enum_bonus(
            listing.get("_actual_property_type"),
            candidate.get("pp_property_type"),
            exact=7.0,
            mismatch=-8.0,
        )
        score += _enum_bonus(
            listing.get("_actual_leasehold_freehold"),
            candidate.get("duration"),
            exact=3.0,
            mismatch=-3.0,
        )
        score += _ratio_bonus(
            listing.get("_actual_total_floor_area"),
            candidate.get("total_floor_area"),
            pct=0.15,
            cap=8.0,
        )
        score += _rooms_bonus(
            listing.get("_actual_number_habitable_rooms"),
            candidate.get("number_habitable_rooms"),
        )
        score += _ratio_bonus(
            listing.get("_actual_asking_price"),
            candidate.get("latest_price"),
            pct=0.25,
            cap=3.0,
        )
        matched_field = (
            "pp_address" if register_score >= epc_score else "epc_address"
        )
        scored.append((score, base_score, candidate, matched_field))

    if not scored:
        return None
    scored.sort(key=lambda item: item[0], reverse=True)
    top = scored[0]
    runner_up = scored[1][0] if len(scored) > 1 else None
    margin = top[0] - runner_up if runner_up is not None else top[0]
    score_threshold = (
        _PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
        if listing_has_numbers
        else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    )
    address_threshold = (
        _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS
        if listing_has_numbers
        else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS
    )
    if (
        top[0] < score_threshold
        or top[1] < address_threshold
        or margin < _PROPERTY_MATCH_MIN_MARGIN
    ):
        return None

    candidate = top[2]
    return {
        "_listing_idx": listing["_listing_idx"],
        "_matched_postcode": candidate.get("postcode"),
        "_matched_pp_address": candidate.get("pp_address"),
        "_property_match_score": round(top[0], 1),
        "_property_match_address_score": top[1],
        "_property_match_margin": round(margin, 1),
        "_property_match_field": top[3],
    }


def _match_listing_properties(
    listing_matches: pl.DataFrame, property_candidates: pl.DataFrame
) -> pl.DataFrame:
    """Match each listing to at most one wide-frame row within its postcode.

    Listings are only compared against candidates sharing their normalised
    postcode key. When several listings select the same
    (`_matched_postcode`, `_matched_pp_address`) row, the one with the highest
    match score keeps it (lowest `_listing_idx` on ties). The result is sorted
    by `_listing_idx`; an empty, typed frame is returned when nothing matches.
    """
    if listing_matches.is_empty() or property_candidates.is_empty():
        return _empty_listing_property_matches()

    by_postcode = _property_candidates_by_postcode(property_candidates)
    winners: list[dict] = []
    for row in listing_matches.iter_rows(named=True):
        key = row.get("_listing_match_postcode")
        if key:
            hit = _best_listing_property_candidate(row, by_postcode.get(key, []))
            if hit is not None:
                winners.append(hit)

    if not winners:
        return _empty_listing_property_matches()

    frame = pl.DataFrame(winners, schema=_listing_property_match_schema())
    best_first = frame.sort(
        ["_property_match_score", "_listing_idx"], descending=[True, False]
    )
    one_listing_per_property = best_first.unique(
        ["_matched_postcode", "_matched_pp_address"],
        keep="first",
        maintain_order=True,
    )
    return one_listing_per_property.sort("_listing_idx")


def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
    """Group EPC candidate rows (as dicts) by `_direct_epc_match_postcode`.

    Rows with a missing or empty postcode key are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_direct_epc_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped


def _epc_postcode_tree(
    candidates: pl.DataFrame,
) -> tuple[cKDTree | None, list[str]]:
    """Build a KD-tree over one (east, north) point per candidate postcode.

    Only postcodes with non-null, finite coordinates are indexed. Returns the
    tree together with the postcode labels in the same order as the tree's
    points, or `(None, [])` when no postcode has usable coordinates.
    """
    located = candidates.select(
        "_direct_epc_match_postcode",
        "_direct_epc_east",
        "_direct_epc_north",
    ).drop_nulls()
    finite_xy = (
        pl.col("_direct_epc_east").is_finite()
        & pl.col("_direct_epc_north").is_finite()
    )
    per_postcode = located.filter(finite_xy).unique("_direct_epc_match_postcode")
    if per_postcode.is_empty():
        return None, []

    # Labels and coordinates come from the same frame, so indices line up.
    labels = per_postcode["_direct_epc_match_postcode"].to_list()
    xy = np.column_stack(
        [
            per_postcode["_direct_epc_east"].to_numpy(),
            per_postcode["_direct_epc_north"].to_numpy(),
        ]
    )
    return cKDTree(xy), labels


def _candidate_postcodes_for_listing(
    listing: dict,
    postcode_tree: cKDTree | None,
    postcode_values: list[str],
) -> list[str]:
    postcodes: list[str] = []
    exact = listing.get("_listing_match_postcode")
    if exact:
        postcodes.append(exact)

    if postcode_tree is None:
        return postcodes

    east = listing.get("_listing_east")
    north = listing.get("_listing_north")
    try:
        east_f = float(east)
        north_f = float(north)
    except (TypeError, ValueError):
        return postcodes
    if not np.isfinite(east_f) or not np.isfinite(north_f):
        return postcodes

    k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values))
    distances, indices = postcode_tree.query(
        [east_f, north_f],
        k=k,
        distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M,
    )
    distances = np.atleast_1d(distances)
    indices = np.atleast_1d(indices)
    seen = set(postcodes)
    for distance, idx in zip(distances, indices, strict=False):
        if not np.isfinite(distance) or idx >= len(postcode_values):
            continue
        postcode = postcode_values[int(idx)]
        if postcode not in seen:
            postcodes.append(postcode)
            seen.add(postcode)
    return postcodes


def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
    query = listing.get("_listing_match_address")
    if not query:
        return None

    listing_has_numbers = _has_number(query)
    scored: list[tuple[float, int, dict]] = []
    for candidate in candidates:
        address = candidate.get("_direct_epc_match_address")
        if listing_has_numbers and not _numbers_compatible(query, address or ""):
            continue
        base_score = _address_score(query, address)
        if base_score == 0:
            continue

        score = float(base_score)
        score += _enum_bonus(
            listing.get("_actual_property_type"),
            candidate.get("_direct_epc_canonical_property_type"),
            exact=6.0,
            mismatch=-6.0,
        )
        score += _ratio_bonus(
            listing.get("_actual_total_floor_area"),
            candidate.get("_direct_total_floor_area"),
            pct=0.12,
            cap=8.0,
        )
        score += _rooms_bonus(
            listing.get("_actual_number_habitable_rooms"),
            candidate.get("_direct_number_habitable_rooms"),
        )
        scored.append((score, base_score, candidate))

    if not scored:
        return None
    scored.sort(key=lambda item: item[0], reverse=True)
    top = scored[0]
    runner_up = scored[1][0] if len(scored) > 1 else None
    margin = top[0] - runner_up if runner_up is not None else top[0]
    threshold = (
        _DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS
        if listing_has_numbers
        else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    )
    if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN:
        return None

    candidate = top[2]
    return {
        "_listing_idx": listing["_listing_idx"],
        "_direct_epc_address": candidate.get("_direct_epc_address"),
        "_direct_current_energy_rating": candidate.get("_direct_current_energy_rating"),
        "_direct_potential_energy_rating": candidate.get(
            "_direct_potential_energy_rating"
        ),
        "_direct_total_floor_area": candidate.get("_direct_total_floor_area"),
        "_direct_number_habitable_rooms": candidate.get(
            "_direct_number_habitable_rooms"
        ),
        "_direct_floor_height": candidate.get("_direct_floor_height"),
        "_direct_construction_age_band": candidate.get("_direct_construction_age_band"),
        "_direct_is_construction_date_approximate": candidate.get(
            "_direct_is_construction_date_approximate"
        ),
        "_direct_was_council_house": candidate.get("_direct_was_council_house"),
        "_direct_epc_match_status": "matched",
        "_direct_epc_match_score": round(top[0], 1),
        "_direct_epc_match_margin": round(margin, 1),
    }


def _match_direct_epc(
    listing_matches: pl.DataFrame, epc_candidates: pl.DataFrame
) -> pl.DataFrame:
    """Match every listing directly against EPC candidates in and near its postcode.

    For each listing the candidate pool is the union of the EPC rows filed
    under the postcodes from `_candidate_postcodes_for_listing`, de-duplicated
    by `_direct_epc_row`. The best candidate (if any) per listing is collected
    into a frame with the `_direct_epc_match_schema`; an empty typed frame is
    returned when either input is empty or nothing matches.
    """
    if listing_matches.is_empty() or epc_candidates.is_empty():
        return _empty_direct_epc_matches()

    by_postcode = _epc_candidates_by_postcode(epc_candidates)
    tree, tree_labels = _epc_postcode_tree(epc_candidates)

    found: list[dict] = []
    for listing in listing_matches.iter_rows(named=True):
        pool: list[dict] = []
        used_rows: set[int] = set()
        for code in _candidate_postcodes_for_listing(listing, tree, tree_labels):
            for epc_row in by_postcode.get(code, []):
                row_id = epc_row.get("_direct_epc_row")
                if row_id not in used_rows:
                    pool.append(epc_row)
                    # Rows without an id cannot be de-duplicated; keep them all.
                    if row_id is not None:
                        used_rows.add(row_id)
        best = _best_direct_epc_candidate(listing, pool)
        if best is not None:
            found.append(best)

    if found:
        return pl.DataFrame(found, schema=_direct_epc_match_schema())
    return _empty_direct_epc_matches()


def _enrich_listings_with_direct_epc(
    listings: pl.DataFrame,
    epc_path: Path | None,
    arcgis_path: Path,
) -> pl.DataFrame:
    """Attach directly-matched EPC attributes (`_direct_*` columns) to listings.

    The result always contains every `_DIRECT_EPC_COLUMNS` column; they are
    all-null when no EPC path is given, no listing has an outcode, or no
    match is found. Candidate and match counts are printed along the way.
    """
    if epc_path is None:
        return _ensure_direct_epc_columns(listings)

    keyed_listings = _listing_match_frame(listings)
    outcodes = keyed_listings["_listing_outcode"].drop_nulls().unique().to_list()
    if not outcodes:
        return _ensure_direct_epc_columns(listings)

    # The scratch directory only needs to live while candidates are loaded
    # and matched; both results are materialised frames.
    with tempfile.TemporaryDirectory(
        prefix="direct_listing_epc_", dir=local_tmp_dir()
    ) as scratch:
        epc_candidates = _load_direct_epc_candidates(
            epc_path, arcgis_path, outcodes, Path(scratch)
        )
        print(f"Direct listing EPC candidates: {epc_candidates.height}")
        direct_matches = _match_direct_epc(keyed_listings, epc_candidates)

    print(f"Direct listing EPC matches: {direct_matches.height}")
    if direct_matches.is_empty():
        enriched = listings
    else:
        enriched = listings.join(direct_matches, on="_listing_idx", how="left")
    return _ensure_direct_epc_columns(enriched)


def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Fill nulls in each raw wide column from its direct-EPC counterpart.

    Pairs come from `_DIRECT_EPC_RAW_COLUMN_MAP` (raw column -> direct
    column); an existing raw value always takes precedence.
    """
    backfills = []
    for raw_name, direct_name in _DIRECT_EPC_RAW_COLUMN_MAP.items():
        filled = pl.coalesce(pl.col(raw_name), pl.col(direct_name))
        backfills.append(filled.alias(raw_name))
    return wide.with_columns(backfills)


def _build_unmatched_listing_seed_rows(
    unmatched_listing_idxs: pl.DataFrame,
    listings: pl.DataFrame,
    template_schema: pl.Schema,
) -> pl.DataFrame:
    """Build wide-shaped rows for listings that matched no existing property.

    The output has exactly the columns and dtypes of `template_schema`. A
    column is filled from the listing when a source is known (postcode,
    address, property type, tenure, floor area, rooms, asking price as
    `latest_price`, direct-EPC values, and the `_actual_*` overlay columns);
    every other column is a typed null. Listing values take precedence over
    direct-EPC values for floor area and room count.
    """
    if unmatched_listing_idxs.is_empty():
        return pl.DataFrame(schema=template_schema)

    seeds = unmatched_listing_idxs.join(
        _ensure_direct_epc_columns(listings), on="_listing_idx", how="inner"
    )

    sources: dict[str, pl.Expr] = {
        "postcode": pl.col("postcode"),
        "pp_address": pl.col("pp_address"),
        "pp_property_type": pl.col("_actual_property_type"),
        "duration": pl.col("_actual_leasehold_freehold"),
        "total_floor_area": pl.coalesce(
            pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area")
        ),
        "number_habitable_rooms": pl.coalesce(
            pl.col("_actual_number_habitable_rooms"),
            pl.col("_direct_number_habitable_rooms"),
        ),
        "latest_price": pl.col("_actual_asking_price"),
    }
    # Direct-EPC values only fill wide columns that have no source yet.
    for wide_name, direct_name in _DIRECT_EPC_RAW_COLUMN_MAP.items():
        sources.setdefault(wide_name, pl.col(direct_name))
    # Overlay columns pass straight through under their own names.
    for _src, overlay_name, _dt in _LISTING_OVERLAY_SOURCES:
        sources[overlay_name] = pl.col(overlay_name)

    return seeds.select(
        [
            sources[name].cast(dtype, strict=False).alias(name)
            if name in sources
            else pl.lit(None, dtype=dtype).alias(name)
            for name, dtype in template_schema.items()
        ]
    )


def _integrate_listings(
    wide: pl.LazyFrame,
    listings_path: Path,
    arcgis_path: Path,
    epc_path: Path | None = None,
) -> pl.LazyFrame:
    """Merge actual listings into the wide property frame.

    Listings that fuzzy-match an existing wide row have their `_actual_*`
    overlay columns joined onto that row (keyed by postcode and register
    address), and their direct-EPC values back-fill null raw EPC columns.
    Listings without a match are appended as seed rows shaped like the wide
    frame, so later postcode-keyed joins can enrich them like any other row.
    The `_direct_*` helper columns are dropped from the result.
    """
    listings = _load_listings_for_merge(listings_path, arcgis_path)
    print(f"Listings loaded: {listings.height}")
    listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path)

    direct_columns = [column for column, _dtype in _DIRECT_EPC_COLUMNS]
    attachment_columns = [
        dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES
    ] + direct_columns

    property_candidates = _property_match_candidate_frame(wide)
    keyed_listings = _listing_match_frame(listings)
    joined = _match_listing_properties(keyed_listings, property_candidates)

    total = listings.height
    matched_count = joined.height
    if total > 0:
        print(
            "Listings matched to existing wide rows: "
            f"{matched_count}/{total} "
            f"({100 * matched_count / total:.1f}%)"
        )

    # Re-key each matched listing by the wide row it matched.
    matched_with_listing = joined.join(listings, on="_listing_idx", how="inner")
    matched_overlay = matched_with_listing.select(
        pl.col("_matched_postcode").alias("postcode"),
        pl.col("_matched_pp_address").alias("pp_address"),
        *attachment_columns,
    ).unique(["postcode", "pp_address"], keep="first")

    attached = wide.join(
        matched_overlay.lazy(), on=["postcode", "pp_address"], how="left"
    )
    wide_output = _coalesce_direct_epc_columns(attached).drop(
        direct_columns, strict=False
    )

    unmatched_listing_idxs = listings.select("_listing_idx").join(
        joined.select("_listing_idx"), on="_listing_idx", how="anti"
    )
    seed_rows = _build_unmatched_listing_seed_rows(
        unmatched_listing_idxs,
        listings,
        template_schema=wide_output.collect_schema(),
    )

    return pl.concat([wide_output, seed_rows.lazy()], how="vertical_relaxed")


def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
    """Reduce the renamed wide frame to listing rows with listing-facing columns.

    Keeps rows whose `_LISTING_FLAG_COLUMN` is non-null, exposes the
    `_actual_*` overlay values under their display names, lets listing values
    override coordinates, floor area, room count, property type and tenure,
    derives a missing asking price per sqm, mirrors the asking price into the
    estimate columns, and finally drops the `_actual_*` overlay columns.
    """
    rows = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())

    display_names = {
        "_actual_listing_url": "Listing URL",
        "_actual_listing_date": "Listing date",
        "_actual_listing_status": "Listing status",
        "_actual_listing_features": "Listing features",
        "_actual_asking_price": "Asking price",
        "_actual_asking_price_per_sqm": "Asking price per sqm",
        "_actual_bedrooms": "Bedrooms",
        "_actual_bathrooms": "Bathrooms",
        "_actual_price_qualifier": "Price qualifier",
        "_actual_property_sub_type": "Property sub-type",
    }
    renamed = [pl.col(src).alias(dst) for src, dst in display_names.items()]

    # Listing coordinates win over the postcode centroid.
    lat = pl.coalesce(pl.col("_actual_lat").cast(pl.Float64), pl.col("lat"))
    lon = pl.coalesce(pl.col("_actual_lon").cast(pl.Float64), pl.col("lon"))
    # Listing floor area / rooms override any EPC/PP value when present.
    floor_area = pl.coalesce(
        pl.col("_actual_total_floor_area").cast(pl.Float64),
        pl.col("Total floor area (sqm)"),
    )
    rooms = pl.coalesce(
        pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
        pl.col("Number of bedrooms & living rooms"),
    )
    # Listing type / tenure only override when they are recognised values.
    property_type = (
        pl.when(pl.col("_actual_property_type").is_in(_PROPERTY_TYPE_VALUES))
        .then(pl.col("_actual_property_type"))
        .otherwise(pl.col("Property type"))
    )
    tenure = (
        pl.when(pl.col("_actual_leasehold_freehold").is_in(_TENURE_VALUES))
        .then(pl.col("_actual_leasehold_freehold"))
        .otherwise(pl.col("Leasehold/Freehold"))
    )
    rows = rows.with_columns(
        *renamed,
        lat.alias("lat"),
        lon.alias("lon"),
        floor_area.alias("Total floor area (sqm)"),
        rooms.alias("Number of bedrooms & living rooms"),
        property_type.alias("Property type"),
        tenure.alias("Leasehold/Freehold"),
    )

    # Derive price per sqm only when the listing did not supply one and the
    # floor area is above the plausibility floor.
    asking = pl.col("Asking price")
    area = pl.col("Total floor area (sqm)")
    can_derive = (
        asking.is_not_null() & area.is_not_null() & (area > MIN_FLOOR_AREA_M2)
    )
    derived_per_sqm = (
        (asking.cast(pl.Float64) / area).round(0).cast(pl.Int32, strict=False)
    )
    rows = rows.with_columns(
        pl.coalesce(
            pl.col("Asking price per sqm"),
            pl.when(can_derive).then(derived_per_sqm).otherwise(None),
        ).alias("Asking price per sqm")
    )

    has_transaction = pl.col("Date of last transaction").is_not_null()
    rows = rows.with_columns(
        pl.col("Asking price").alias("Estimated current price"),
        pl.col("Asking price per sqm").alias("Est. price per sqm"),
        pl.coalesce(pl.col("Last known price"), pl.col("Asking price")).alias(
            "Last known price"
        ),
        pl.when(has_transaction)
        .then(pl.lit("matched"))
        .otherwise(pl.lit("unmatched"))
        .alias("Historical property match status"),
    )

    overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
    return rows.drop(overlay_columns, strict=False)


@dataclass
class _BuildResult:
    """Outputs of `_build`.

    Per `_build`'s documented modes, either the (`postcode`, `properties`)
    pair is populated (normal mode) or `listings` alone is (listings mode);
    the unused slot(s) stay None.
    """

    # Postcode-level output frame (normal mode).
    postcode: pl.DataFrame | None = None
    # Property-level output frame (normal mode).
    properties: pl.DataFrame | None = None
    # Enriched-listings output frame (listings mode).
    listings: pl.DataFrame | None = None


def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
    iod_path: Path,
    poi_proximity_path: Path,
    ethnicity_path: Path,
    crime_path: Path,
    noise_path: Path,
    school_proximity_path: Path,
    broadband_path: Path,
    conservation_areas_path: Path,
    rental_prices_path: Path,
    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
    tree_density_postcodes_path: Path | None = None,
    listed_buildings_path: Path | None = None,
    actual_listings_path: Path | None = None,
    actual_listings_epc_path: Path | None = None,
    mode: Literal["normal", "listings"] = "normal",
) -> _BuildResult:
    """Build postcode/properties dataframes (or enriched listings) from epc_pp + auxiliary data.

    The function assembles one lazy "wide" frame keyed on property rows,
    left-joins every auxiliary dataset onto it, renames columns to their
    display names, collects once with the streaming engine, and finally
    either splits the result into postcode-level and property-level frames
    or hands it to the listings finaliser.

    Modes:
      * `normal` — produces (postcode_df, properties_df). Ignores
        `actual_listings_path` (and `actual_listings_epc_path`) if supplied.
      * `listings` — requires `actual_listings_path`; produces a single
        enriched-listings DataFrame and skips the postcode/properties outputs.
        Listings flow through the same enrichment joins as historical rows,
        so postcode-scoped features (tree density, crime, deprivation, …) end
        up populated on every listing with a valid postcode.

    Args:
        epc_pp_path: Parquet scanned as the base property frame.
        arcgis_path: Postcode directory parquet; used for the terminated →
            successor postcode mapping, country codes, coordinates and
            geography codes.
        iod_path, ethnicity_path, crime_path, noise_path,
        school_proximity_path, broadband_path, rental_prices_path,
        lsoa_population_path, median_age_path, election_results_path,
        poi_proximity_path: Parquet side tables left-joined onto the base
            frame.
        conservation_areas_path: Passed to `_conservation_area_by_postcode`.
        tree_density_postcodes_path: Optional; when given, the result of
            `_tree_density_by_postcode` is joined on postcode.
        listed_buildings_path: Optional; when given, flags from
            `_listed_building_flags` are joined, otherwise the listed-building
            column starts as null. Either way nulls end up as "No".
        actual_listings_path: Listings input, consumed only in `listings` mode.
        actual_listings_epc_path: Forwarded to `_integrate_listings` as
            `epc_path` in `listings` mode.
        mode: Which output shape to produce (see above).

    Returns:
        A `_BuildResult` with `postcode` and `properties` set (normal mode)
        or with only `listings` set (listings mode).

    Raises:
        ValueError: If `mode` is not one of the supported values, or if
            `mode` is "listings" and `actual_listings_path` is None.
    """
    if mode not in ("normal", "listings"):
        raise ValueError(f"unknown mode: {mode!r}")
    if mode == "listings" and actual_listings_path is None:
        raise ValueError("listings mode requires actual_listings_path")
    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)

    # Keep rows with unknown floor area; drop only rows whose recorded area is
    # at or below the minimum threshold.
    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
    )

    # Remap terminated postcodes to nearest active successor
    postcode_mapping = build_postcode_mapping(arcgis_path)
    wide = (
        wide.join(
            postcode_mapping.lazy(),
            left_on="postcode",
            right_on="old_postcode",
            how="left",
        )
        .with_columns(
            pl.coalesce("new_postcode", "postcode").alias("postcode"),
        )
        .drop("new_postcode")
    )

    # Attach the country code of every postcode (not just English ones) so
    # later stages can tell where a row's postcode belongs.
    arcgis_raw = pl.scan_parquet(arcgis_path)
    postcode_country = arcgis_raw.select(
        pl.col("pcds").alias("postcode"),
        pl.col("ctry25cd"),
    ).unique(["postcode"])
    wide = wide.join(postcode_country, on="postcode", how="left")

    if listed_buildings_path is not None:
        # Active English postcodes with projected coordinates, materialised
        # eagerly because the listed-building matcher takes a DataFrame.
        active_postcodes_for_listed = (
            arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
            .filter(pl.col("doterm").is_null())
            .select(
                pl.col("pcds").alias("postcode"),
                "east1m",
                "north1m",
            )
            .collect(engine="streaming")
        )
        listed_flags = _listed_building_flags(
            wide.select("postcode", "pp_address", "epc_address"),
            active_postcodes_for_listed,
            listed_buildings_path,
        )
        wide = wide.join(listed_flags.lazy(), on=["postcode", "pp_address"], how="left")
    else:
        # No listed-building source: create the column as all-null so the
        # fill_null below still has something to operate on.
        wide = wide.with_columns(
            pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE)
        )

    # Listings are only integrated in listings mode; normal mode ignores any
    # listings path so its outputs contain historical properties only, as the
    # docstring promises.
    if mode == "listings" and actual_listings_path is not None:
        wide = _integrate_listings(
            wide,
            actual_listings_path,
            arcgis_path,
            epc_path=actual_listings_epc_path,
        )

    wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))

    arcgis = (
        arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")  # England only
        .filter(pl.col("doterm").is_null())  # Active postcodes only
        # NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
        # Alias them back to the short canonical names used across the
        # pipeline so downstream joins don't need to know about NSPL's
        # versioning scheme.
        .select(
            pl.col("pcds").alias("postcode"),
            "lat",
            pl.col("long").alias("lon"),
            pl.col("lsoa21cd").alias("lsoa21"),
            pl.col("oa21cd").alias("oa21"),
            pl.col("pcon24cd").alias("pcon"),
        )
    )
    wide = wide.join(arcgis, on="postcode", how="left")

    # Deprivation: add the percentile expressions, then join by LSOA.
    iod = pl.scan_parquet(iod_path).with_columns(
        *(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
    )
    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

    # Ethnicity is keyed on the local-authority code that the IoD join brought in.
    ethnicity = pl.scan_parquet(ethnicity_path)
    wide = wide.join(
        ethnicity,
        left_on="Local Authority District code (2024)",
        right_on="Geography_code",
        how="left",
    )

    # Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
    wide = wide.with_columns(
        (pl.col("number_habitable_rooms") - 1)
        .clip(0, 4)
        .cast(pl.UInt8)
        .alias("_bedrooms"),
    )
    rental = pl.scan_parquet(rental_prices_path).select(
        "area_code", "bedrooms", "mean_monthly_rent"
    )
    wide = wide.join(
        rental,
        left_on=["Local Authority District code (2024)", "_bedrooms"],
        right_on=["area_code", "bedrooms"],
        how="left",
    )

    crime = pl.scan_parquet(crime_path)
    wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")

    # Roll the individual crime categories up into two headline totals.
    wide = wide.with_columns(
        pl.sum_horizontal(
            "Violence and sexual offences (avg/yr)",
            "Robbery (avg/yr)",
            "Burglary (avg/yr)",
            "Possession of weapons (avg/yr)",
        ).alias("serious_crime_avg_yr"),
        pl.sum_horizontal(
            "Anti-social behaviour (avg/yr)",
            "Criminal damage and arson (avg/yr)",
            "Shoplifting (avg/yr)",
            "Bicycle theft (avg/yr)",
            "Theft from the person (avg/yr)",
            "Other theft (avg/yr)",
            "Vehicle crime (avg/yr)",
            "Public order (avg/yr)",
            "Drugs (avg/yr)",
            "Other crime (avg/yr)",
        ).alias("minor_crime_avg_yr"),
    )

    # Per-1k-resident rates; left null when population is missing or not
    # positive (the `when` has no `otherwise`).
    lsoa_pop = pl.scan_parquet(lsoa_population_path)
    wide = wide.join(lsoa_pop, on="lsoa21", how="left")
    wide = wide.with_columns(
        pl.when(pl.col("population") > 0)
        .then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
        .alias("serious_crime_per_1k"),
        pl.when(pl.col("population") > 0)
        .then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
        .alias("minor_crime_per_1k"),
    ).drop("population")

    median_age = pl.scan_parquet(median_age_path)
    wide = wide.join(median_age, on="lsoa21", how="left")

    election = pl.scan_parquet(election_results_path)
    wide = wide.join(election, on="pcon", how="left")

    poi_counts = pl.scan_parquet(poi_proximity_path)
    wide = wide.join(poi_counts, on="postcode", how="left")

    # Noise: a single figure per postcode, the loudest of the three sources.
    noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
    noise = (
        pl.scan_parquet(noise_path)
        .with_columns(
            # NaN → null so max_horizontal ignores missing instead of propagating NaN
            *[pl.col(c).fill_nan(None) for c in noise_cols],
        )
        .with_columns(
            pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
        )
        .select("postcode", "noise_lden_db")
    )
    wide = wide.join(noise, on="postcode", how="left")

    school_proximity = pl.scan_parquet(school_proximity_path)
    wide = wide.join(school_proximity, on="postcode", how="left")

    # Postcodes absent from the conservation-area result default to "No".
    conservation_areas = _conservation_area_by_postcode(
        arcgis.select("postcode", "lat", "lon"), conservation_areas_path
    )
    wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
    )

    if tree_density_postcodes_path is not None:
        tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
        wide = wide.join(tree_density, on="postcode", how="left")

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages.  Tiers: Gigabit ≥1000, UFBB ≥300,
    # UFBB(100) ≥100, SFBB ≥30 Mbps.  Stored as string enum.
    broadband = (
        pl.scan_parquet(broadband_path)
        .select(
            pl.col("postcode_space").alias("bb_postcode"),
            pl.when(pl.col("Gigabit availability (% premises)") > 0)
            .then(1000)
            .when(pl.col("UFBB availability (% premises)") > 0)
            .then(300)
            .when(pl.col("UFBB (100Mbit/s) availability (% premises)") > 0)
            .then(100)
            .when(pl.col("SFBB availability (% premises)") > 0)
            .then(30)
            .otherwise(10)
            .cast(pl.UInt16)
            .alias("max_download_speed"),
        )
        # Take the numeric max per postcode before converting to text.
        .group_by("bb_postcode")
        .agg(pl.col("max_download_speed").max())
        .with_columns(pl.col("max_download_speed").cast(pl.Utf8))
    )
    wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")

    # Derive property_type: prefer EPC data, fall back to price-paid.
    # For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
    bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = pl.col("epc_property_type").is_not_null()
    is_house = pl.col("epc_property_type") == "House"
    wide = wide.with_columns(
        pl.when(has_epc & is_house & ~bad_built_form)
        .then(pl.col("built_form"))
        .when(has_epc & is_house)
        .then(pl.col("pp_property_type"))
        .when(has_epc)
        .then(pl.col("epc_property_type"))
        .otherwise(pl.col("pp_property_type"))
        # Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
        # collapse terrace sub-types, and fold rare types into "Other"
        .replace(
            {
                "Flat": "Flats/Maisonettes",
                "Maisonette": "Flats/Maisonettes",
                "End-Terrace": "Terraced",
                "Mid-Terrace": "Terraced",
                "Enclosed End-Terrace": "Terraced",
                "Enclosed Mid-Terrace": "Terraced",
                "Bungalow": "Other",
                "Park home": "Other",
            }
        )
        .alias("property_type")
    )

    wide = (
        wide.with_columns(
            # Replace placeholder codes with null.
            pl.when(pl.col("duration") == "U")
            .then(None)
            .otherwise(pl.col("duration"))
            .alias("duration"),
            pl.when(pl.col("current_energy_rating") == "INVALID!")
            .then(None)
            .otherwise(pl.col("current_energy_rating"))
            .alias("current_energy_rating"),
        )
        .with_columns(
            (pl.col("latest_price") / pl.col("total_floor_area"))
            .round(0)
            .cast(pl.Int32)
            .alias("Price per sqm"),
        )
        # Drop join keys, helper columns and source columns that are not
        # part of the published schema.
        .drop(
            "inspection_date",
            "_bedrooms",
            "LSOA name (2021)",
            "Local Authority District code (2024)",
            "Local Authority District name (2024)",
            "Wider Barriers Sub-domain Score",
            "Geographical Barriers Sub-domain Score",
            "Adult Skills Sub-domain Score",
            "Children and Young People Sub-domain Score",
            "Crime Score",
            "Living Environment Score",
            "Index of Multiple Deprivation (IMD) Score",
            "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
            "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
            "Barriers to Housing and Services Score",
            "oa21",
            "pcon",
            "epc_property_type",
            "pp_property_type",
            "built_form",
        )
        # Internal snake_case names → user-facing display names.
        .rename(
            {
                "date_of_transfer": "Date of last transaction",
                "construction_age_band": "Construction year",
                "is_construction_date_approximate": "Is construction date approximate",
                "Income Score (rate)": "Income Score",
                "Employment Score (rate)": "Employment Score",
                "Indoors Sub-domain Score": "Housing Conditions Score",
                "Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
                "pp_address": "Address per Property Register",
                "epc_address": "Address per EPC",
                "postcode": "Postcode",
                "duration": "Leasehold/Freehold",
                "current_energy_rating": "Current energy rating",
                "potential_energy_rating": "Potential energy rating",
                "total_floor_area": "Total floor area (sqm)",
                "property_type": "Property type",
                "restaurants_2km": "Number of restaurants within 2km",
                "groceries_2km": "Number of grocery shops and supermarkets within 2km",
                "latest_price": "Last known price",
                "number_habitable_rooms": "Number of bedrooms & living rooms",
                "noise_lden_db": "Noise (dB)",
                "good_primary_5km": "Good+ primary schools within 5km",
                "good_secondary_5km": "Good+ secondary schools within 5km",
                "good_primary_2km": "Good+ primary schools within 2km",
                "good_secondary_2km": "Good+ secondary schools within 2km",
                "outstanding_primary_5km": "Outstanding primary schools within 5km",
                "outstanding_secondary_5km": "Outstanding secondary schools within 5km",
                "outstanding_primary_2km": "Outstanding primary schools within 2km",
                "outstanding_secondary_2km": "Outstanding secondary schools within 2km",
                "max_download_speed": "Max available download speed (Mbps)",
                "serious_crime_avg_yr": "Serious crime (avg/yr)",
                "minor_crime_avg_yr": "Minor crime (avg/yr)",
                "serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
                "minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
                "mean_monthly_rent": "Estimated monthly rent",
                "floor_height": "Interior height (m)",
                "was_council_house": "Former council house",
                "median_age": "Median age",
                "turnout_pct": "Voter turnout (%)",
            }
        )
    )

    print("Collecting with streaming engine...")
    df = wide.collect(engine="streaming")

    if mode == "listings":
        enriched_listings = _finalize_listings(df)
        _validate_property_postcodes(enriched_listings)
        print(f"Enriched listings rows: {enriched_listings.height}")
        return _BuildResult(listings=enriched_listings)

    _validate_property_postcodes(df)

    # Split into postcode-level and property-level dataframes
    area_cols = [
        c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
    ]
    # Area columns are taken from the first row seen for each postcode.
    postcode_df = df.select(area_cols).group_by("Postcode").first()
    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")

    # Everything that is not an area column, plus "Postcode" as the link back
    # to the postcode-level frame.
    property_cols = [
        c
        for c in df.columns
        if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
        or c == "Postcode"
    ]
    properties_df = df.select(property_cols)
    print(f"Property rows: {properties_df.height}")

    return _BuildResult(postcode=postcode_df, properties=properties_df)


def main():
    """Command-line entry point: parse arguments, run `_build`, write parquet.

    Two output shapes are supported, selected by whether `--actual-listings`
    is given:

      * without it (normal mode) `--output-postcodes` and
        `--output-properties` are required and both files are written;
      * with it (listings mode) `--output-listings` is required and only that
        file is written.

    Missing required arguments are reported through `parser.error`, which
    exits the process with argparse's usage-error status.
    """
    parser = argparse.ArgumentParser(
        description="Build wide property dataframe with all joins"
    )
    parser.add_argument(
        "--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
    )
    parser.add_argument(
        "--iod",
        type=Path,
        required=True,
        help="Index of Deprivation parquet file",
    )
    # `_build` scans this file unconditionally, so it must be supplied;
    # requiring it here turns a late failure on a missing path into a clear
    # usage error.
    parser.add_argument(
        "--poi-proximity",
        type=Path,
        required=True,
        help="POI proximity counts parquet file",
    )
    parser.add_argument(
        "--ethnicity",
        type=Path,
        required=True,
        help="Ethnicity by local authority parquet file",
    )
    parser.add_argument(
        "--crime",
        type=Path,
        required=True,
        help="Crime by LSOA parquet file",
    )
    parser.add_argument(
        "--noise", type=Path, required=True, help="Road noise by postcode parquet file"
    )
    parser.add_argument(
        "--school-proximity",
        type=Path,
        required=True,
        help="School proximity counts parquet file",
    )
    parser.add_argument(
        "--broadband",
        type=Path,
        required=True,
        help="Broadband performance by output area parquet file",
    )
    parser.add_argument(
        "--conservation-areas",
        type=Path,
        required=True,
        help="Planning Data conservation areas GeoJSON",
    )
    parser.add_argument(
        "--listed-buildings",
        type=Path,
        required=False,
        help="Historic England NHLE listed-building points GeoPackage",
    )
    parser.add_argument(
        "--rental-prices",
        type=Path,
        required=True,
        help="ONS rental prices by LA and bedroom count parquet file",
    )
    parser.add_argument(
        "--lsoa-population",
        type=Path,
        required=True,
        help="Census 2021 population by LSOA parquet file",
    )
    parser.add_argument(
        "--median-age",
        type=Path,
        required=True,
        help="Census 2021 median age by LSOA parquet file",
    )
    parser.add_argument(
        "--election-results",
        type=Path,
        required=True,
        help="2024 General Election results by constituency parquet file",
    )
    parser.add_argument(
        "--tree-density-postcodes",
        type=Path,
        required=False,
        help="Postcode-level tree density parquet from pipeline.transform.tree_density",
    )
    parser.add_argument(
        "--output-postcodes",
        type=Path,
        required=False,
        help="Output postcode parquet (normal mode only)",
    )
    parser.add_argument(
        "--output-properties",
        type=Path,
        required=False,
        help="Output properties parquet (normal mode only)",
    )
    parser.add_argument(
        "--actual-listings",
        type=Path,
        required=False,
        help=(
            "Optional scraped-listings parquet. When provided, listings flow "
            "through the same merge pipeline as historical properties — set "
            "--output-listings to write the enriched-listings file instead "
            "of the postcode/properties files."
        ),
    )
    parser.add_argument(
        "--epc",
        type=Path,
        required=False,
        help=(
            "Raw EPC certificates CSV or zip. Used only with --actual-listings "
            "to match live listings directly to EPC records."
        ),
    )
    parser.add_argument(
        "--output-listings",
        type=Path,
        required=False,
        help=(
            "Output enriched-listings parquet path. Required (and only valid) "
            "when --actual-listings is set; --output-postcodes and "
            "--output-properties are ignored in this mode."
        ),
    )
    args = parser.parse_args()

    # The presence of --actual-listings is what selects listings mode.
    listings_mode = args.actual_listings is not None
    if listings_mode and args.output_listings is None:
        parser.error("--output-listings is required when --actual-listings is set")
    if not listings_mode and (
        args.output_postcodes is None or args.output_properties is None
    ):
        parser.error(
            "--output-postcodes and --output-properties are required in normal mode"
        )

    result = _build(
        epc_pp_path=args.epc_pp,
        arcgis_path=args.arcgis,
        iod_path=args.iod,
        poi_proximity_path=args.poi_proximity,
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,
        school_proximity_path=args.school_proximity,
        broadband_path=args.broadband,
        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
        tree_density_postcodes_path=args.tree_density_postcodes,
        listed_buildings_path=args.listed_buildings,
        actual_listings_path=args.actual_listings,
        # --epc is only meaningful alongside --actual-listings.
        actual_listings_epc_path=args.epc if listings_mode else None,
        mode="listings" if listings_mode else "normal",
    )

    if listings_mode:
        listings_df = result.listings
        assert listings_df is not None  # guaranteed by mode contract
        args.output_listings.parent.mkdir(parents=True, exist_ok=True)
        listings_df.write_parquet(args.output_listings)
        size_mb = args.output_listings.stat().st_size / (1024 * 1024)
        print(
            f"\nEnriched listings: {listings_df.height} rows, "
            f"{len(listings_df.columns)} columns"
        )
        print(f"Wrote {args.output_listings} ({size_mb:.1f} MB)")
        return

    postcode_df = result.postcode
    properties_df = result.properties
    assert postcode_df is not None and properties_df is not None

    print(f"\nPostcode columns: {postcode_df.columns}")
    print(f"Postcode rows: {postcode_df.height}")
    # Create the destination directory, matching the listings branch above.
    args.output_postcodes.parent.mkdir(parents=True, exist_ok=True)
    postcode_df.write_parquet(args.output_postcodes)
    size_mb = args.output_postcodes.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output_postcodes} ({size_mb:.1f} MB)")

    print(f"\nProperty columns: {properties_df.columns}")
    print(f"Property rows: {properties_df.height}")
    args.output_properties.parent.mkdir(parents=True, exist_ok=True)
    properties_df.write_parquet(args.output_properties)
    size_mb = args.output_properties.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output_properties} ({size_mb:.1f} MB)")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()