scraping and data

2026-05-31 15:36:33 +01:00 · 2026-05-31 15:36:33 +01:00 · 8688b7475e
commit 8688b7475e
parent d98819b569
43 changed files with 4920 additions and 531 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -48,7 +48,7 @@ _AREA_COLUMNS = [
    "lon",
    # Runtime provenance for deciding whether missing coordinates are skippable.
    "ctry25cd",
-    # Keyed lookup for postcode-level side tables (e.g. crime time series).
+    # Join key for LSOA-level side tables (e.g. median age).
    "lsoa21",
    # Deprivation
    "Income Score",
@ -81,8 +81,6 @@ _AREA_COLUMNS = [
    "Other crime (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
-    "Serious crime per 1k residents (avg/yr)",
-    "Minor crime per 1k residents (avg/yr)",
    # Amenities
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
@ -742,16 +740,13 @@ _PROPERTY_TYPE_VALUES = [
    "Other",
 ]
 _EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
-_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
-_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
-_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82
-_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96
-_PROPERTY_MATCH_MIN_MARGIN = 4.0
-_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
-_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
-_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0
-_DIRECT_EPC_NEARBY_RADIUS_M = 500.0
-_DIRECT_EPC_NEAREST_POSTCODES = 40
+# Listings are matched to EPC certificates and Price-Paid properties first by
+# UPRN (exact) and otherwise by fuzzy street-address similarity within the same
+# postcode. A house number in the listing address is the strong disambiguator,
+# so a numbered listing may match on a lower street-similarity score than a
+# number-less one (which must match the street almost exactly to be trusted).
+_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS = 82
+_LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 90
 _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
    ("_direct_epc_address", pl.Utf8),
    ("_direct_current_energy_rating", pl.Utf8),
@ -764,7 +759,7 @@ _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
    ("_direct_was_council_house", pl.Utf8),
    ("_direct_epc_match_status", pl.Utf8),
    ("_direct_epc_match_score", pl.Float32),
-    ("_direct_epc_match_margin", pl.Float32),
+    ("_direct_epc_match_method", pl.Utf8),
 )
 _DIRECT_EPC_RAW_COLUMN_MAP = {
    "epc_address": "_direct_epc_address",
@ -840,46 +835,6 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    )


-def _ratio_bonus(
-    left: float | int | None, right: float | int | None, pct: float, cap: float
-) -> float:
-    if left is None or right is None:
-        return 0.0
-    try:
-        left_f = float(left)
-        right_f = float(right)
-    except (TypeError, ValueError):
-        return 0.0
-    if left_f <= 0 or right_f <= 0:
-        return 0.0
-    rel = abs(left_f - right_f) / max(left_f, right_f)
-    if rel > pct:
-        return 0.0
-    return cap * (1.0 - rel / pct)
-
-
-def _rooms_bonus(left: int | None, right: int | None) -> float:
-    if left is None or right is None:
-        return 0.0
-    try:
-        diff = abs(int(left) - int(right))
-    except (TypeError, ValueError):
-        return 0.0
-    if diff == 0:
-        return 4.0
-    if diff == 1:
-        return 2.0
-    return 0.0
-
-
-def _enum_bonus(
-    left: str | None, right: str | None, *, exact: float, mismatch: float
-) -> float:
-    if not left or not right:
-        return 0.0
-    return exact if left == right else mismatch
-
-
 def _address_score(query: str, candidate: str | None) -> int:
    if not candidate:
        return 0
@ -893,6 +848,85 @@ def _has_number(address: str | None) -> bool:
    return bool(address and _NUMBER_RE.search(address))


+def _normalize_uprn(value: object) -> str | None:
+    """Canonical UPRN string (digits only) or None.
+
+    Normalises str/int/float UPRNs so both sides of a match compare equal. A
+    float, or float-formatted text ("123.0", "1.5e11"), is kept only when it is
+    an exact integer, so a "."/exponent is never mangled into bogus digits.
+    """
+    if isinstance(value, str) and re.search(r"[.eE]", value):
+        try:
+            value = float(value)
+        except ValueError:
+            pass  # not float-formatted after all; fall through to digit strip
+    if isinstance(value, float):
+        value = int(value) if value.is_integer() else None
+    if value is None:
+        return None
+    return re.sub(r"\D", "", str(value)) or None
+
+
+def _best_listing_match(
+    listing_uprn: str | None,
+    query: str | None,
+    uprn_index: dict[str, dict],
+    bucket_candidates: list[dict],
+    addressed_fields: list[str],
+) -> tuple[dict, float, str, str | None] | None:
+    """Pick the best candidate for a listing.
+
+    Matching is, in order: (1) an exact UPRN equality against the global
+    ``uprn_index`` (postcode-independent, so it is robust even when the
+    listing's postcode is slightly off); (2) failing that, the highest
+    fuzzy street-address similarity within the listing's own postcode bucket.
+    No property-attribute heuristics are used — a house number in the listing
+    address gates the fuzzy match (`_numbers_compatible`) and lowers the score
+    threshold; a number-less address must match the street almost exactly.
+
+    ``addressed_fields`` names the candidate columns to fuzzy-match against (a
+    candidate may carry both a register and an EPC address). Returns
+    ``(candidate, score, method, matched_field)`` or None. ``method`` is
+    "uprn" or "address"; ``matched_field`` is the winning address column (or
+    None for a UPRN match). UPRN hits score 100.0; address ties keep the first.
+    """
+    if listing_uprn:
+        hit = uprn_index.get(listing_uprn)
+        if hit is not None:
+            return hit, 100.0, "uprn", None  # exact key: address not consulted
+
+    if not query:
+        return None
+
+    listing_has_numbers = _has_number(query)
+    best: dict | None = None
+    best_score = 0  # a candidate scoring 0 can therefore never win
+    best_field: str | None = None
+    for candidate in bucket_candidates:
+        for field in addressed_fields:
+            address = candidate.get(field)
+            if not address:
+                continue
+            if listing_has_numbers and not _numbers_compatible(query, address):
+                continue
+            score = _address_score(query, address)
+            if score > best_score:  # strict ">": first seen keeps a tie
+                best_score = score
+                best = candidate
+                best_field = field
+
+    if best is None:
+        return None
+    threshold = (
+        _LISTING_MATCH_MIN_SCORE_WITH_NUMBERS
+        if listing_has_numbers
+        else _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS
+    )
+    if best_score < threshold:
+        return None
+    return best, float(best_score), "address", best_field
+
+
 def _load_listings_for_merge(
    listings_path: Path, arcgis_path: Path
 ) -> pl.DataFrame:
@ -908,6 +942,20 @@ def _load_listings_for_merge(
    raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
    postcode_mapping = build_postcode_mapping(arcgis_path).lazy()

+    # UPRN is only present on scraped listings that carry it (Zoopla detail
+    # pages); tolerate its absence so older parquets and test fixtures still
+    # load. Digits-only so it compares equal to the EPC register's UPRN.
+    if "UPRN" in raw.collect_schema().names():
+        uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
+        listing_uprn_expr = (
+            pl.when(uprn_digits.str.len_chars() > 0)
+            .then(uprn_digits)
+            .otherwise(None)
+            .alias("_listing_uprn")
+        )
+    else:
+        listing_uprn_expr = pl.lit(None, dtype=pl.Utf8).alias("_listing_uprn")
+
    # Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
    # treats NaN as distinct from null and the downstream `latest_price /
    # total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
@ -936,12 +984,14 @@ def _load_listings_for_merge(
                "postcode"
            ),
            pl.col("Address per Property Register").alias("pp_address"),
+            listing_uprn_expr,
            *overlay,
        )
        .select(
            "_listing_idx",
            "postcode",
            "pp_address",
+            "_listing_uprn",
            *[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES],
        )
        .collect(engine="streaming")
@ -972,7 +1022,6 @@ def _empty_direct_epc_matches() -> pl.DataFrame:

 def _load_direct_epc_candidates(
    epc_path: Path,
-    arcgis_path: Path,
    listing_outcodes: list[str],
    temp_dir: Path,
 ) -> pl.DataFrame:
@ -982,8 +1031,7 @@ def _load_direct_epc_candidates(
        "_direct_epc_match_postcode": pl.Utf8,
        "_direct_epc_outcode": pl.Utf8,
        "_direct_epc_canonical_property_type": pl.Utf8,
-        "_direct_epc_east": pl.Float64,
-        "_direct_epc_north": pl.Float64,
+        "_direct_epc_uprn": pl.Utf8,
        **{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
    }
    if not listing_outcodes:
@ -1016,12 +1064,6 @@ def _load_direct_epc_candidates(
        .with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
    )

-    arcgis = pl.scan_parquet(arcgis_path).select(
-        normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"),
-        pl.col("east1m").alias("_direct_epc_east"),
-        pl.col("north1m").alias("_direct_epc_north"),
-    )
-
    return (
        epc_base.sort("inspection_date", descending=True)
        .group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
@ -1031,7 +1073,6 @@ def _load_direct_epc_candidates(
            on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
            how="left",
        )
-        .join(arcgis, on="_direct_epc_match_postcode", how="left")
        .with_columns(
            _canonical_epc_property_type_expr().alias(
                "_direct_epc_canonical_property_type"
@ -1046,6 +1087,7 @@ def _load_direct_epc_candidates(
            .otherwise(None)
            .alias("_direct_potential_energy_rating"),
            pl.col("epc_address").alias("_direct_epc_address"),
+            pl.col("uprn").alias("_direct_epc_uprn"),
            pl.col("total_floor_area").alias("_direct_total_floor_area"),
            pl.col("number_habitable_rooms").alias(
                "_direct_number_habitable_rooms"
@ -1066,8 +1108,7 @@ def _load_direct_epc_candidates(
            "_direct_epc_match_postcode",
            "_direct_epc_outcode",
            "_direct_epc_canonical_property_type",
-            "_direct_epc_east",
-            "_direct_epc_north",
+            "_direct_epc_uprn",
            "_direct_epc_address",
            "_direct_current_energy_rating",
            "_direct_potential_energy_rating",
@ -1083,7 +1124,14 @@ def _load_direct_epc_candidates(


 def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
-    match = listings.with_columns(
+    """Add the normalised address/postcode/outcode keys used to match listings.
+
+    Listings are matched to EPC certificates and properties by UPRN and by
+    fuzzy street address within their (now accurate, detail-page-sourced)
+    postcode — never by coordinate proximity — so no projected easting/northing
+    is computed here. `_listing_uprn` flows through from the loaded listings.
+    """
+    return listings.with_columns(
        normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
        normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
    ).with_columns(
@ -1092,21 +1140,6 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
        .alias("_listing_outcode")
    )

-    if match.is_empty():
-        return match.with_columns(
-            pl.Series("_listing_east", [], dtype=pl.Float64),
-            pl.Series("_listing_north", [], dtype=pl.Float64),
-        )
-
-    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
-    east, north = transformer.transform(
-        match["_actual_lon"].to_numpy(), match["_actual_lat"].to_numpy()
-    )
-    return match.with_columns(
-        pl.Series("_listing_east", east, dtype=pl.Float64),
-        pl.Series("_listing_north", north, dtype=pl.Float64),
-    )
-

 def _optional_lazy_col(
    schema: pl.Schema, column: str, dtype: pl.DataType
@ -1122,8 +1155,7 @@ def _listing_property_match_schema() -> dict[str, pl.DataType]:
        "_matched_postcode": pl.Utf8,
        "_matched_pp_address": pl.Utf8,
        "_property_match_score": pl.Float32,
-        "_property_match_address_score": pl.Int32,
-        "_property_match_margin": pl.Float32,
+        "_property_match_method": pl.Utf8,
        "_property_match_field": pl.Utf8,
    }

@ -1139,11 +1171,8 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
            pl.col("postcode").cast(pl.Utf8).alias("postcode"),
            pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
            _optional_lazy_col(schema, "epc_address", pl.Utf8),
-            _optional_lazy_col(schema, "pp_property_type", pl.Utf8),
-            _optional_lazy_col(schema, "duration", pl.Utf8),
-            _optional_lazy_col(schema, "total_floor_area", pl.Float64),
-            _optional_lazy_col(schema, "number_habitable_rooms", pl.Int16),
-            _optional_lazy_col(schema, "latest_price", pl.Int64),
+            # UPRN keys the exact match; present once epc_pp is rebuilt with it.
+            _optional_lazy_col(schema, "uprn", pl.Utf8),
        )
        .with_row_index("_property_row")
        .with_columns(
@ -1167,110 +1196,52 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
    )


-def _property_candidates_by_postcode(
-    candidates: pl.DataFrame,
-) -> dict[str, list[dict]]:
+def _index_candidates(
+    candidates: pl.DataFrame, postcode_key: str, uprn_key: str
+) -> tuple[dict[str, list[dict]], dict[str, dict]]:
+    """Index candidate rows for matching, in a single pass over the frame.
+
+    Returns ``(postcode_buckets, uprn_index)``. The postcode buckets drive the
+    fuzzy street-address match; the UPRN index drives the exact match and is
+    postcode-independent, so it still resolves when a listing's postcode is
+    slightly off.
+    """
    buckets: dict[str, list[dict]] = {}
+    uprn_index: dict[str, dict] = {}
    for row in candidates.iter_rows(named=True):
-        postcode = row.get("_property_match_postcode")
+        postcode = row.get(postcode_key)
        if postcode:
            buckets.setdefault(postcode, []).append(row)
-    return buckets
+        uprn = _normalize_uprn(row.get(uprn_key))
+        if uprn and uprn not in uprn_index:
+            uprn_index[uprn] = row
+    return buckets, uprn_index


 def _best_listing_property_candidate(
-    listing: dict, candidates: list[dict]
+    listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
 ) -> dict | None:
-    query = listing.get("_listing_match_address")
-    if not query:
-        return None
-
-    listing_has_numbers = _has_number(query)
-    scored: list[tuple[float, int, dict, str]] = []
-    for candidate in candidates:
-        register_address = candidate.get("_property_match_address")
-        epc_address = candidate.get("_property_epc_match_address")
-        register_numbers_compatible = bool(
-            register_address and _numbers_compatible(query, register_address)
-        )
-        epc_numbers_compatible = bool(
-            epc_address and _numbers_compatible(query, epc_address)
-        )
-        if not (register_numbers_compatible or epc_numbers_compatible):
-            continue
-
-        register_score = _address_score(query, register_address)
-        epc_score = _address_score(query, epc_address)
-        base_score = max(register_score, epc_score)
-        if base_score == 0:
-            continue
-
-        score = float(base_score)
-        score += _enum_bonus(
-            listing.get("_actual_property_type"),
-            candidate.get("pp_property_type"),
-            exact=7.0,
-            mismatch=-8.0,
-        )
-        score += _enum_bonus(
-            listing.get("_actual_leasehold_freehold"),
-            candidate.get("duration"),
-            exact=3.0,
-            mismatch=-3.0,
-        )
-        score += _ratio_bonus(
-            listing.get("_actual_total_floor_area"),
-            candidate.get("total_floor_area"),
-            pct=0.15,
-            cap=8.0,
-        )
-        score += _rooms_bonus(
-            listing.get("_actual_number_habitable_rooms"),
-            candidate.get("number_habitable_rooms"),
-        )
-        score += _ratio_bonus(
-            listing.get("_actual_asking_price"),
-            candidate.get("latest_price"),
-            pct=0.25,
-            cap=3.0,
-        )
-        matched_field = (
-            "pp_address" if register_score >= epc_score else "epc_address"
-        )
-        scored.append((score, base_score, candidate, matched_field))
-
-    if not scored:
-        return None
-    scored.sort(key=lambda item: item[0], reverse=True)
-    top = scored[0]
-    runner_up = scored[1][0] if len(scored) > 1 else None
-    margin = top[0] - runner_up if runner_up is not None else top[0]
-    score_threshold = (
-        _PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
-        if listing_has_numbers
-        else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
+    result = _best_listing_match(
+        listing.get("_listing_uprn"),
+        listing.get("_listing_match_address"),
+        uprn_index,
+        candidates,
+        ["_property_match_address", "_property_epc_match_address"],
    )
-    address_threshold = (
-        _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS
-        if listing_has_numbers
-        else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS
-    )
-    if (
-        top[0] < score_threshold
-        or top[1] < address_threshold
-        or margin < _PROPERTY_MATCH_MIN_MARGIN
-    ):
+    if result is None:
        return None
-
-    candidate = top[2]
+    candidate, score, method, field = result
+    matched_field = {
+        "_property_match_address": "pp_address",
+        "_property_epc_match_address": "epc_address",
+    }.get(field, method)
    return {
        "_listing_idx": listing["_listing_idx"],
        "_matched_postcode": candidate.get("postcode"),
        "_matched_pp_address": candidate.get("pp_address"),
-        "_property_match_score": round(top[0], 1),
-        "_property_match_address_score": top[1],
-        "_property_match_margin": round(margin, 1),
-        "_property_match_field": top[3],
+        "_property_match_score": round(score, 1),
+        "_property_match_method": method,
+        "_property_match_field": matched_field,
    }


@ -1280,23 +1251,32 @@ def _match_listing_properties(
    if listing_matches.is_empty() or property_candidates.is_empty():
        return _empty_listing_property_matches()

-    buckets = _property_candidates_by_postcode(property_candidates)
+    buckets, uprn_index = _index_candidates(
+        property_candidates, "_property_match_postcode", "uprn"
+    )
    best_matches = []
    for listing in listing_matches.iter_rows(named=True):
        postcode = listing.get("_listing_match_postcode")
-        if not postcode:
-            continue
-        match = _best_listing_property_candidate(listing, buckets.get(postcode, []))
+        bucket = buckets.get(postcode, []) if postcode else []
+        match = _best_listing_property_candidate(listing, uprn_index, bucket)
        if match is not None:
            best_matches.append(match)

    if not best_matches:
        return _empty_listing_property_matches()

+    # When two listings claim the same property, keep the most authoritative
+    # match: an exact UPRN match always wins over a fuzzy address match (both can
+    # score 100, so method must break the tie before score and listing index).
    matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema())
    return (
        matches.sort(
-            ["_property_match_score", "_listing_idx"], descending=[True, False]
+            [
+                pl.col("_property_match_method") == "uprn",
+                "_property_match_score",
+                "_listing_idx",
+            ],
+            descending=[True, True, False],
        )
        .unique(
            ["_matched_postcode", "_matched_pp_address"],
@ -1307,133 +1287,19 @@ def _match_listing_properties(
    )


-def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
-    buckets: dict[str, list[dict]] = {}
-    for row in candidates.iter_rows(named=True):
-        postcode = row.get("_direct_epc_match_postcode")
-        if postcode:
-            buckets.setdefault(postcode, []).append(row)
-    return buckets
-
-
-def _epc_postcode_tree(
-    candidates: pl.DataFrame,
-) -> tuple[cKDTree | None, list[str]]:
-    postcode_points = (
-        candidates.select(
-            "_direct_epc_match_postcode",
-            "_direct_epc_east",
-            "_direct_epc_north",
-        )
-        .drop_nulls()
-        .filter(
-            pl.col("_direct_epc_east").is_finite()
-            & pl.col("_direct_epc_north").is_finite()
-        )
-        .unique("_direct_epc_match_postcode")
+def _best_direct_epc_candidate(
+    listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
+) -> dict | None:
+    result = _best_listing_match(
+        listing.get("_listing_uprn"),
+        listing.get("_listing_match_address"),
+        uprn_index,
+        candidates,
+        ["_direct_epc_match_address"],
    )
-    if postcode_points.is_empty():
-        return None, []
-    coords = np.column_stack(
-        [
-            postcode_points["_direct_epc_east"].to_numpy(),
-            postcode_points["_direct_epc_north"].to_numpy(),
-        ]
-    )
-    return cKDTree(coords), postcode_points["_direct_epc_match_postcode"].to_list()
-
-
-def _candidate_postcodes_for_listing(
-    listing: dict,
-    postcode_tree: cKDTree | None,
-    postcode_values: list[str],
-) -> list[str]:
-    postcodes: list[str] = []
-    exact = listing.get("_listing_match_postcode")
-    if exact:
-        postcodes.append(exact)
-
-    if postcode_tree is None:
-        return postcodes
-
-    east = listing.get("_listing_east")
-    north = listing.get("_listing_north")
-    try:
-        east_f = float(east)
-        north_f = float(north)
-    except (TypeError, ValueError):
-        return postcodes
-    if not np.isfinite(east_f) or not np.isfinite(north_f):
-        return postcodes
-
-    k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values))
-    distances, indices = postcode_tree.query(
-        [east_f, north_f],
-        k=k,
-        distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M,
-    )
-    distances = np.atleast_1d(distances)
-    indices = np.atleast_1d(indices)
-    seen = set(postcodes)
-    for distance, idx in zip(distances, indices, strict=False):
-        if not np.isfinite(distance) or idx >= len(postcode_values):
-            continue
-        postcode = postcode_values[int(idx)]
-        if postcode not in seen:
-            postcodes.append(postcode)
-            seen.add(postcode)
-    return postcodes
-
-
-def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
-    query = listing.get("_listing_match_address")
-    if not query:
+    if result is None:
        return None
-
-    listing_has_numbers = _has_number(query)
-    scored: list[tuple[float, int, dict]] = []
-    for candidate in candidates:
-        address = candidate.get("_direct_epc_match_address")
-        if listing_has_numbers and not _numbers_compatible(query, address or ""):
-            continue
-        base_score = _address_score(query, address)
-        if base_score == 0:
-            continue
-
-        score = float(base_score)
-        score += _enum_bonus(
-            listing.get("_actual_property_type"),
-            candidate.get("_direct_epc_canonical_property_type"),
-            exact=6.0,
-            mismatch=-6.0,
-        )
-        score += _ratio_bonus(
-            listing.get("_actual_total_floor_area"),
-            candidate.get("_direct_total_floor_area"),
-            pct=0.12,
-            cap=8.0,
-        )
-        score += _rooms_bonus(
-            listing.get("_actual_number_habitable_rooms"),
-            candidate.get("_direct_number_habitable_rooms"),
-        )
-        scored.append((score, base_score, candidate))
-
-    if not scored:
-        return None
-    scored.sort(key=lambda item: item[0], reverse=True)
-    top = scored[0]
-    runner_up = scored[1][0] if len(scored) > 1 else None
-    margin = top[0] - runner_up if runner_up is not None else top[0]
-    threshold = (
-        _DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS
-        if listing_has_numbers
-        else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
-    )
-    if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN:
-        return None
-
-    candidate = top[2]
+    candidate, score, method, _field = result
    return {
        "_listing_idx": listing["_listing_idx"],
        "_direct_epc_address": candidate.get("_direct_epc_address"),
@ -1452,8 +1318,8 @@ def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict |
        ),
        "_direct_was_council_house": candidate.get("_direct_was_council_house"),
        "_direct_epc_match_status": "matched",
-        "_direct_epc_match_score": round(top[0], 1),
-        "_direct_epc_match_margin": round(margin, 1),
+        "_direct_epc_match_score": round(score, 1),
+        "_direct_epc_match_method": method,
    }


@ -1463,25 +1329,14 @@ def _match_direct_epc(
    if listing_matches.is_empty() or epc_candidates.is_empty():
        return _empty_direct_epc_matches()

-    buckets = _epc_candidates_by_postcode(epc_candidates)
-    postcode_tree, postcode_values = _epc_postcode_tree(epc_candidates)
-
+    buckets, uprn_index = _index_candidates(
+        epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
+    )
    matches = []
    for listing in listing_matches.iter_rows(named=True):
-        candidate_postcodes = _candidate_postcodes_for_listing(
-            listing, postcode_tree, postcode_values
-        )
-        candidate_rows: list[dict] = []
-        seen_rows: set[int] = set()
-        for postcode in candidate_postcodes:
-            for candidate in buckets.get(postcode, []):
-                row = candidate.get("_direct_epc_row")
-                if row in seen_rows:
-                    continue
-                candidate_rows.append(candidate)
-                if row is not None:
-                    seen_rows.add(row)
-        match = _best_direct_epc_candidate(listing, candidate_rows)
+        postcode = listing.get("_listing_match_postcode")
+        bucket = buckets.get(postcode, []) if postcode else []
+        match = _best_direct_epc_candidate(listing, uprn_index, bucket)
        if match is not None:
            matches.append(match)

@ -1493,7 +1348,6 @@ def _match_direct_epc(
 def _enrich_listings_with_direct_epc(
    listings: pl.DataFrame,
    epc_path: Path | None,
-    arcgis_path: Path,
 ) -> pl.DataFrame:
    if epc_path is None:
        return _ensure_direct_epc_columns(listings)
@ -1513,7 +1367,7 @@ def _enrich_listings_with_direct_epc(
        prefix="direct_listing_epc_", dir=local_tmp_dir()
    ) as tmpdir:
        epc_candidates = _load_direct_epc_candidates(
-            epc_path, arcgis_path, listing_outcodes, Path(tmpdir)
+            epc_path, listing_outcodes, Path(tmpdir)
        )
        print(f"Direct listing EPC candidates: {epc_candidates.height}")
        direct_matches = _match_direct_epc(listing_matches, epc_candidates)
@ -1604,7 +1458,7 @@ def _integrate_listings(
    """
    listings = _load_listings_for_merge(listings_path, arcgis_path)
    print(f"Listings loaded: {listings.height}")
-    listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path)
+    listings = _enrich_listings_with_direct_epc(listings, epc_path)

    overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
    listing_attachment_columns = [
@ -1660,6 +1514,14 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
    """Project the post-rename wide frame down to enriched-listing rows."""
    df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())

+    # A matched listing's overlay attaches to every wide row sharing its
+    # (postcode, pp_address). The terminated-postcode remap can collapse several
+    # distinct wide rows onto one such key, which would otherwise emit one duplicate
+    # listing per collapsed row. Each listing matches exactly one (postcode,
+    # pp_address) and each seed row carries a unique URL, so keeping a single row per
+    # listing URL collapses only that fan-out and never merges distinct listings.
+    df = df.unique(subset=[_LISTING_FLAG_COLUMN], keep="first", maintain_order=True)
+
    df = df.with_columns(
        pl.col("_actual_listing_url").alias("Listing URL"),
        pl.col("_actual_listing_date").alias("Listing date"),
@ -1750,7 +1612,6 @@ def _build(
    broadband_path: Path,
    conservation_areas_path: Path,
    rental_prices_path: Path,
-    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
    tree_density_postcodes_path: Path | None = None,
@ -1881,8 +1742,10 @@ def _build(
        how="left",
    )

+    # Crime is counted spatially per postcode (incidents within 50m of the
+    # postcode boundary), so it joins on postcode rather than LSOA.
    crime = pl.scan_parquet(crime_path)
-    wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
+    wide = wide.join(crime, on="postcode", how="left")

    wide = wide.with_columns(
        pl.sum_horizontal(
@ -1905,17 +1768,6 @@ def _build(
        ).alias("minor_crime_avg_yr"),
    )

-    lsoa_pop = pl.scan_parquet(lsoa_population_path)
-    wide = wide.join(lsoa_pop, on="lsoa21", how="left")
-    wide = wide.with_columns(
-        pl.when(pl.col("population") > 0)
-        .then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
-        .alias("serious_crime_per_1k"),
-        pl.when(pl.col("population") > 0)
-        .then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
-        .alias("minor_crime_per_1k"),
-    ).drop("population")
-
    median_age = pl.scan_parquet(median_age_path)
    wide = wide.join(median_age, on="lsoa21", how="left")

@ -2082,8 +1934,6 @@ def _build(
                "max_download_speed": "Max available download speed (Mbps)",
                "serious_crime_avg_yr": "Serious crime (avg/yr)",
                "minor_crime_avg_yr": "Minor crime (avg/yr)",
-                "serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
-                "minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
                "mean_monthly_rent": "Estimated monthly rent",
                "floor_height": "Interior height (m)",
                "was_council_house": "Former council house",
@ -2189,12 +2039,6 @@ def main():
        required=True,
        help="ONS rental prices by LA and bedroom count parquet file",
    )
-    parser.add_argument(
-        "--lsoa-population",
-        type=Path,
-        required=True,
-        help="Census 2021 population by LSOA parquet file",
-    )
    parser.add_argument(
        "--median-age",
        type=Path,
@ -2279,7 +2123,6 @@ def main():
        broadband_path=args.broadband,
        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
-        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
        tree_density_postcodes_path=args.tree_density_postcodes,