has issues

2026-05-25 13:20:17 +01:00 · 2026-05-25 13:20:17 +01:00 · c645b0f1d4
commit c645b0f1d4
parent 2e112d7398
96 changed files with 2147083 additions and 5787 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -1,12 +1,27 @@
 import argparse
 import re

+import numpy as np
 import polars as pl
 from pathlib import Path

+import pyogrio
+from pyproj import Transformer
+from scipy.spatial import cKDTree
+from shapely import from_wkb, points
+from shapely.geometry.base import BaseGeometry
+from shapely.strtree import STRtree
+from thefuzz import fuzz
+
+from pipeline.utils.fuzzy_join import normalize_address_key
 from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10
+# Output column holding the "Yes"/"No" conservation-area flag for a postcode.
+CONSERVATION_AREA_FEATURE = "Within conservation area"
+# Output column holding the "Yes"/"No" listed-building flag for a property.
+LISTED_BUILDING_FEATURE = "Listed building"
+# Default upper bound on the distance between a listed-building point and a
+# candidate postcode, in the units of the easting/northing columns (metres,
+# going by the name).
+LISTED_BUILDING_MATCH_RADIUS_M = 250.0
+# Default number of nearest postcodes considered for each listed-building point.
+LISTED_BUILDING_NEAREST_POSTCODES = 3
+# Default minimum fuzzy token-set score for an address/name pair to match.
+LISTED_BUILDING_MIN_MATCH_SCORE = 95

 _IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
@ -24,6 +39,8 @@ _AREA_COLUMNS = [
    "lon",
    # Runtime provenance for deciding whether missing coordinates are skippable.
    "ctry25cd",
+    # Keyed lookup for postcode-level side tables (e.g. crime time series).
+    "lsoa21",
    # Deprivation
    "Income Score",
    "Employment Score",
@ -63,6 +80,7 @@ _AREA_COLUMNS = [
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
+    CONSERVATION_AREA_FEATURE,
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
@ -97,6 +115,20 @@ _RENT_SOURCE_UNAVAILABLE_LADS = {
    "E06000053": "Isles of Scilly",
    "E09000001": "City of London",
 }
+# Matches each run of digits; used to pull numbers out of address and
+# listed-building name keys.
+_NUMBER_RE = re.compile(r"\d+")
+# Upper-case filler words that are not counted as substantive tokens when
+# judging whether a listed-building name is specific enough to match on.
+_LISTED_NAME_STOP_WORDS = {
+    "A",
+    "AN",
+    "AND",
+    "AT",
+    "BY",
+    "IN",
+    "OF",
+    "ON",
+    "THE",
+    "TO",
+    "WITH",
+}


 def _is_dynamic_poi_metric_column(column: str) -> bool:
@ -105,6 +137,389 @@ def _is_dynamic_poi_metric_column(column: str) -> bool:
    )


+def _numbers_compatible(left: str, right: str) -> bool:
+    """Check that the digit runs in two keys do not contradict each other.
+
+    Two keys without any digits are compatible. If only one key contains
+    digits the pair is rejected. Otherwise the side with fewer distinct
+    numbers must have every one of them present on the other side.
+    """
+    numbers_a = set(_NUMBER_RE.findall(left))
+    numbers_b = set(_NUMBER_RE.findall(right))
+    if not numbers_a or not numbers_b:
+        # Compatible only when neither side carries a number.
+        return numbers_a == numbers_b
+    # With both sides non-empty, "the smaller set is a subset of the larger"
+    # is the same statement as "one set contains the other".
+    return numbers_a <= numbers_b or numbers_b <= numbers_a
+
+
+def _listed_candidate_schema() -> dict[str, pl.DataType]:
+    """Return the column-name to dtype mapping of the listed-candidate frame."""
+    text_columns = ("postcode", "_listed_match_name", "_listed_grade")
+    schema: dict[str, pl.DataType] = {name: pl.Utf8 for name in text_columns}
+    # The list-entry identifier is the only non-text column and comes last.
+    schema["_listed_entry"] = pl.Int64
+    return schema
+
+
+def _empty_listed_candidates() -> pl.DataFrame:
+    """Build a zero-row frame that carries the listed-candidate schema."""
+    schema = _listed_candidate_schema()
+    return pl.DataFrame(schema=schema)
+
+
+def _empty_listed_property_flags() -> pl.DataFrame:
+    """Build a zero-row frame with the property-flag columns, all Utf8."""
+    columns = ("postcode", "pp_address", LISTED_BUILDING_FEATURE)
+    return pl.DataFrame(schema={column: pl.Utf8 for column in columns})
+
+
+def _is_matchable_listed_name(name_key: str | None) -> bool:
+    """Decide whether a listed-building name key is specific enough to match.
+
+    A key qualifies when it contains a digit, or when it has at least two
+    tokens that are three or more characters long and are not stop words.
+    Empty or missing keys never qualify.
+    """
+    if not name_key:
+        return False
+    if _NUMBER_RE.search(name_key) is not None:
+        return True
+    substantive = 0
+    for word in name_key.split():
+        if len(word) < 3 or word in _LISTED_NAME_STOP_WORDS:
+            continue
+        substantive += 1
+    return substantive >= 2
+
+
+def _load_listed_building_points(listed_buildings_path: Path) -> pl.DataFrame:
+    """Read the listed-building attribute table and add a normalised name key.
+
+    Only the attribute columns are read (no geometry). Rows without a name,
+    easting, northing, or a usable normalised name key are dropped.
+
+    Raises:
+        ValueError: If the layer's geometry type is not a point type, or a
+            required attribute column is absent.
+    """
+    dtypes = {
+        "ListEntry": pl.Int64,
+        "Name": pl.Utf8,
+        "Grade": pl.Utf8,
+        "Easting": pl.Float64,
+        "Northing": pl.Float64,
+    }
+    wanted = list(dtypes)
+
+    layer_info = pyogrio.read_info(listed_buildings_path)
+    geometry_type = str(layer_info.get("geometry_type") or "")
+    if "Point" not in geometry_type:
+        raise ValueError(
+            f"Expected listed-building point data, got geometry {geometry_type!r}"
+        )
+
+    _, table = pyogrio.read_arrow(
+        listed_buildings_path,
+        columns=wanted,
+        read_geometry=False,
+    )
+    df = pl.from_arrow(table)
+    missing = sorted(name for name in wanted if name not in df.columns)
+    if missing:
+        raise ValueError(
+            f"{listed_buildings_path} is missing listed-building columns: {missing}"
+        )
+
+    typed = df.select([pl.col(name).cast(dtype) for name, dtype in dtypes.items()])
+    complete = typed.drop_nulls(["Name", "Easting", "Northing"])
+    keyed = complete.with_columns(
+        normalize_address_key(pl.col("Name")).alias("_listed_match_name")
+    )
+    return keyed.filter(pl.col("_listed_match_name").is_not_null())
+
+
+def _postcode_listed_building_candidates(
+    listed_points: pl.DataFrame,
+    active_postcodes: pl.DataFrame,
+    *,
+    nearest_postcodes: int = LISTED_BUILDING_NEAREST_POSTCODES,
+    max_distance_m: float = LISTED_BUILDING_MATCH_RADIUS_M,
+) -> pl.DataFrame:
+    """Pair every matchable listed-building point with its nearby postcodes.
+
+    Each listed point is looked up against a KD-tree of postcode coordinates
+    and paired with up to ``nearest_postcodes`` postcodes that lie within
+    ``max_distance_m``. Points whose name key is not matchable are skipped.
+
+    Returns:
+        A frame with the listed-candidate schema, unique on
+        (postcode, name key, list entry); empty when nothing qualifies.
+
+    Raises:
+        ValueError: If either input lacks a required column.
+    """
+    if listed_points.is_empty() or active_postcodes.is_empty():
+        return _empty_listed_candidates()
+
+    postcode_cols = ["postcode", "east1m", "north1m"]
+    missing = sorted(set(postcode_cols) - set(active_postcodes.columns))
+    if missing:
+        raise ValueError(f"Active postcode data missing required columns: {missing}")
+
+    listed_cols = {"_listed_match_name", "Grade", "ListEntry", "Easting", "Northing"}
+    missing = sorted(listed_cols - set(listed_points.columns))
+    if missing:
+        raise ValueError(f"Listed-building data missing required columns: {missing}")
+
+    # Keep only rows whose key and coordinates are present and finite.
+    postcodes = active_postcodes.drop_nulls(postcode_cols).filter(
+        pl.col("east1m").is_finite() & pl.col("north1m").is_finite()
+    )
+    listed = listed_points.drop_nulls(
+        ["_listed_match_name", "Easting", "Northing"]
+    ).filter(pl.col("Easting").is_finite() & pl.col("Northing").is_finite())
+    if postcodes.is_empty() or listed.is_empty():
+        return _empty_listed_candidates()
+
+    postcode_total = postcodes.height
+    postcode_xy = np.column_stack(
+        [postcodes["east1m"].to_numpy(), postcodes["north1m"].to_numpy()]
+    )
+    listed_xy = np.column_stack(
+        [listed["Easting"].to_numpy(), listed["Northing"].to_numpy()]
+    )
+    neighbour_count = max(1, min(nearest_postcodes, postcode_total))
+    distances, indices = cKDTree(postcode_xy).query(
+        listed_xy,
+        k=neighbour_count,
+        distance_upper_bound=max_distance_m,
+    )
+    if neighbour_count == 1:
+        # A single-neighbour query returns 1-D arrays; restore the k axis so
+        # the per-point iteration below is uniform.
+        distances = distances[:, np.newaxis]
+        indices = indices[:, np.newaxis]
+
+    postcode_values = postcodes["postcode"].to_list()
+    per_point = zip(
+        listed["_listed_match_name"].to_list(),
+        listed["Grade"].to_list(),
+        listed["ListEntry"].to_list(),
+        distances,
+        indices,
+    )
+
+    rows: list[tuple[str, str, str | None, int | None]] = []
+    for name_key, grade, entry, point_distances, point_indices in per_point:
+        if not _is_matchable_listed_name(name_key):
+            continue
+        emitted: set[str] = set()
+        for distance, postcode_idx in zip(point_distances, point_indices):
+            # Skip neighbour slots the query left unfilled (non-finite
+            # distance or an index outside the postcode table).
+            if postcode_idx >= postcode_total or not np.isfinite(distance):
+                continue
+            postcode = postcode_values[int(postcode_idx)]
+            if postcode in emitted:
+                continue
+            emitted.add(postcode)
+            rows.append((postcode, name_key, grade, entry))
+
+    if not rows:
+        return _empty_listed_candidates()
+
+    schema = _listed_candidate_schema()
+    candidates = pl.DataFrame(rows, schema=list(schema), orient="row").cast(schema)
+    return candidates.unique(["postcode", "_listed_match_name", "_listed_entry"])
+
+
+def _matched_listed_building_flags(
+    properties: pl.LazyFrame,
+    listed_candidates: pl.DataFrame,
+    *,
+    min_score: int = LISTED_BUILDING_MIN_MATCH_SCORE,
+) -> pl.DataFrame:
+    """Return property keys that conservatively match a listed-building name.
+
+    A property matches when, within its own postcode, one of its normalised
+    address keys (price-paid or EPC) is number-compatible with a candidate
+    listed-building name and their token-set score is at least ``min_score``.
+
+    Args:
+        properties: Lazy frame with ``postcode``, ``pp_address`` and
+            ``epc_address`` columns.
+        listed_candidates: Frame with ``postcode`` and ``_listed_match_name``
+            columns.
+        min_score: Minimum ``fuzz.token_set_ratio`` score to accept a pair.
+
+    Returns:
+        A frame of (postcode, pp_address, listed-building flag) with the flag
+        set to "Yes", unique on (postcode, pp_address); empty when nothing
+        matches.
+    """
+    if listed_candidates.is_empty():
+        return _empty_listed_property_flags()
+
+    candidate_postcodes = listed_candidates.select("postcode").unique()
+    property_candidates = (
+        properties.select("postcode", "pp_address", "epc_address")
+        # Only properties in a postcode that has at least one candidate.
+        .join(candidate_postcodes.lazy(), on="postcode", how="semi")
+        .with_columns(
+            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
+            normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
+        )
+        .filter(
+            pl.col("pp_address").is_not_null()
+            & (
+                pl.col("_pp_match_address").is_not_null()
+                | pl.col("_epc_match_address").is_not_null()
+            )
+        )
+        .collect(engine="streaming")
+    )
+    if property_candidates.is_empty():
+        return _empty_listed_property_flags()
+
+    # Several list entries can share one name within a postcode, so keep each
+    # name once (a dict serves as an insertion-ordered set) instead of
+    # re-scoring identical strings for every property row.
+    listed_by_postcode: dict[str, dict[str, None]] = {}
+    for postcode, name in listed_candidates.select(
+        "postcode", "_listed_match_name"
+    ).iter_rows():
+        if postcode and name:
+            listed_by_postcode.setdefault(postcode, {})[name] = None
+
+    matches: list[tuple[str, str, str]] = []
+    for row in property_candidates.iter_rows(named=True):
+        postcode = row["postcode"]
+        listed_names = listed_by_postcode.get(postcode)
+        if not listed_names:
+            continue
+
+        # The price-paid and EPC keys may coincide; score each distinct key
+        # once, price-paid first.
+        address_keys = dict.fromkeys(
+            key
+            for key in (row["_pp_match_address"], row["_epc_match_address"])
+            if key
+        )
+        # The cheap number check runs first so the fuzzy score is computed
+        # only for number-compatible pairs; any() stops at the first hit.
+        matched = any(
+            _numbers_compatible(address_key, listed_name)
+            and fuzz.token_set_ratio(address_key, listed_name) >= min_score
+            for address_key in address_keys
+            for listed_name in listed_names
+        )
+        if matched:
+            matches.append((postcode, row["pp_address"], "Yes"))
+
+    if not matches:
+        return _empty_listed_property_flags()
+
+    return (
+        pl.DataFrame(
+            matches,
+            schema=["postcode", "pp_address", LISTED_BUILDING_FEATURE],
+            orient="row",
+        )
+        .cast(
+            {
+                "postcode": pl.Utf8,
+                "pp_address": pl.Utf8,
+                LISTED_BUILDING_FEATURE: pl.Utf8,
+            }
+        )
+        .unique(["postcode", "pp_address"])
+    )
+
+
+def _listed_building_flags(
+    properties: pl.LazyFrame,
+    active_postcodes: pl.DataFrame,
+    listed_buildings_path: Path,
+) -> pl.DataFrame:
+    """Load listed-building points and flag the properties that match them.
+
+    Runs the three stages in order (load points, pair points with nearby
+    postcodes, fuzzy-match names to property addresses) and prints a
+    progress line around each stage.
+    """
+    print(f"Loading listed-building points from {listed_buildings_path}...")
+    listed_df = _load_listed_building_points(listed_buildings_path)
+    print(f"Loaded {listed_df.height} listed-building point records")
+
+    candidates = _postcode_listed_building_candidates(listed_df, active_postcodes)
+    nearby_postcode_count = candidates["postcode"].n_unique()
+    print(
+        "Matching listed-building names to property addresses across "
+        f"{nearby_postcode_count} nearby postcodes..."
+    )
+
+    matched_flags = _matched_listed_building_flags(properties, candidates)
+    print(
+        f"Matched {matched_flags.height} property addresses to listed-building entries"
+    )
+    return matched_flags
+
+
+def _normalise_crs(crs: object | None) -> str:
+    """Stringify a CRS value, substituting "EPSG:4326" when it is falsy."""
+    if not crs:
+        return "EPSG:4326"
+    return str(crs)
+
+
+def _load_conservation_area_geometries(
+    conservation_areas_path: Path,
+) -> tuple[list[BaseGeometry], str]:
+    """Read the layer's geometries and report the CRS they are expressed in.
+
+    No attribute columns are requested. Missing and empty geometries are
+    discarded.
+
+    Returns:
+        The remaining geometries and the layer CRS as a string (falling back
+        to "EPSG:4326" when the metadata has none).
+
+    Raises:
+        ValueError: If no geometry survives the filtering.
+    """
+    metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=[])
+    # Fall back to the last column when the metadata names no geometry column.
+    geometry_name = metadata.get("geometry_name") or table.column_names[-1]
+    wkb_values = table[geometry_name].combine_chunks().to_pylist()
+    geometries = [
+        shape
+        for shape in from_wkb(wkb_values)
+        if shape is not None and not shape.is_empty
+    ]
+    if not geometries:
+        raise ValueError(
+            f"{conservation_areas_path} does not contain any usable polygon geometries"
+        )
+    return geometries, _normalise_crs(metadata.get("crs"))
+
+
+def _postcode_conservation_area_flags(
+    postcodes: pl.DataFrame,
+    conservation_geometries: list[BaseGeometry],
+    conservation_crs: object | None,
+    batch_size: int = 100_000,
+) -> pl.DataFrame:
+    """Flag each postcode "Yes"/"No" for lying inside a conservation geometry.
+
+    Postcode ``lon``/``lat`` values are treated as EPSG:4326 and transformed
+    into ``conservation_crs`` before the intersection test. Postcodes whose
+    coordinates are null or non-finite are reported as "No".
+
+    Args:
+        postcodes: Frame with ``postcode``, ``lat`` and ``lon`` columns.
+        conservation_geometries: Geometries to test membership against.
+        conservation_crs: CRS of the geometries; a falsy value means
+            "EPSG:4326".
+        batch_size: Number of points tested per spatial-index query; must be
+            positive.
+
+    Returns:
+        One row per distinct non-null postcode with the conservation-area
+        flag column.
+
+    Raises:
+        ValueError: If a required column is missing or ``batch_size`` is not
+            positive.
+    """
+    required = {"postcode", "lat", "lon"}
+    missing = sorted(required - set(postcodes.columns))
+    if missing:
+        raise ValueError(f"Postcode data missing required columns: {missing}")
+    if batch_size <= 0:
+        # A negative step would skip the batch loop entirely and silently
+        # flag every postcode "No"; a zero step fails inside range().
+        raise ValueError(f"batch_size must be positive, got {batch_size}")
+
+    all_postcodes = postcodes.select("postcode").drop_nulls().unique()
+    valid_points = postcodes.select("postcode", "lat", "lon").drop_nulls()
+    if not valid_points.is_empty():
+        finite = np.isfinite(valid_points["lat"].to_numpy()) & np.isfinite(
+            valid_points["lon"].to_numpy()
+        )
+        valid_points = valid_points.filter(pl.Series(finite))
+    if valid_points.is_empty():
+        return all_postcodes.with_columns(pl.lit("No").alias(CONSERVATION_AREA_FEATURE))
+
+    lat = valid_points["lat"].to_numpy()
+    lon = valid_points["lon"].to_numpy()
+    # always_xy=True makes the transformer take (lon, lat) in that order.
+    transformer = Transformer.from_crs(
+        "EPSG:4326", _normalise_crs(conservation_crs), always_xy=True
+    )
+    x, y = transformer.transform(lon, lat)
+
+    tree = STRtree(conservation_geometries)
+    inside = np.zeros(valid_points.height, dtype=bool)
+    for start in range(0, valid_points.height, batch_size):
+        end = min(start + batch_size, valid_points.height)
+        point_batch = points(x[start:end], y[start:end])
+        matches = tree.query(point_batch, predicate="intersects")
+        if matches.size > 0:
+            # Row 0 holds batch-local point indices; shift them to global.
+            inside[start + matches[0]] = True
+
+    matched = (
+        valid_points.select("postcode")
+        .with_columns(pl.Series("_within_conservation_area", inside))
+        # A postcode can appear on several rows; it is inside if any row is.
+        .group_by("postcode")
+        .agg(pl.col("_within_conservation_area").max())
+        .with_columns(
+            pl.when(pl.col("_within_conservation_area"))
+            .then(pl.lit("Yes"))
+            .otherwise(pl.lit("No"))
+            .alias(CONSERVATION_AREA_FEATURE)
+        )
+        .select("postcode", CONSERVATION_AREA_FEATURE)
+    )
+    return (
+        all_postcodes.join(matched, on="postcode", how="left")
+        # Postcodes dropped for unusable coordinates have no match row.
+        .with_columns(pl.col(CONSERVATION_AREA_FEATURE).fill_null("No"))
+        .select("postcode", CONSERVATION_AREA_FEATURE)
+    )
+
+
+def _conservation_area_by_postcode(
+    postcodes: pl.LazyFrame,
+    conservation_areas_path: Path,
+) -> pl.LazyFrame:
+    """Compute the per-postcode conservation-area flag as a lazy frame.
+
+    Loads the geometries, collects the postcode coordinates, and wraps the
+    eager membership result back into a LazyFrame. Prints a progress line
+    before loading and before the membership computation.
+    """
+    print(f"Loading conservation area polygons from {conservation_areas_path}...")
+    geometries, crs = _load_conservation_area_geometries(conservation_areas_path)
+
+    coordinate_query = postcodes.select("postcode", "lat", "lon")
+    postcode_points = coordinate_query.collect(engine="streaming")
+    postcode_count = postcode_points.height
+    print(
+        "Computing conservation area membership for "
+        f"{postcode_count} active English postcodes..."
+    )
+
+    flags = _postcode_conservation_area_flags(postcode_points, geometries, crs)
+    return flags.lazy()
+
+
 def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
    non_null_count = pl.col(column).count()
@ -234,11 +649,13 @@ def _build(
    noise_path: Path,
    school_proximity_path: Path,
    broadband_path: Path,
+    conservation_areas_path: Path,
    rental_prices_path: Path,
    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
    tree_density_postcodes_path: Path | None = None,
+    listed_buildings_path: Path | None = None,
 ) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Build postcode and properties dataframes from epc_pp + auxiliary data.

@ -273,6 +690,29 @@ def _build(
    ).unique(["postcode"])
    wide = wide.join(postcode_country, on="postcode", how="left")

+    if listed_buildings_path is not None:
+        active_postcodes_for_listed = (
+            arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
+            .filter(pl.col("doterm").is_null())
+            .select(
+                pl.col("pcds").alias("postcode"),
+                "east1m",
+                "north1m",
+            )
+            .collect(engine="streaming")
+        )
+        listed_flags = _listed_building_flags(
+            wide.select("postcode", "pp_address", "epc_address"),
+            active_postcodes_for_listed,
+            listed_buildings_path,
+        )
+        wide = wide.join(listed_flags.lazy(), on=["postcode", "pp_address"], how="left")
+    else:
+        wide = wide.with_columns(
+            pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE)
+        )
+    wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))
+
    arcgis = (
        arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")  # England only
        .filter(pl.col("doterm").is_null())  # Active postcodes only
@ -382,6 +822,13 @@ def _build(
    school_proximity = pl.scan_parquet(school_proximity_path)
    wide = wide.join(school_proximity, on="postcode", how="left")

+    conservation_areas = _conservation_area_by_postcode(
+        arcgis.select("postcode", "lat", "lon"), conservation_areas_path
+    )
+    wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
+        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
+    )
+
    if tree_density_postcodes_path is not None:
        tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
        wide = wide.join(tree_density, on="postcode", how="left")
@ -476,7 +923,6 @@ def _build(
            "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
            "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
            "Barriers to Housing and Services Score",
-            "lsoa21",
            "oa21",
            "pcon",
            "epc_property_type",
@ -598,6 +1044,18 @@ def main():
        required=True,
        help="Broadband performance by output area parquet file",
    )
+    parser.add_argument(
+        "--conservation-areas",
+        type=Path,
+        required=True,
+        help="Historic England conservation areas GeoPackage",
+    )
+    parser.add_argument(
+        "--listed-buildings",
+        type=Path,
+        required=False,
+        help="Historic England NHLE listed-building points GeoPackage",
+    )
    parser.add_argument(
        "--rental-prices",
        type=Path,
@ -652,11 +1110,13 @@ def main():
        noise_path=args.noise,
        school_proximity_path=args.school_proximity,
        broadband_path=args.broadband,
+        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
        tree_density_postcodes_path=args.tree_density_postcodes,
+        listed_buildings_path=args.listed_buildings,
    )

    print(f"\nPostcode columns: {postcode_df.columns}")