has issues

2026-05-25 13:20:17 +01:00 · 2026-05-25 13:20:17 +01:00 · c645b0f1d4
commit c645b0f1d4
parent 2e112d7398
96 changed files with 2147083 additions and 5787 deletions
--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@ -7,6 +7,27 @@ import polars as pl
 STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
 MONTH_RE = r"^\d{4}-\d{2}$"

+# Crime types that roll up into "Serious crime" / "Minor crime" aggregates.
+# Must match the names used in pipeline/transform/merge.py for the sum_horizontal expressions.
+# Both tuples are consumed by `_write_crime_by_year`, which passes them to
+# `_rollup_long` to build the synthetic "Serious crime" and "Minor crime" series.
+SERIOUS_CRIME_TYPES = (
+    "Violence and sexual offences",
+    "Robbery",
+    "Burglary",
+    "Possession of weapons",
+)
+MINOR_CRIME_TYPES = (
+    "Anti-social behaviour",
+    "Criminal damage and arson",
+    "Shoplifting",
+    "Bicycle theft",
+    "Theft from the person",
+    "Other theft",
+    "Vehicle crime",
+    "Public order",
+    "Drugs",
+    "Other crime",
+)
+

 def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    csvs = sorted(crime_dir.rglob("*.csv"))
@ -14,7 +35,12 @@ def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    return street_csvs, len(csvs) - len(street_csvs)


-def transform_crime(crime_dir: Path, output_path: Path) -> None:
+def transform_crime(
+    crime_dir: Path,
+    output_path: Path,
+    by_year_output_path: Path | None = None,
+    lsoa_lookup_path: Path | None = None,
+) -> None:
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
@ -38,6 +64,8 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
        },
    ).select("LSOA code", "Crime type", "Month")

+    df = _apply_lsoa_2011_to_2021(df, lsoa_lookup_path)
+
    valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
    valid_months = (
        df.filter(valid_month_expr)
@ -57,6 +85,9 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
    )

    # Count monthly incidents, then annualise over every valid month in the dataset.
+    # `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
+    # into N 2021 LSOAs contribute 1/N of their count to each child, since we
+    # don't know which child a given incident actually belonged to.
    yearly_counts = (
        df.filter(
            valid_month_expr
@ -66,7 +97,7 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
            & (pl.col("Crime type") != "")
        )
        .group_by("LSOA code", "Month", "Crime type")
-        .agg(pl.len().alias("count"))
+        .agg((pl.col("_weight").first() * pl.len()).alias("count"))
        .group_by("LSOA code", "Crime type")
        .agg(
            (pl.col("count").sum() / pl.lit(valid_month_count) * 12)
@ -98,6 +129,118 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")

+    if by_year_output_path is not None:
+        _write_crime_by_year(df, valid_month_expr, by_year_output_path)
+
+
+def _write_crime_by_year(
+    df: pl.LazyFrame, valid_month_expr: pl.Expr, by_year_output_path: Path
+) -> None:
+    """Emit per-LSOA per-type per-year crime counts as nested list[struct] columns.
+
+    Partial years are scaled to a 12-month-equivalent count so cross-year trends
+    aren't distorted by months missing from the source data.
+
+    Args:
+        df: Incident-level frame with "LSOA code", "Crime type", "Month" and a
+            per-row `_weight` column (each row counts as `_weight` incidents).
+        valid_month_expr: Boolean expression selecting rows with a usable "Month".
+        by_year_output_path: Destination parquet path (written with zstd).
+
+    Raises:
+        ValueError: If no rows survive the validity filters.
+    """
+    filtered = df.filter(
+        valid_month_expr
+        & pl.col("LSOA code").is_not_null()
+        & (pl.col("LSOA code") != "")
+        & pl.col("Crime type").is_not_null()
+        & (pl.col("Crime type") != "")
+    ).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
+
+    # Months observed *anywhere* in the dataset for each year (annualisation denominator).
+    # Using crime-type-specific months would over-scale years where a rare type appears
+    # in only some months.
+    months_per_year = filtered.group_by("year").agg(
+        pl.col("Month").n_unique().alias("months_in_year")
+    )
+
+    yearly_per_type = (
+        filtered.group_by("LSOA code", "Crime type", "year")
+        # Sum the per-incident weights instead of `first() * len()`: once codes
+        # are remapped, a single 2021 LSOA can collect rows from several 2011
+        # parents that carry different weights, so no single weight describes
+        # the whole group (and `first()` would depend on row order).
+        .agg(pl.col("_weight").sum().alias("count"))
+        .join(months_per_year, on="year")
+        .with_columns(
+            (pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))
+            .round(1)
+            .alias("count")
+        )
+        .select("LSOA code", "Crime type", "year", "count")
+        .collect(engine="streaming")
+    )
+
+    if yearly_per_type.is_empty():
+        raise ValueError("No valid crime rows for by-year output")
+
+    serious_rollup = _rollup_long(yearly_per_type, SERIOUS_CRIME_TYPES, "Serious crime")
+    minor_rollup = _rollup_long(yearly_per_type, MINOR_CRIME_TYPES, "Minor crime")
+    combined = pl.concat([yearly_per_type, serious_rollup, minor_rollup])
+
+    # Sort before grouping so each LSOA/type series lists its years in order.
+    by_lsoa_type = (
+        combined.sort("year")
+        .group_by("LSOA code", "Crime type")
+        .agg(pl.struct("year", "count").alias("series"))
+    )
+
+    # One column per crime type, each cell holding that LSOA's list of
+    # {year, count} structs.
+    wide_by_year = by_lsoa_type.pivot(
+        on="Crime type", index="LSOA code", values="series"
+    )
+
+    type_cols = [c for c in wide_by_year.columns if c != "LSOA code"]
+    wide_by_year = wide_by_year.rename({col: f"{col} (by year)" for col in type_cols})
+
+    print(f"By-year output shape: {wide_by_year.shape}")
+    print(f"By-year columns: {wide_by_year.columns}")
+
+    wide_by_year.write_parquet(by_year_output_path, compression="zstd")
+    print(f"Saved by-year output to {by_year_output_path}")
+
+
+def _rollup_long(
+    yearly_per_type: pl.DataFrame, types: tuple[str, ...], rollup_name: str
+) -> pl.DataFrame:
+    """Collapse the given crime types into one synthetic type per LSOA and year.
+
+    Rows whose "Crime type" is in `types` are summed per ("LSOA code", "year")
+    and relabelled as `rollup_name`; the result has the same four columns as
+    the input.
+    """
+    members = yearly_per_type.filter(pl.col("Crime type").is_in(list(types)))
+    totals = members.group_by("LSOA code", "year").agg(
+        pl.col("count").sum().round(1).alias("count")
+    )
+    labelled = totals.with_columns(pl.lit(rollup_name).alias("Crime type"))
+    return labelled.select("LSOA code", "Crime type", "year", "count")
+
+
+def _apply_lsoa_2011_to_2021(
+    df: pl.LazyFrame, lsoa_lookup_path: Path | None
+) -> pl.LazyFrame:
+    """Remap pre-2022 LSOA 2011 codes to LSOA 2021 codes.
+
+    Police.uk reports older years using LSOA 2011 codes; the rest of the pipeline
+    keys on LSOA 2021. Without remapping, those years silently fail to join and
+    the crime-over-time chart only shows post-2022 data.
+
+    For 1:1 mappings the LSOA code is rewritten in place. For 1→N splits (one
+    2011 LSOA becoming several 2021 ones), each child gets an even share via
+    `_weight = 1/N` since the source CSVs don't tell us which child a given
+    incident actually fell into.
+
+    Args:
+        df: Incident-level frame with an "LSOA code" column.
+        lsoa_lookup_path: Parquet with `lsoa11` and `lsoa21` columns, or None
+            to skip remapping.
+
+    Returns:
+        `df` with "LSOA code" rewritten where the lookup matches and a `_weight`
+        column added (1.0 for rows the lookup does not cover, or when no lookup
+        is given).
+    """
+    if lsoa_lookup_path is None:
+        return df.with_columns(pl.lit(1.0).alias("_weight"))
+
+    # De-duplicate and drop incomplete pairs first: a repeated (lsoa11, lsoa21)
+    # row would both duplicate every joined incident and inflate N, and a null
+    # lsoa21 would leave a group with a zero count to divide by.
+    lookup = (
+        pl.scan_parquet(lsoa_lookup_path)
+        .select("lsoa11", "lsoa21")
+        .drop_nulls()
+        .unique()
+    )
+    weighted = lookup.with_columns(
+        (1.0 / pl.col("lsoa21").count().over("lsoa11")).alias("_weight")
+    )
+    return (
+        df.join(weighted, left_on="LSOA code", right_on="lsoa11", how="left")
+        .with_columns(
+            # Codes absent from the lookup keep their original value and full weight.
+            pl.coalesce("lsoa21", "LSOA code").alias("LSOA code"),
+            pl.col("_weight").fill_null(1.0),
+        )
+        .drop("lsoa21")
+    )
+

 def main() -> None:
    parser = argparse.ArgumentParser(
@ -109,8 +252,22 @@ def main() -> None:
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
+    parser.add_argument(
+        "--output-by-year",
+        type=Path,
+        required=False,
+        help="Optional output parquet for per-LSOA per-year per-type counts (nested list[struct])",
+    )
+    parser.add_argument(
+        "--lsoa-lookup",
+        type=Path,
+        required=False,
+        help="Optional parquet with columns (lsoa11, lsoa21) for remapping pre-2022 codes",
+    )
    args = parser.parse_args()
-    transform_crime(args.input, args.output)
+    transform_crime(
+        args.input, args.output, args.output_by_year, args.lsoa_lookup
+    )


 if __name__ == "__main__":
--- a/pipeline/transform/crime_hotspot_tiles.py
+++ b/pipeline/transform/crime_hotspot_tiles.py
@ -0,0 +1,159 @@
+"""Build PMTiles point tiles for the crime heatmap overlay.
+
+The output intentionally keeps point features rather than H3/grid aggregates so
+MapLibre can render a true client-side heatmap. Police.uk coordinates are
+published anonymous map points, not exact offence locations.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+import polars as pl
+
+from pipeline.local_temp import local_tmp_dir
+from pipeline.transform.crime import find_street_crime_csvs
+
+
+def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
+    """Return the last `month_count` month names found under `crime_dir`.
+
+    Month names are the parent-directory names of the street crime CSVs,
+    de-duplicated and sorted as strings.
+
+    Raises:
+        ValueError: If `month_count` is less than 1.
+        FileNotFoundError: If no street crime CSVs are found.
+    """
+    # Guard the slice below: `months[-0:]` would return every month, and a
+    # negative count would drop the *oldest* months instead of keeping the latest.
+    if month_count < 1:
+        raise ValueError(f"month_count must be at least 1, got {month_count}")
+    csvs, _ignored = find_street_crime_csvs(crime_dir)
+    months = sorted({path.parent.name for path in csvs})
+    if not months:
+        raise FileNotFoundError(f"No street crime CSVs found in {crime_dir}")
+    return months[-month_count:]
+
+
+def _street_csvs_for_months(crime_dir: Path, months: set[str]) -> list[Path]:
+    """Return the street crime CSVs whose parent directory name is in `months`.
+
+    Raises:
+        FileNotFoundError: If none of the discovered CSVs belongs to `months`.
+    """
+    all_csvs, _ = find_street_crime_csvs(crime_dir)
+    wanted = [csv_path for csv_path in all_csvs if csv_path.parent.name in months]
+    if wanted:
+        return wanted
+    raise FileNotFoundError(f"No street crime CSVs found for {sorted(months)}")
+
+
+def _require_tippecanoe() -> str:
+    """Return the path of the `tippecanoe` executable located by `shutil.which`.
+
+    Raises:
+        RuntimeError: If tippecanoe cannot be found.
+    """
+    found = shutil.which("tippecanoe")
+    if found is not None:
+        return found
+    raise RuntimeError(
+        "tippecanoe is required to build crime hotspot PMTiles. "
+        "Install tippecanoe and rerun this target."
+    )
+
+
+def _write_geojsonseq(csvs: list[Path], output_path: Path) -> int:
+    """Write one GeoJSON Point feature per line for every in-bounds crime row.
+
+    Rows with missing coordinates, or outside longitude [-9.5, 5.0] /
+    latitude [49.0, 57.0], are dropped. Each feature carries `count`, `weight`,
+    `month` and `crime_type` properties.
+
+    Returns:
+        The number of features written to `output_path`.
+    """
+    column_types = {
+        "Longitude": pl.Float64,
+        "Latitude": pl.Float64,
+        "Month": pl.Utf8,
+        "Crime type": pl.Utf8,
+    }
+    inside_box = pl.col("lon").is_between(-9.5, 5.0) & pl.col("lat").is_between(
+        49.0, 57.0
+    )
+    located = (
+        pl.scan_csv(csvs, schema_overrides=column_types, ignore_errors=True)
+        .select(
+            lon=pl.col("Longitude"),
+            lat=pl.col("Latitude"),
+            month=pl.col("Month"),
+            crime_type=pl.col("Crime type"),
+        )
+        .drop_nulls(["lon", "lat"])
+        .filter(inside_box)
+        .collect(engine="streaming")
+    )
+
+    def _feature_line(lon, lat, month, crime_type) -> str:
+        # Compact separators keep the newline-delimited file as small as possible.
+        feature = {
+            "type": "Feature",
+            "geometry": {"type": "Point", "coordinates": [lon, lat]},
+            "properties": {
+                "count": 1,
+                "weight": 1,
+                "month": month,
+                "crime_type": crime_type,
+            },
+        }
+        return json.dumps(feature, separators=(",", ":")) + "\n"
+
+    with output_path.open("w") as sink:
+        # Rows arrive as (lon, lat, month, crime_type) tuples in select order.
+        sink.writelines(_feature_line(*record) for record in located.iter_rows())
+
+    return located.height
+
+
+def build_crime_hotspot_tiles(
+    crime_dir: Path,
+    output_path: Path,
+    months: int,
+    min_zoom: int,
+    max_zoom: int,
+) -> None:
+    """Render the latest `months` of street crime points into a tippecanoe archive.
+
+    The points are first written to a temporary newline-delimited GeoJSON file,
+    which tippecanoe then tiles into `output_path` (layer `crime_hotspots`)
+    between `min_zoom` and `max_zoom`.
+
+    Raises:
+        RuntimeError: If tippecanoe is not installed.
+        FileNotFoundError: If no street crime CSVs are available.
+        subprocess.CalledProcessError: If tippecanoe exits with a non-zero status.
+    """
+    tippecanoe_bin = _require_tippecanoe()
+    month_window = set(_latest_months(crime_dir, months))
+    source_csvs = _street_csvs_for_months(crime_dir, month_window)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
+        features_path = Path(scratch) / "crime_hotspots.geojsonseq"
+        point_total = _write_geojsonseq(source_csvs, features_path)
+        first_month = min(month_window)
+        last_month = max(month_window)
+        print(
+            f"Writing {point_total:,} approximate crime heatmap points "
+            f"from {first_month} to {last_month}"
+        )
+
+        command = [
+            tippecanoe_bin,
+            "--force",
+            "--output",
+            str(output_path),
+            "--layer",
+            "crime_hotspots",
+            "--minimum-zoom",
+            str(min_zoom),
+            "--maximum-zoom",
+            str(max_zoom),
+            "--drop-densest-as-needed",
+            "--extend-zooms-if-still-dropping",
+            str(features_path),
+        ]
+        # Must run inside the `with` block: the input file lives in the temp dir.
+        subprocess.run(command, check=True)
+
+
+def main() -> None:
+    """Parse the command-line options and build the crime hotspot tiles."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--input", type=Path, required=True, help="Crime CSV directory")
+    cli.add_argument(
+        "--output", type=Path, required=True, help="Output .pmtiles path"
+    )
+    cli.add_argument(
+        "--months",
+        type=int,
+        default=12,
+        help="Latest complete months to include in the heatmap",
+    )
+    cli.add_argument("--min-zoom", type=int, default=12)
+    cli.add_argument("--max-zoom", type=int, default=16)
+    options = cli.parse_args()
+
+    build_crime_hotspot_tiles(
+        crime_dir=options.input,
+        output_path=options.output,
+        months=options.months,
+        min_zoom=options.min_zoom,
+        max_zoom=options.max_zoom,
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -1,12 +1,27 @@
 import argparse
 import re

+import numpy as np
 import polars as pl
 from pathlib import Path

+import pyogrio
+from pyproj import Transformer
+from scipy.spatial import cKDTree
+from shapely import from_wkb, points
+from shapely.geometry.base import BaseGeometry
+from shapely.strtree import STRtree
+from thefuzz import fuzz
+
+from pipeline.utils.fuzzy_join import normalize_address_key
 from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10
+# Output column names for the two heritage flags ("Yes"/"No" values).
+CONSERVATION_AREA_FEATURE = "Within conservation area"
+LISTED_BUILDING_FEATURE = "Listed building"
+# Default upper bound on the listed-point → postcode nearest-neighbour search
+# (presumably metres, matching the easting/northing inputs — confirm).
+LISTED_BUILDING_MATCH_RADIUS_M = 250.0
+# Default number of nearest postcodes each listed-building point is offered to.
+LISTED_BUILDING_NEAREST_POSTCODES = 3
+# Default minimum `fuzz.token_set_ratio` for an address to count as a match.
+LISTED_BUILDING_MIN_MATCH_SCORE = 95

 _IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
@ -24,6 +39,8 @@ _AREA_COLUMNS = [
    "lon",
    # Runtime provenance for deciding whether missing coordinates are skippable.
    "ctry25cd",
+    # Keyed lookup for postcode-level side tables (e.g. crime time series).
+    "lsoa21",
    # Deprivation
    "Income Score",
    "Employment Score",
@ -63,6 +80,7 @@ _AREA_COLUMNS = [
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
+    CONSERVATION_AREA_FEATURE,
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
@ -97,6 +115,20 @@ _RENT_SOURCE_UNAVAILABLE_LADS = {
    "E06000053": "Isles of Scilly",
    "E09000001": "City of London",
 }
+# Matches each run of digits; used to compare numbers in addresses and listed names.
+_NUMBER_RE = re.compile(r"\d+")
+# Upper-case tokens ignored when deciding whether a number-free listed name has
+# enough substantive words to be worth matching (see `_is_matchable_listed_name`).
+_LISTED_NAME_STOP_WORDS = {
+    "A",
+    "AN",
+    "AND",
+    "AT",
+    "BY",
+    "IN",
+    "OF",
+    "ON",
+    "THE",
+    "TO",
+    "WITH",
+}


 def _is_dynamic_poi_metric_column(column: str) -> bool:
@ -105,6 +137,389 @@ def _is_dynamic_poi_metric_column(column: str) -> bool:
    )


+def _numbers_compatible(left: str, right: str) -> bool:
+    """Return True when the digit runs in `left` and `right` do not conflict.
+
+    Both sides having no numbers is compatible; numbers on only one side is
+    not; otherwise the smaller set of numbers must be contained in the larger.
+    """
+    fewer = set(_NUMBER_RE.findall(left))
+    more = set(_NUMBER_RE.findall(right))
+    if len(fewer) > len(more):
+        fewer, more = more, fewer
+    if not fewer:
+        # Nothing to compare on the smaller side: fine only if the other is empty too.
+        return not more
+    return fewer <= more
+
+
+def _listed_candidate_schema() -> dict[str, pl.DataType]:
+    """Return the column-name → dtype schema used for listed-building candidate frames."""
+    return {
+        "postcode": pl.Utf8,
+        "_listed_match_name": pl.Utf8,
+        "_listed_grade": pl.Utf8,
+        "_listed_entry": pl.Int64,
+    }
+
+
+def _empty_listed_candidates() -> pl.DataFrame:
+    """Return a zero-row frame with the `_listed_candidate_schema()` columns."""
+    return pl.DataFrame(schema=_listed_candidate_schema())
+
+
+def _empty_listed_property_flags() -> pl.DataFrame:
+    """Return a zero-row frame with the postcode / pp_address / listed-flag columns."""
+    return pl.DataFrame(
+        schema={
+            "postcode": pl.Utf8,
+            "pp_address": pl.Utf8,
+            LISTED_BUILDING_FEATURE: pl.Utf8,
+        }
+    )
+
+
+def _is_matchable_listed_name(name_key: str | None) -> bool:
+    """Decide whether a normalised listed-building name is specific enough to match.
+
+    A name qualifies if it contains a digit, or has at least two tokens that
+    are three or more characters long and not in `_LISTED_NAME_STOP_WORDS`.
+    Empty or missing names never qualify.
+    """
+    if not name_key:
+        return False
+    if _NUMBER_RE.search(name_key) is not None:
+        return True
+    substantive_count = sum(
+        1
+        for token in name_key.split()
+        if len(token) >= 3 and token not in _LISTED_NAME_STOP_WORDS
+    )
+    return substantive_count >= 2
+
+
+def _load_listed_building_points(listed_buildings_path: Path) -> pl.DataFrame:
+    """Load Historic England NHLE listed-building point attributes.
+
+    Reads only the attribute columns (no geometry) and returns ListEntry, Name,
+    Grade, Easting, Northing plus a `_listed_match_name` column derived from
+    Name via `normalize_address_key`. Rows with a null Name, Easting, Northing
+    or match name are dropped.
+
+    Raises:
+        ValueError: If the dataset's geometry type is not a point type, or if
+            any of the expected attribute columns is missing.
+    """
+    columns = ["ListEntry", "Name", "Grade", "Easting", "Northing"]
+    # Check the layer's geometry type up front, before reading any rows.
+    info = pyogrio.read_info(listed_buildings_path)
+    geometry_type = str(info.get("geometry_type") or "")
+    if "Point" not in geometry_type:
+        raise ValueError(
+            f"Expected listed-building point data, got geometry {geometry_type!r}"
+        )
+    # Coordinates come from the Easting/Northing attributes, so geometry is skipped.
+    _, table = pyogrio.read_arrow(
+        listed_buildings_path,
+        columns=columns,
+        read_geometry=False,
+    )
+    df = pl.from_arrow(table)
+    missing = sorted(set(columns) - set(df.columns))
+    if missing:
+        raise ValueError(
+            f"{listed_buildings_path} is missing listed-building columns: {missing}"
+        )
+    return (
+        df.select(
+            pl.col("ListEntry").cast(pl.Int64),
+            pl.col("Name").cast(pl.Utf8),
+            pl.col("Grade").cast(pl.Utf8),
+            pl.col("Easting").cast(pl.Float64),
+            pl.col("Northing").cast(pl.Float64),
+        )
+        .drop_nulls(["Name", "Easting", "Northing"])
+        .with_columns(normalize_address_key(pl.col("Name")).alias("_listed_match_name"))
+        .filter(pl.col("_listed_match_name").is_not_null())
+    )
+
+
+def _postcode_listed_building_candidates(
+    listed_points: pl.DataFrame,
+    active_postcodes: pl.DataFrame,
+    *,
+    nearest_postcodes: int = LISTED_BUILDING_NEAREST_POSTCODES,
+    max_distance_m: float = LISTED_BUILDING_MATCH_RADIUS_M,
+) -> pl.DataFrame:
+    """Assign each listed-building point to nearby active postcode candidates.
+
+    Args:
+        listed_points: Frame with `_listed_match_name`, `Grade`, `ListEntry`,
+            `Easting` and `Northing` columns.
+        active_postcodes: Frame with `postcode`, `east1m` and `north1m` columns.
+        nearest_postcodes: Maximum number of postcodes to consider per point.
+        max_distance_m: Upper bound passed to the KD-tree query as
+            `distance_upper_bound`.
+
+    Returns:
+        Unique (postcode, name, entry) candidate rows with the
+        `_listed_candidate_schema()` columns; empty if nothing qualifies.
+
+    Raises:
+        ValueError: If either input is missing a required column.
+    """
+    if listed_points.is_empty() or active_postcodes.is_empty():
+        return _empty_listed_candidates()
+
+    required_postcode_cols = {"postcode", "east1m", "north1m"}
+    missing = sorted(required_postcode_cols - set(active_postcodes.columns))
+    if missing:
+        raise ValueError(f"Active postcode data missing required columns: {missing}")
+
+    required_listed_cols = {
+        "_listed_match_name",
+        "Grade",
+        "ListEntry",
+        "Easting",
+        "Northing",
+    }
+    missing = sorted(required_listed_cols - set(listed_points.columns))
+    if missing:
+        raise ValueError(f"Listed-building data missing required columns: {missing}")
+
+    # Keep only rows with usable, finite coordinates on both sides.
+    postcodes = active_postcodes.drop_nulls(["postcode", "east1m", "north1m"])
+    postcodes = postcodes.filter(
+        pl.col("east1m").is_finite() & pl.col("north1m").is_finite()
+    )
+    listed = listed_points.drop_nulls(["_listed_match_name", "Easting", "Northing"])
+    listed = listed.filter(
+        pl.col("Easting").is_finite() & pl.col("Northing").is_finite()
+    )
+    if postcodes.is_empty() or listed.is_empty():
+        return _empty_listed_candidates()
+
+    postcode_coords = np.column_stack(
+        [postcodes["east1m"].to_numpy(), postcodes["north1m"].to_numpy()]
+    )
+    listed_coords = np.column_stack(
+        [listed["Easting"].to_numpy(), listed["Northing"].to_numpy()]
+    )
+    # Never ask for more neighbours than there are postcodes, and at least one.
+    k = max(1, min(nearest_postcodes, postcodes.height))
+    distances, indices = cKDTree(postcode_coords).query(
+        listed_coords,
+        k=k,
+        distance_upper_bound=max_distance_m,
+    )
+    # With k == 1 the results are reshaped to 2-D so the per-point loop below
+    # can iterate neighbours the same way for every k.
+    if k == 1:
+        distances = distances[:, np.newaxis]
+        indices = indices[:, np.newaxis]
+
+    postcode_values = postcodes["postcode"].to_list()
+    listed_names = listed["_listed_match_name"].to_list()
+    listed_grades = listed["Grade"].to_list()
+    listed_entries = listed["ListEntry"].to_list()
+
+    rows: list[tuple[str, str, str | None, int | None]] = []
+    for listed_idx in range(listed.height):
+        name_key = listed_names[listed_idx]
+        # Skip names too generic to match reliably.
+        if not _is_matchable_listed_name(name_key):
+            continue
+        seen_postcodes: set[str] = set()
+        for distance, postcode_idx in zip(distances[listed_idx], indices[listed_idx]):
+            # Slots with no neighbour inside the radius show up as a non-finite
+            # distance or an out-of-range index; ignore them.
+            if not np.isfinite(distance) or postcode_idx >= postcodes.height:
+                continue
+            postcode = postcode_values[int(postcode_idx)]
+            if postcode in seen_postcodes:
+                continue
+            seen_postcodes.add(postcode)
+            rows.append(
+                (
+                    postcode,
+                    name_key,
+                    listed_grades[listed_idx],
+                    listed_entries[listed_idx],
+                )
+            )
+
+    if not rows:
+        return _empty_listed_candidates()
+
+    return (
+        pl.DataFrame(
+            rows,
+            schema=[
+                "postcode",
+                "_listed_match_name",
+                "_listed_grade",
+                "_listed_entry",
+            ],
+            orient="row",
+        )
+        .cast(_listed_candidate_schema())
+        .unique(["postcode", "_listed_match_name", "_listed_entry"])
+    )
+
+
+def _matched_listed_building_flags(
+    properties: pl.LazyFrame,
+    listed_candidates: pl.DataFrame,
+    *,
+    min_score: int = LISTED_BUILDING_MIN_MATCH_SCORE,
+) -> pl.DataFrame:
+    """Flag properties whose address conservatively matches a listed-building name.
+
+    Only properties in a postcode that has at least one candidate are examined.
+    A property matches when either its normalised PP or EPC address is
+    number-compatible with a candidate name in the same postcode and scores at
+    least `min_score` on `fuzz.token_set_ratio`.
+
+    Returns:
+        One row per matched (postcode, pp_address) with the listed-building
+        flag set to "Yes"; an empty frame with the same columns otherwise.
+    """
+    if listed_candidates.is_empty():
+        return _empty_listed_property_flags()
+
+    postcodes_with_listing = listed_candidates.select("postcode").unique()
+    pp_key = normalize_address_key(pl.col("pp_address")).alias("_pp_match_address")
+    epc_key = normalize_address_key(pl.col("epc_address")).alias(
+        "_epc_match_address"
+    )
+    has_any_key = (
+        pl.col("_pp_match_address").is_not_null()
+        | pl.col("_epc_match_address").is_not_null()
+    )
+    property_rows = (
+        properties.select("postcode", "pp_address", "epc_address")
+        .join(postcodes_with_listing.lazy(), on="postcode", how="semi")
+        .with_columns(pp_key, epc_key)
+        .filter(pl.col("pp_address").is_not_null() & has_any_key)
+        .collect(engine="streaming")
+    )
+    if property_rows.is_empty():
+        return _empty_listed_property_flags()
+
+    # Candidate names grouped by postcode so each property only sees local names.
+    names_by_postcode: dict[str, list[str]] = {}
+    for candidate_postcode, candidate_name in listed_candidates.select(
+        "postcode", "_listed_match_name"
+    ).iter_rows():
+        if candidate_postcode and candidate_name:
+            names_by_postcode.setdefault(candidate_postcode, []).append(
+                candidate_name
+            )
+
+    def _is_listed(address_key: str, names: list[str]) -> bool:
+        # The cheap number check runs first so fuzzy scoring is skipped when
+        # house/list-entry numbers already disagree.
+        return any(
+            _numbers_compatible(address_key, name)
+            and fuzz.token_set_ratio(address_key, name) >= min_score
+            for name in names
+        )
+
+    matched_rows: list[tuple[str, str, str]] = []
+    for record in property_rows.iter_rows(named=True):
+        local_names = names_by_postcode.get(record["postcode"])
+        if not local_names:
+            continue
+        # dict.fromkeys de-duplicates while keeping the PP-before-EPC order.
+        address_keys = dict.fromkeys(
+            key
+            for key in (
+                record.get("_pp_match_address"),
+                record.get("_epc_match_address"),
+            )
+            if key
+        )
+        if any(_is_listed(key, local_names) for key in address_keys):
+            matched_rows.append((record["postcode"], record["pp_address"], "Yes"))
+
+    if not matched_rows:
+        return _empty_listed_property_flags()
+
+    flag_columns = ["postcode", "pp_address", LISTED_BUILDING_FEATURE]
+    flags = pl.DataFrame(matched_rows, schema=flag_columns, orient="row")
+    flags = flags.cast(
+        {
+            "postcode": pl.Utf8,
+            "pp_address": pl.Utf8,
+            LISTED_BUILDING_FEATURE: pl.Utf8,
+        }
+    )
+    return flags.unique(["postcode", "pp_address"])
+
+
+def _listed_building_flags(
+    properties: pl.LazyFrame,
+    active_postcodes: pl.DataFrame,
+    listed_buildings_path: Path,
+) -> pl.DataFrame:
+    """Load listed-building points and return per-property listed flags.
+
+    Runs the three stages in order (load points, assign them to nearby
+    postcodes, match names to property addresses), printing progress after each.
+    """
+    print(f"Loading listed-building points from {listed_buildings_path}...")
+    points_df = _load_listed_building_points(listed_buildings_path)
+    print(f"Loaded {points_df.height} listed-building point records")
+
+    candidates = _postcode_listed_building_candidates(points_df, active_postcodes)
+    nearby_postcode_count = candidates["postcode"].n_unique()
+    print(
+        "Matching listed-building names to property addresses across "
+        f"{nearby_postcode_count} nearby postcodes..."
+    )
+
+    matched = _matched_listed_building_flags(properties, candidates)
+    print(f"Matched {matched.height} property addresses to listed-building entries")
+    return matched
+
+
+def _normalise_crs(crs: object | None) -> str:
+    """Return `crs` as a string, falling back to "EPSG:4326" when it is falsy."""
+    if not crs:
+        return "EPSG:4326"
+    return str(crs)
+
+
+def _load_conservation_area_geometries(
+    conservation_areas_path: Path,
+) -> tuple[list[BaseGeometry], str]:
+    """Read the conservation-area geometries and their CRS.
+
+    Only the geometry column is read; null and empty geometries are discarded.
+
+    Returns:
+        The usable geometries and the dataset CRS as a string.
+
+    Raises:
+        ValueError: If no usable geometry remains.
+    """
+    metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=[])
+    # Fall back to the last column when the metadata does not name the geometry.
+    geometry_column = metadata.get("geometry_name") or table.column_names[-1]
+    wkb_values = table[geometry_column].combine_chunks().to_pylist()
+    usable = [
+        shape
+        for shape in from_wkb(wkb_values)
+        if shape is not None and not shape.is_empty
+    ]
+    if usable:
+        return usable, _normalise_crs(metadata.get("crs"))
+    raise ValueError(
+        f"{conservation_areas_path} does not contain any usable polygon geometries"
+    )
+
+
+def _postcode_conservation_area_flags(
+    postcodes: pl.DataFrame,
+    conservation_geometries: list[BaseGeometry],
+    conservation_crs: object | None,
+    batch_size: int = 100_000,
+) -> pl.DataFrame:
+    """Flag each postcode "Yes"/"No" for lying inside any conservation geometry.
+
+    Args:
+        postcodes: Frame with `postcode`, `lat` and `lon` columns.
+        conservation_geometries: Geometries to test the postcode points against.
+        conservation_crs: CRS of those geometries; falsy values fall back to
+            "EPSG:4326" via `_normalise_crs`.
+        batch_size: Number of points queried against the spatial index at a time.
+
+    Returns:
+        One row per distinct non-null postcode with the conservation-area flag.
+        Postcodes without usable coordinates are reported as "No".
+
+    Raises:
+        ValueError: If `postcodes` lacks a required column.
+    """
+    required = {"postcode", "lat", "lon"}
+    missing = sorted(required - set(postcodes.columns))
+    if missing:
+        raise ValueError(f"Postcode data missing required columns: {missing}")
+
+    # Every postcode gets an answer, even those without usable coordinates.
+    all_postcodes = postcodes.select("postcode").drop_nulls().unique()
+    valid_points = postcodes.select("postcode", "lat", "lon").drop_nulls()
+    if valid_points.is_empty():
+        return all_postcodes.with_columns(pl.lit("No").alias(CONSERVATION_AREA_FEATURE))
+
+    # drop_nulls does not remove NaN/inf, so filter those separately.
+    lat = valid_points["lat"].to_numpy()
+    lon = valid_points["lon"].to_numpy()
+    finite = np.isfinite(lat) & np.isfinite(lon)
+    valid_points = valid_points.filter(pl.Series(finite))
+    if valid_points.is_empty():
+        return all_postcodes.with_columns(pl.lit("No").alias(CONSERVATION_AREA_FEATURE))
+
+    # Re-extract the coordinates so they line up with the filtered rows.
+    lat = valid_points["lat"].to_numpy()
+    lon = valid_points["lon"].to_numpy()
+    # Project the lon/lat points from EPSG:4326 into the geometries' CRS.
+    transformer = Transformer.from_crs(
+        "EPSG:4326", _normalise_crs(conservation_crs), always_xy=True
+    )
+    x, y = transformer.transform(lon, lat)
+
+    tree = STRtree(conservation_geometries)
+    inside = np.zeros(valid_points.height, dtype=bool)
+    for start in range(0, valid_points.height, batch_size):
+        end = min(start + batch_size, valid_points.height)
+        point_batch = points(x[start:end], y[start:end])
+        matches = tree.query(point_batch, predicate="intersects")
+        if matches.size > 0:
+            # matches[0] is used as the batch-relative index of each matched
+            # point, so offset by `start` to address the full array.
+            inside[start + matches[0]] = True
+
+    # A postcode appearing on several rows is "Yes" if any of its points matched.
+    matched = (
+        valid_points.select("postcode")
+        .with_columns(pl.Series("_within_conservation_area", inside))
+        .group_by("postcode")
+        .agg(pl.col("_within_conservation_area").max())
+        .with_columns(
+            pl.when(pl.col("_within_conservation_area"))
+            .then(pl.lit("Yes"))
+            .otherwise(pl.lit("No"))
+            .alias(CONSERVATION_AREA_FEATURE)
+        )
+        .select("postcode", CONSERVATION_AREA_FEATURE)
+    )
+    return (
+        all_postcodes.join(matched, on="postcode", how="left")
+        .with_columns(pl.col(CONSERVATION_AREA_FEATURE).fill_null("No"))
+        .select("postcode", CONSERVATION_AREA_FEATURE)
+    )
+
+
+def _conservation_area_by_postcode(
+    postcodes: pl.LazyFrame,
+    conservation_areas_path: Path,
+) -> pl.LazyFrame:
+    """Compute the conservation-area flag for every postcode in `postcodes`.
+
+    Loads the geometries, collects the postcode coordinates, and returns the
+    per-postcode "Yes"/"No" flags as a lazy frame.
+    """
+    print(f"Loading conservation area polygons from {conservation_areas_path}...")
+    polygons, polygon_crs = _load_conservation_area_geometries(
+        conservation_areas_path
+    )
+    points_df = postcodes.select("postcode", "lat", "lon").collect(
+        engine="streaming"
+    )
+    print(
+        "Computing conservation area membership for "
+        f"{points_df.height} active English postcodes..."
+    )
+    flags = _postcode_conservation_area_flags(points_df, polygons, polygon_crs)
+    return flags.lazy()
+
+
 def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
    non_null_count = pl.col(column).count()
@ -234,11 +649,13 @@ def _build(
    noise_path: Path,
    school_proximity_path: Path,
    broadband_path: Path,
+    conservation_areas_path: Path,
    rental_prices_path: Path,
    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
    tree_density_postcodes_path: Path | None = None,
+    listed_buildings_path: Path | None = None,
 ) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Build postcode and properties dataframes from epc_pp + auxiliary data.

@ -273,6 +690,29 @@ def _build(
    ).unique(["postcode"])
    wide = wide.join(postcode_country, on="postcode", how="left")

+    if listed_buildings_path is not None:
+        active_postcodes_for_listed = (
+            arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
+            .filter(pl.col("doterm").is_null())
+            .select(
+                pl.col("pcds").alias("postcode"),
+                "east1m",
+                "north1m",
+            )
+            .collect(engine="streaming")
+        )
+        listed_flags = _listed_building_flags(
+            wide.select("postcode", "pp_address", "epc_address"),
+            active_postcodes_for_listed,
+            listed_buildings_path,
+        )
+        wide = wide.join(listed_flags.lazy(), on=["postcode", "pp_address"], how="left")
+    else:
+        wide = wide.with_columns(
+            pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE)
+        )
+    wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))
+
    arcgis = (
        arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")  # England only
        .filter(pl.col("doterm").is_null())  # Active postcodes only
@ -382,6 +822,13 @@ def _build(
    school_proximity = pl.scan_parquet(school_proximity_path)
    wide = wide.join(school_proximity, on="postcode", how="left")

+    conservation_areas = _conservation_area_by_postcode(
+        arcgis.select("postcode", "lat", "lon"), conservation_areas_path
+    )
+    wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
+        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
+    )
+
    if tree_density_postcodes_path is not None:
        tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
        wide = wide.join(tree_density, on="postcode", how="left")
@ -476,7 +923,6 @@ def _build(
            "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
            "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
            "Barriers to Housing and Services Score",
-            "lsoa21",
            "oa21",
            "pcon",
            "epc_property_type",
@ -598,6 +1044,18 @@ def main():
        required=True,
        help="Broadband performance by output area parquet file",
    )
+    parser.add_argument(
+        "--conservation-areas",
+        type=Path,
+        required=True,
+        help="Historic England conservation areas GeoPackage",
+    )
+    parser.add_argument(
+        "--listed-buildings",
+        type=Path,
+        required=False,
+        help="Historic England NHLE listed-building points GeoPackage",
+    )
    parser.add_argument(
        "--rental-prices",
        type=Path,
@ -652,11 +1110,13 @@ def main():
        noise_path=args.noise,
        school_proximity_path=args.school_proximity,
        broadband_path=args.broadband,
+        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
        tree_density_postcodes_path=args.tree_density_postcodes,
+        listed_buildings_path=args.listed_buildings,
    )

    print(f"\nPostcode columns: {postcode_df.columns}")
--- a/pipeline/transform/noise_overlay_tiles.py
+++ b/pipeline/transform/noise_overlay_tiles.py
@ -0,0 +1,398 @@
+"""Build PMTiles raster tiles for the high-resolution Defra noise overlay.
+
+This keeps the native 10m strategic-noise rasters as the source of truth and
+renders transparent PNG XYZ tiles into MBTiles before converting to PMTiles.
+The dashboard serves the resulting archive through /api/overlays/noise.
+"""
+
+from __future__ import annotations
+
+import argparse
+import io
+import math
+import sqlite3
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+import rasterio
+from PIL import Image
+from rasterio.enums import Resampling
+from rasterio.transform import from_bounds
+from rasterio.warp import reproject, transform_bounds
+from shapely import STRtree, box
+
+from pipeline.download.noise import (
+    BNG_MAX_E,
+    BNG_MAX_N,
+    BNG_MIN_E,
+    BNG_MIN_N,
+    NOISE_SOURCES,
+    download_raster,
+)
+from pipeline.download.tiles import ensure_pmtiles_cli
+from pipeline.local_temp import local_tmp_dir
+
+WEB_MERCATOR_CRS = "EPSG:3857"
+WEB_MERCATOR_EXTENT = 20_037_508.342789244
+DEFAULT_SOURCE_NAMES = ("road", "rail", "airport")
+NOISE_COLOR_STOPS = np.array([45.0, 55.0, 65.0, 75.0], dtype=np.float32)
+NOISE_COLORS = np.array(
+    [
+        [254, 240, 138],
+        [251, 146, 60],
+        [220, 38, 38],
+        [127, 29, 29],
+    ],
+    dtype=np.float32,
+)
+
+
+@dataclass(frozen=True)
+class RasterInfo:
+    """Immutable record pairing a raster file with its Web Mercator extent."""
+
+    # Filesystem location of the raster.
+    path: Path
+    # Bounding box of the raster in EPSG:3857, ordered
+    # (left, bottom, right, top).
+    bounds_mercator: tuple[float, float, float, float]
+
+
+def _source_specs(source_names: tuple[str, ...]):
+    """Resolve requested noise source names to their NOISE_SOURCES entries.
+
+    Names are matched case-insensitively against the labels in NOISE_SOURCES.
+    The special name "all" expands to every name in DEFAULT_SOURCE_NAMES.
+
+    Returns:
+        A list of tuples ``(title-cased name, *spec)`` in DEFAULT_SOURCE_NAMES
+        order, where ``spec`` is the rest of the matching NOISE_SOURCES entry
+        after its label.
+
+    Raises:
+        ValueError: If a requested name matches no NOISE_SOURCES label.
+    """
+    requested = {name.lower() for name in source_names}
+    # "all" replaces the whole request, including any other names passed.
+    if "all" in requested:
+        requested = set(DEFAULT_SOURCE_NAMES)
+
+    by_name = {label.lower(): spec for label, *spec in NOISE_SOURCES}
+    unknown = sorted(requested - set(by_name))
+    if unknown:
+        raise ValueError(f"Unknown noise source(s): {', '.join(unknown)}")
+
+    # NOTE(review): only names listed in DEFAULT_SOURCE_NAMES can be returned;
+    # a valid NOISE_SOURCES label missing from that tuple would be dropped
+    # silently here.
+    return [
+        (name.title(), *by_name[name])
+        for name in DEFAULT_SOURCE_NAMES
+        if name in requested
+    ]
+
+
+def _download_source_rasters(
+    raster_dir: Path,
+    source_names: tuple[str, ...],
+) -> list[Path]:
+    """Fetch rasters for the requested noise sources under ``raster_dir``.
+
+    Each source gets its own subdirectory named after its lower-cased label.
+    Whatever ``download_raster`` returns for each source is appended to a
+    single combined list (presumably the GeoTIFF paths it wrote or reused;
+    confirm against pipeline.download.noise).
+    """
+    paths: list[Path] = []
+    raster_dir.mkdir(parents=True, exist_ok=True)
+
+    # Specs are unpacked positionally; _col_name is not used in this function.
+    for (
+        label,
+        _col_name,
+        wcs_base,
+        coverage_id,
+        wcs_version,
+        allow_missing_tiles,
+    ) in _source_specs(source_names):
+        tile_dir = raster_dir / label.lower()
+        tile_dir.mkdir(parents=True, exist_ok=True)
+        paths.extend(
+            download_raster(
+                tile_dir,
+                wcs_base,
+                coverage_id,
+                label,
+                wcs_version,
+                allow_missing_tiles,
+            )
+        )
+
+    return paths
+
+
+def _raster_infos(raster_paths: list[Path]) -> list[RasterInfo]:
+    """Open each raster and record its bounds reprojected to Web Mercator.
+
+    Raises:
+        ValueError: If a raster has no CRS, since its bounds could not be
+            reprojected.
+    """
+    infos: list[RasterInfo] = []
+    for path in raster_paths:
+        with rasterio.open(path) as dataset:
+            if dataset.crs is None:
+                raise ValueError(f"Raster has no CRS: {path}")
+            # densify_pts adds intermediate points along each edge before
+            # transforming, so the reprojected box encloses curved edges.
+            bounds = transform_bounds(
+                dataset.crs,
+                WEB_MERCATOR_CRS,
+                *dataset.bounds,
+                densify_pts=21,
+            )
+            infos.append(RasterInfo(path=path, bounds_mercator=bounds))
+    return infos
+
+
+def _england_bounds_wgs84() -> tuple[float, float, float, float]:
+    """Return the BNG_MIN/BNG_MAX box reprojected from EPSG:27700 to EPSG:4326.
+
+    The result is ordered (west, south, east, north) in degrees.
+    """
+    return transform_bounds(
+        "EPSG:27700",
+        "EPSG:4326",
+        BNG_MIN_E,
+        BNG_MIN_N,
+        BNG_MAX_E,
+        BNG_MAX_N,
+        densify_pts=21,
+    )
+
+
+def _lonlat_to_tile(lon: float, lat: float, zoom: int) -> tuple[int, int]:
+    """Return the (x, y) tile containing a lon/lat point at ``zoom``.
+
+    The grid is ``2**zoom`` tiles wide and high, with y growing as latitude
+    decreases. Latitude is clamped to +/-85.05112878 degrees and the result
+    is clamped to the valid index range ``[0, 2**zoom - 1]``.
+    """
+    lat = max(min(lat, 85.05112878), -85.05112878)
+    n = 1 << zoom
+    x = int(math.floor((lon + 180.0) / 360.0 * n))
+    # asinh(tan(lat)) is the Mercator projection of the latitude.
+    y = int(
+        math.floor((1.0 - math.asinh(math.tan(math.radians(lat))) / math.pi) / 2.0 * n)
+    )
+    return min(max(x, 0), n - 1), min(max(y, 0), n - 1)
+
+
+def _tile_bounds_mercator(
+    zoom: int, x: int, y: int
+) -> tuple[float, float, float, float]:
+    """Return (left, bottom, right, top) of tile (x, y) in Web Mercator units.
+
+    Tile row 0 starts at +WEB_MERCATOR_EXTENT and rows advance downwards;
+    column 0 starts at -WEB_MERCATOR_EXTENT.
+    """
+    n = 1 << zoom
+    # The full extent spans 2 * WEB_MERCATOR_EXTENT, divided among n tiles.
+    tile_size_m = WEB_MERCATOR_EXTENT * 2 / n
+    left = -WEB_MERCATOR_EXTENT + x * tile_size_m
+    right = left + tile_size_m
+    top = WEB_MERCATOR_EXTENT - y * tile_size_m
+    bottom = top - tile_size_m
+    return left, bottom, right, top
+
+
+def _read_noise_tile(
+    candidates: list[RasterInfo],
+    bounds_mercator: tuple[float, float, float, float],
+    tile_size: int,
+) -> np.ndarray:
+    """Reproject band 1 of each candidate raster onto one tile and merge them.
+
+    Args:
+        candidates: Rasters to sample for this tile.
+        bounds_mercator: Tile extent as (left, bottom, right, top) in
+            EPSG:3857.
+        tile_size: Width and height of the output tile in pixels.
+
+    Returns:
+        A ``(tile_size, tile_size)`` float32 array holding the per-pixel
+        maximum across candidates, with NaN where no candidate has a finite,
+        positive value.
+    """
+    left, bottom, right, top = bounds_mercator
+    merged = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
+
+    for info in candidates:
+        with rasterio.open(info.path) as source:
+            tile = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
+            reproject(
+                source=rasterio.band(source, 1),
+                destination=tile,
+                src_transform=source.transform,
+                src_crs=source.crs,
+                # Fall back to 0 as nodata when the raster declares none.
+                src_nodata=source.nodata if source.nodata is not None else 0,
+                dst_transform=from_bounds(
+                    left, bottom, right, top, tile_size, tile_size
+                ),
+                dst_crs=WEB_MERCATOR_CRS,
+                dst_nodata=np.nan,
+                resampling=Resampling.bilinear,
+            )
+
+        # Treat non-finite and non-positive samples as missing.
+        tile[~np.isfinite(tile) | (tile <= 0)] = np.nan
+        # fmax ignores NaN, so a value in either array wins over a gap.
+        merged = np.fmax(merged, tile)
+
+    return merged
+
+
+def _encode_noise_png(noise_db: np.ndarray) -> bytes | None:
+    """Render a 2-D array of noise values as an RGBA PNG.
+
+    Pixels that are finite and at or above the first NOISE_COLOR_STOPS value
+    are coloured by linear interpolation between NOISE_COLORS and given an
+    alpha between 70 and 190. All other pixels stay fully transparent.
+
+    Returns:
+        PNG-encoded bytes, or None when no pixel qualifies for colouring.
+    """
+    valid = np.isfinite(noise_db) & (noise_db >= NOISE_COLOR_STOPS[0])
+    if not valid.any():
+        return None
+
+    clipped = np.clip(noise_db, NOISE_COLOR_STOPS[0], NOISE_COLOR_STOPS[-1])
+    # Zero-initialised, so pixels outside `valid` keep alpha 0.
+    rgba = np.zeros((*noise_db.shape, 4), dtype=np.uint8)
+    valid_values = clipped[valid]
+
+    # Interpolate the red, green and blue channels independently.
+    for channel in range(3):
+        channel_values = np.interp(
+            valid_values,
+            NOISE_COLOR_STOPS,
+            NOISE_COLORS[:, channel],
+        ).astype(np.uint8)
+        rgba[..., channel][valid] = channel_values
+
+    # Opacity rises linearly from the first colour stop to the last.
+    alpha = np.interp(
+        valid_values,
+        [NOISE_COLOR_STOPS[0], NOISE_COLOR_STOPS[-1]],
+        [70, 190],
+    ).astype(np.uint8)
+    rgba[..., 3][valid] = alpha
+
+    output = io.BytesIO()
+    Image.fromarray(rgba, mode="RGBA").save(output, format="PNG", optimize=True)
+    return output.getvalue()
+
+
+def _tile_ranges(
+    bounds_wgs84: tuple[float, float, float, float],
+    zoom: int,
+) -> tuple[range, range]:
+    """Return inclusive x and y tile index ranges covering a lon/lat box.
+
+    ``bounds_wgs84`` is ordered (west, south, east, north).
+    """
+    west, south, east, north = bounds_wgs84
+    # The north-west corner gives the smallest indices, south-east the largest.
+    min_x, min_y = _lonlat_to_tile(west, north, zoom)
+    max_x, max_y = _lonlat_to_tile(east, south, zoom)
+    return range(min_x, max_x + 1), range(min_y, max_y + 1)
+
+
+def _create_mbtiles(
+    raster_infos: list[RasterInfo],
+    mbtiles_path: Path,
+    min_zoom: int,
+    max_zoom: int,
+    tile_size: int,
+) -> int:
+    """Render PNG noise tiles into a fresh MBTiles SQLite file.
+
+    Any existing file at ``mbtiles_path`` is deleted first. For every zoom in
+    ``[min_zoom, max_zoom]``, each tile inside the England bounding box is
+    rendered from the rasters whose extents intersect it. Tiles with no
+    intersecting raster, or with nothing to draw, are skipped.
+
+    Returns:
+        The number of tiles written across all zoom levels.
+    """
+    if mbtiles_path.exists():
+        mbtiles_path.unlink()
+
+    bounds_wgs84 = _england_bounds_wgs84()
+    # Spatial index over raster extents, queried once per tile below.
+    geometries = [box(*info.bounds_mercator) for info in raster_infos]
+    tree = STRtree(geometries)
+
+    conn = sqlite3.connect(mbtiles_path)
+    conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
+    conn.execute(
+        "CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
+        "tile_row INTEGER, tile_data BLOB)"
+    )
+    conn.execute(
+        "CREATE UNIQUE INDEX tile_index ON tiles (zoom_level, tile_column, tile_row)"
+    )
+    conn.executemany(
+        "INSERT INTO metadata (name, value) VALUES (?, ?)",
+        [
+            ("name", "Defra Lden noise overlay"),
+            ("type", "overlay"),
+            ("version", "1"),
+            ("description", "Defra Round 4 10m strategic noise Lden overlay"),
+            ("format", "png"),
+            (
+                "attribution",
+                "Contains public sector information licensed under the OGL v3.0",
+            ),
+            ("bounds", ",".join(f"{value:.6f}" for value in bounds_wgs84)),
+            ("minzoom", str(min_zoom)),
+            ("maxzoom", str(max_zoom)),
+        ],
+    )
+
+    total_tiles = 0
+    try:
+        for zoom in range(min_zoom, max_zoom + 1):
+            x_range, y_range = _tile_ranges(bounds_wgs84, zoom)
+            zoom_tiles = 0
+            for x in x_range:
+                for y in y_range:
+                    bounds_mercator = _tile_bounds_mercator(zoom, x, y)
+                    candidate_indexes = tree.query(box(*bounds_mercator))
+                    if len(candidate_indexes) == 0:
+                        continue
+
+                    candidates = [
+                        raster_infos[int(index)] for index in candidate_indexes
+                    ]
+                    tile = _read_noise_tile(candidates, bounds_mercator, tile_size)
+                    tile_png = _encode_noise_png(tile)
+                    if tile_png is None:
+                        continue
+
+                    # Flip the row index: y counts from the top of the grid,
+                    # while the stored tile_row counts from the bottom (TMS).
+                    tms_y = (1 << zoom) - 1 - y
+                    conn.execute(
+                        "INSERT INTO tiles VALUES (?, ?, ?, ?)",
+                        (zoom, x, tms_y, tile_png),
+                    )
+                    zoom_tiles += 1
+                    total_tiles += 1
+
+            # Commit once per zoom level rather than once per tile.
+            conn.commit()
+            print(f"Zoom {zoom}: wrote {zoom_tiles:,} PNG tiles")
+    finally:
+        conn.close()
+
+    return total_tiles
+
+
+def build_noise_overlay_tiles(
+    output_path: Path,
+    raster_dir: Path,
+    source_names: tuple[str, ...],
+    input_rasters: tuple[Path, ...],
+    pmtiles_bin: Path,
+    pmtiles_version: str,
+    min_zoom: int,
+    max_zoom: int,
+    tile_size: int,
+) -> None:
+    """Build the noise overlay PMTiles archive at ``output_path``.
+
+    Tiles are rendered into a temporary MBTiles file, which is then converted
+    with the ``pmtiles convert`` command.
+
+    Args:
+        output_path: Destination PMTiles file; parent directories are created.
+        raster_dir: Directory used when rasters have to be downloaded.
+        source_names: Noise sources to download when ``input_rasters`` is
+            empty.
+        input_rasters: Existing rasters to render; when non-empty, nothing is
+            downloaded.
+        pmtiles_bin: Path of the pmtiles executable.
+        pmtiles_version: Version passed to ``ensure_pmtiles_cli``.
+        min_zoom: Lowest zoom level to render (inclusive).
+        max_zoom: Highest zoom level to render (inclusive).
+        tile_size: Tile width and height in pixels.
+
+    Raises:
+        ValueError: If ``min_zoom`` is greater than ``max_zoom``.
+        FileNotFoundError: If no rasters are available to render.
+        RuntimeError: If rendering produced no tiles.
+        subprocess.CalledProcessError: If the pmtiles conversion fails.
+    """
+    if min_zoom > max_zoom:
+        raise ValueError("--min-zoom must be <= --max-zoom")
+
+    # Explicit rasters take precedence; download only when none were given.
+    raster_paths = list(input_rasters) or _download_source_rasters(
+        raster_dir, source_names
+    )
+    if not raster_paths:
+        raise FileNotFoundError("No noise raster GeoTIFFs available")
+
+    print(f"Preparing {len(raster_paths):,} noise raster tile(s)")
+    raster_infos = _raster_infos(raster_paths)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)
+
+    # The intermediate MBTiles file is discarded with the temporary directory.
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
+        mbtiles_path = Path(tmp) / "noise_lden_10m.mbtiles"
+        tile_count = _create_mbtiles(
+            raster_infos, mbtiles_path, min_zoom, max_zoom, tile_size
+        )
+        if tile_count == 0:
+            raise RuntimeError("Noise overlay generation produced no tiles")
+
+        subprocess.run(
+            [
+                str(pmtiles_bin),
+                "convert",
+                str(mbtiles_path),
+                str(output_path),
+                "--force",
+            ],
+            check=True,
+        )
+
+    size_mb = output_path.stat().st_size / (1024 * 1024)
+    print(f"Wrote {output_path} ({size_mb:.1f} MB)")
+
+
+def main() -> None:
+    """Parse command-line arguments and build the noise overlay archive."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--output", type=Path, required=True)
+    parser.add_argument(
+        "--raster-dir",
+        type=Path,
+        default=Path("property-data/noise_overlay_rasters"),
+        help="Cache directory for downloaded Defra WCS GeoTIFF tiles",
+    )
+    parser.add_argument(
+        "--source",
+        action="append",
+        dest="sources",
+        choices=("all", *DEFAULT_SOURCE_NAMES),
+        help="Noise source to include; repeatable. Defaults to all.",
+    )
+    parser.add_argument(
+        "--input-raster",
+        action="append",
+        dest="input_rasters",
+        type=Path,
+        help="Existing GeoTIFF to render instead of downloading WCS rasters",
+    )
+    parser.add_argument(
+        "--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")
+    )
+    parser.add_argument("--pmtiles-version", default="1.22.3")
+    parser.add_argument("--min-zoom", type=int, default=13)
+    parser.add_argument("--max-zoom", type=int, default=14)
+    parser.add_argument("--tile-size", type=int, default=256)
+    args = parser.parse_args()
+
+    # The "append" options are None when never passed, hence the `or`
+    # fallbacks below.
+    build_noise_overlay_tiles(
+        output_path=args.output,
+        raster_dir=args.raster_dir,
+        source_names=tuple(args.sources or ("all",)),
+        input_rasters=tuple(args.input_rasters or ()),
+        pmtiles_bin=args.pmtiles_bin,
+        pmtiles_version=args.pmtiles_version,
+        min_zoom=args.min_zoom,
+        max_zoom=args.max_zoom,
+        tile_size=args.tile_size,
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/school_proximity.py
+++ b/pipeline/transform/school_proximity.py
@ -39,6 +39,8 @@ def main():
        pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
        & pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
    )
+    if ofsted.is_empty():
+        raise ValueError("No good+ primary/secondary Ofsted schools found")

    print(f"Good+ schools: {len(ofsted):,}")
    print(
@ -74,6 +76,8 @@ def main():
    )

    schools = ofsted.join(arcgis, on="postcode", how="inner")
+    if schools.is_empty():
+        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
    print(f"Schools with coordinates: {len(schools):,}")

    # Load all postcodes for proximity counting
@ -88,6 +92,7 @@ def main():

    result = counts_5km.join(counts_2km, on="postcode")

+    args.output.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
--- a/pipeline/transform/test_crime.py
+++ b/pipeline/transform/test_crime.py
@ -95,6 +95,69 @@ def test_transform_crime_annualises_over_all_valid_months(tmp_path):
    ]


+def test_transform_crime_writes_by_year_output(tmp_path):
+    """The by-year output holds annualised per-type and aggregate series.
+
+    One LSOA has crimes in one month of 2023 and two months of 2024; the
+    expected result is a single row whose list columns carry one
+    ``{"year", "count"}`` entry per year.
+    """
+    crime_dir = tmp_path / "crime"
+    jan23 = crime_dir / "2023-01"
+    jan24 = crime_dir / "2024-01"
+    feb24 = crime_dir / "2024-02"
+    for d in (jan23, jan24, feb24):
+        d.mkdir(parents=True)
+
+    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
+    (jan23 / "2023-01-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                header,
+                "1,2023-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
+                "2,2023-01,F,F,-0.1,51.5,X,E01000001,L,Robbery,U,",
+            ]
+        )
+        + "\n"
+    )
+    (jan24 / "2024-01-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                header,
+                "3,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
+                "4,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
+            ]
+        )
+        + "\n"
+    )
+    (feb24 / "2024-02-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                header,
+                "5,2024-02,F,F,-0.1,51.5,X,E01000001,L,Anti-social behaviour,U,",
+            ]
+        )
+        + "\n"
+    )
+
+    output = tmp_path / "crime.parquet"
+    by_year_output = tmp_path / "crime_by_year.parquet"
+    transform_crime(crime_dir, output, by_year_output)
+
+    by_year = pl.read_parquet(by_year_output)
+    # All fixture rows share one LSOA code, so exactly one row is expected.
+    assert by_year.height == 1
+    cols = set(by_year.columns)
+    assert "Burglary (by year)" in cols
+    assert "Serious crime (by year)" in cols
+    assert "Minor crime (by year)" in cols
+
+    row = by_year.row(0, named=True)
+    burglary = sorted(row["Burglary (by year)"], key=lambda r: r["year"])
+    # 2023: 1 burglary in 1 month → 12/yr; 2024: 2 in 2 months → 12/yr
+    assert burglary == [
+        {"year": 2023, "count": 12.0},
+        {"year": 2024, "count": 12.0},
+    ]
+    # Serious crime in 2023 = Burglary(12) + Robbery(12) = 24
+    serious = {p["year"]: p["count"] for p in row["Serious crime (by year)"]}
+    assert serious[2023] == 24.0
+    assert serious[2024] == 12.0
+
+
 def test_transform_crime_fails_without_valid_months(tmp_path):
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2024-01"
@ -117,3 +180,49 @@ def test_transform_crime_fails_without_valid_months(tmp_path):
        assert "No valid crime months" in str(exc)
    else:
        raise AssertionError("Expected ValueError")
+
+
+def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
+    """Crimes on 2011 LSOA codes are re-keyed to 2021 codes via the lookup.
+
+    Counts for a 2011 LSOA that maps to several 2021 LSOAs are expected to be
+    shared equally between them; an unchanged LSOA keeps its full count.
+    """
+    crime_dir = tmp_path / "crime"
+    month_dir = crime_dir / "2024-01"
+    month_dir.mkdir(parents=True)
+
+    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
+    # E01000001 was split into two 2021 LSOAs; E01000099 is unchanged.
+    (month_dir / "2024-01-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                header,
+                "1,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
+                "2,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
+                "3,2024-01,F,F,-0.1,51.5,X,E01000099,L,Burglary,U,",
+            ]
+        )
+        + "\n"
+    )
+
+    lookup_path = tmp_path / "lookup.parquet"
+    pl.DataFrame(
+        {
+            "lsoa11": ["E01000001", "E01000001", "E01000099"],
+            "lsoa21": ["E01000050", "E01000051", "E01000099"],
+        }
+    ).write_parquet(lookup_path)
+
+    output = tmp_path / "crime.parquet"
+    by_year_output = tmp_path / "by_year.parquet"
+    transform_crime(crime_dir, output, by_year_output, lookup_path)
+
+    # Split LSOA: the 2 burglaries in the single month are split evenly → 1 per child LSOA, annualised to 12/yr each.
+    avg = pl.read_parquet(output).sort("LSOA code").to_dicts()
+    assert avg == [
+        {"LSOA code": "E01000050", "Burglary (avg/yr)": 12.0},
+        {"LSOA code": "E01000051", "Burglary (avg/yr)": 12.0},
+        {"LSOA code": "E01000099", "Burglary (avg/yr)": 12.0},
+    ]
+
+    by_year = pl.read_parquet(by_year_output).sort("LSOA code").to_dicts()
+    burglaries = {row["LSOA code"]: row["Burglary (by year)"] for row in by_year}
+    assert burglaries["E01000050"] == [{"year": 2024, "count": 12.0}]
+    assert burglaries["E01000051"] == [{"year": 2024, "count": 12.0}]
+    assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -1,11 +1,17 @@
 import polars as pl
 import pytest
+from shapely import box

 from pipeline.transform.merge import (
    _AREA_COLUMNS,
+    CONSERVATION_AREA_FEATURE,
+    LISTED_BUILDING_FEATURE,
    TREE_DENSITY_FEATURE,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
+    _matched_listed_building_flags,
+    _postcode_conservation_area_flags,
+    _postcode_listed_building_candidates,
    _tree_density_by_postcode,
    _validate_lad_source_coverage,
    _validate_property_postcodes,
@ -48,6 +54,106 @@ def test_country_code_is_kept_in_postcode_area_columns() -> None:
    assert "ctry25cd" in _AREA_COLUMNS


+def test_conservation_area_feature_is_area_level() -> None:
+    """The conservation-area feature must be listed in _AREA_COLUMNS."""
+    assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
+
+
+def test_listed_building_feature_is_property_level() -> None:
+    """The listed-building feature must not be listed in _AREA_COLUMNS."""
+    assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
+
+
+def test_postcode_conservation_area_flags_marks_point_membership() -> None:
+    """Postcodes are flagged "Yes" only when their point lies in a polygon.
+
+    Covers a point inside the unit box, a point outside it, and a postcode
+    with a missing latitude, which is expected to be flagged "No".
+    """
+    postcodes = pl.DataFrame(
+        {
+            "postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
+            "lat": [0.5, 2.0, None],
+            "lon": [0.5, 2.0, 0.5],
+        }
+    )
+
+    # batch_size=2 is smaller than the three input rows.
+    result = _postcode_conservation_area_flags(
+        postcodes, [box(0, 0, 1, 1)], "EPSG:4326", batch_size=2
+    ).sort("postcode")
+
+    assert result.to_dicts() == [
+        {"postcode": "AA1 1AA", CONSERVATION_AREA_FEATURE: "Yes"},
+        {"postcode": "BB1 1BB", CONSERVATION_AREA_FEATURE: "No"},
+        {"postcode": "CC1 1CC", CONSERVATION_AREA_FEATURE: "No"},
+    ]
+
+
+def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
+    """Listed points are paired only with postcodes within the distance cap.
+
+    The first listed point lies a few metres from "AA1 1AA" and is expected
+    to match it; the second is thousands of metres from every postcode and is
+    expected to produce no candidate.
+    """
+    listed_points = pl.DataFrame(
+        {
+            "ListEntry": [1234, 5678],
+            "Name": ["1 and 2 High Street", "Distant Hall"],
+            "Grade": ["II", "I"],
+            "Easting": [100.0, 1000.0],
+            "Northing": [100.0, 1000.0],
+        }
+    ).with_columns(
+        # Build the normalised name column: upper-case, punctuation replaced
+        # by spaces, whitespace collapsed and trimmed.
+        pl.col("Name")
+        .str.to_uppercase()
+        .str.replace_all(r"[^0-9A-Z]+", " ")
+        .str.replace_all(r"\s+", " ")
+        .str.strip_chars()
+        .alias("_listed_match_name")
+    )
+    active_postcodes = pl.DataFrame(
+        {
+            "postcode": ["AA1 1AA", "BB1 1BB"],
+            "east1m": [105.0, 5000.0],
+            "north1m": [105.0, 5000.0],
+        }
+    )
+
+    result = _postcode_listed_building_candidates(
+        listed_points,
+        active_postcodes,
+        nearest_postcodes=1,
+        max_distance_m=25,
+    )
+
+    assert result.select("postcode", "_listed_match_name").to_dicts() == [
+        {"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
+    ]
+
+
+def test_matched_listed_building_flags_requires_address_match() -> None:
+    """Only properties whose address matches a listed name are flagged.
+
+    "99 HIGH STREET" shares a postcode with a listed candidate but is expected
+    to be absent from the result because its address does not match.
+    """
+    properties = pl.DataFrame(
+        {
+            "postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
+            "pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
+            "epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
+        }
+    )
+    listed_candidates = pl.DataFrame(
+        {
+            "postcode": ["AA1 1AA", "BB1 1BB"],
+            "_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
+            "_listed_grade": ["II", "II*"],
+            "_listed_entry": [1234, 5678],
+        }
+    )
+
+    result = _matched_listed_building_flags(
+        properties.lazy(), listed_candidates, min_score=95
+    ).sort("postcode", "pp_address")
+
+    # Unmatched properties are omitted rather than flagged "No".
+    assert result.to_dicts() == [
+        {
+            "postcode": "AA1 1AA",
+            "pp_address": "1 HIGH STREET",
+            LISTED_BUILDING_FEATURE: "Yes",
+        },
+        {
+            "postcode": "BB1 1BB",
+            "pp_address": "THE OLD RECTORY",
+            LISTED_BUILDING_FEATURE: "Yes",
+        },
+    ]
+
+
 def test_validate_property_postcodes_rejects_blank_rows() -> None:
    df = pl.DataFrame(
        {
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -182,6 +182,19 @@ DROP_CATEGORIES = {
    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
+    # Education amenities — schools come from GIAS instead. OSM coverage for
+    # tertiary education, tutoring, and childcare is too noisy/incomplete to be
+    # useful on a property-search map.
+    "amenity/school",
+    "amenity/prep_school",
+    "amenity/language_school",
+    "amenity/music_school",
+    "amenity/university",
+    "amenity/college",
+    "building/university",
+    "amenity/kindergarten",
+    "amenity/childcare",
+    "office/tutoring",
 }


@ -943,23 +956,10 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "tourism/chalet",
        ],
    ),
-    (
-        "Education",
-        "School",
-        "🏫",
-        [
-            "amenity/school",
-            "amenity/prep_school",
-            "amenity/language_school",
-            "amenity/music_school",
-            "amenity/university",
-            "amenity/college",
-            "building/university",
-            "amenity/kindergarten",
-            "amenity/childcare",
-            "office/tutoring",
-        ],
-    ),
+    # Note: schools come from the GIAS register (see transform_gias_schools).
+    # Niche/tertiary education amenities that GIAS does not cover are dropped
+    # rather than mixed in with state-funded schools.
+
    (
        "Local Businesses",
        "Hotel",
@ -1316,11 +1316,45 @@ def transform_grocery_retail_points(
    ).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")


+def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
+    """Convert the GIAS register parquet into POI rows with school metadata.
+
+    Every row becomes a POI with category "School" in group "Education" and an
+    id of the form ``gias-<urn>``. The remaining register columns are carried
+    through under a ``school_`` prefix.
+
+    Returns:
+        A lazy frame; nothing is read until it is collected.
+    """
+    return pl.scan_parquet(gias_path).select(
+        # The "gias-" prefix namespaces the URN within the POI id column.
+        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
+        pl.col("name"),
+        pl.lit("School").alias("category"),
+        pl.lit("School").alias("icon_category"),
+        pl.lit("Education").alias("group"),
+        pl.col("lat").cast(pl.Float64),
+        pl.col("lng").cast(pl.Float64),
+        pl.lit("🏫").alias("emoji"),
+        pl.col("phase").alias("school_phase"),
+        pl.col("type").alias("school_type"),
+        pl.col("type_group").alias("school_type_group"),
+        pl.col("age_range").alias("school_age_range"),
+        pl.col("gender").alias("school_gender"),
+        pl.col("religious_character").alias("school_religious_character"),
+        pl.col("admissions_policy").alias("school_admissions_policy"),
+        pl.col("nursery_provision").alias("school_nursery_provision"),
+        pl.col("sixth_form").alias("school_sixth_form"),
+        # strict=False: values that cannot be cast become null instead of
+        # raising.
+        pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
+        pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
+        pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
+        pl.col("trust").alias("school_trust"),
+        pl.col("address").alias("school_address"),
+        pl.col("postcode").alias("school_postcode"),
+        pl.col("local_authority").alias("school_local_authority"),
+        pl.col("website").alias("school_website"),
+        pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
+        pl.col("head_name").alias("school_head_name"),
+    )
+
+
 def transform(
    input_path: Path,
-    naptan_path: Path | None = None,
-    boundary_path: Path | None = None,
-    grocery_retail_points_path: Path | None = None,
+    naptan_path: Path,
+    boundary_path: Path,
+    grocery_retail_points_path: Path,
+    gias_path: Path,
 ) -> pl.LazyFrame:
    lf = pl.scan_parquet(input_path)

@ -1372,24 +1406,21 @@ def transform(
    )

    naptan_df = pl.scan_parquet(naptan_path).collect()
-    if boundary_path is not None:
-        mask = in_england_mask(
-            boundary_path,
-            naptan_df["lat"].to_numpy(),
-            naptan_df["lng"].to_numpy(),
-        )
-        naptan_df = naptan_df.filter(pl.Series(mask))
+    mask = in_england_mask(
+        boundary_path,
+        naptan_df["lat"].to_numpy(),
+        naptan_df["lng"].to_numpy(),
+    )
+    naptan_df = naptan_df.filter(pl.Series(mask))
    naptan = naptan_df.lazy().with_columns(
        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
        pl.lit("Public Transport").alias("group"),
        pl.col("category").alias("icon_category"),
    )

-    frames = [lf, naptan]
-    if grocery_retail_points_path is not None:
-        grocery_df = pl.read_parquet(grocery_retail_points_path)
-        grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
-        frames.append(grocery_pois.lazy())
+    grocery_df = pl.read_parquet(grocery_retail_points_path)
+    grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
+    frames = [lf, naptan, grocery_pois.lazy(), transform_gias_schools(gias_path)]

    return pl.concat(frames, how="diagonal_relaxed")

@ -1413,8 +1444,15 @@ def main():
    parser.add_argument(
        "--grocery-retail-points",
        type=Path,
+        required=True,
        help="GEOLYTIX Grocery Retail Points parquet",
    )
+    parser.add_argument(
+        "--gias",
+        type=Path,
+        required=True,
+        help="GIAS schools register parquet (replaces OSM schools)",
+    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output filtered POIs parquet file"
    )
@ -1425,6 +1463,7 @@ def main():
        args.naptan,
        args.boundary,
        args.grocery_retail_points,
+        args.gias,
    ).collect(engine="streaming")

    df.write_parquet(args.output)
--- a/pipeline/transform/tree_overlay_tiles.py
+++ b/pipeline/transform/tree_overlay_tiles.py
@ -0,0 +1,269 @@
+"""Build PMTiles polygon tiles for the Trees Outside Woodland overlay."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import pyogrio
+import shapely
+from pyproj import Transformer
+
+from pipeline.local_temp import local_tmp_dir
+from pipeline.transform.tree_density import (
+    DEFAULT_TOW_TYPES,
+    _layers,
+    _tow_dataset_path,
+    _where_for_tow_types,
+)
+
+
+def _require_tippecanoe() -> str:
+    """Return the path of the ``tippecanoe`` executable found on ``PATH``.
+
+    Raises:
+        RuntimeError: If ``shutil.which`` cannot locate ``tippecanoe``.
+    """
+    tippecanoe_path = shutil.which("tippecanoe")
+    if tippecanoe_path is not None:
+        return tippecanoe_path
+    raise RuntimeError(
+        "tippecanoe is required to build tree overlay PMTiles. "
+        "Install tippecanoe and rerun this target."
+    )
+
+
+def _column_or_none(batch, names: list[str], column: str):
+    """Return ``column`` of ``batch`` as a NumPy array, or ``None`` if absent.
+
+    ``names`` is the list of column names of ``batch``; the position of
+    ``column`` within it is used to fetch the column from the batch.
+    """
+    try:
+        position = names.index(column)
+    except ValueError:
+        # The requested column is not part of this batch.
+        return None
+    return batch.column(position).to_numpy(zero_copy_only=False)
+
+
+def _number_or_none(value) -> float | int | None:
+    """Convert ``value`` to a JSON-friendly number.
+
+    Returns ``None`` for ``None``, for non-finite values (NaN, infinities) and
+    for values ``np.isfinite`` rejects with ``TypeError``. Whole numbers are
+    returned as ``int``; everything else is rounded to two decimal places.
+    """
+    if value is None:
+        return None
+    try:
+        if not np.isfinite(value):
+            return None
+        as_float = float(value)
+        if as_float.is_integer():
+            return int(value)
+        return round(as_float, 2)
+    except TypeError:
+        # Non-numeric input (for example a string) cannot be checked or converted.
+        return None
+
+
+def _write_tree_geojsonseq(
+    dataset_path: str,
+    output_path: Path,
+    tow_types: tuple[str, ...],
+    batch_size: int,
+    layer_names: tuple[str, ...] | None,
+    max_features_per_layer: int | None,
+) -> int:
+    """Write the selected TOW features to ``output_path`` as one GeoJSON feature per line.
+
+    Every layer returned by ``_layers`` is read in Arrow batches through
+    ``pyogrio.open_arrow``. A feature is skipped when its ``TOW_Area_M`` is not
+    a finite positive number or when its geometry is null. Geometries are
+    transformed from EPSG:27700 to EPSG:4326 before being serialised.
+
+    Args:
+        dataset_path: Dataset path handed to pyogrio.
+        output_path: File that receives the newline-delimited features.
+        tow_types: Values passed to ``_where_for_tow_types`` to build the
+            attribute filter.
+        batch_size: Arrow batch size requested from pyogrio.
+        layer_names: Optional layer selection forwarded to ``_layers``.
+        max_features_per_layer: Optional cap on the number of rows read from
+            each layer (applied before the validity filter).
+
+    Returns:
+        The number of features written to ``output_path``.
+    """
+    to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
+    where = _where_for_tow_types(tow_types)
+    layers = _layers(dataset_path, layer_names)
+    print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
+    if where:
+        print(f"TOW type filter: {where}")
+
+    columns = [
+        "TOW_ID",
+        "Woodland_Type",
+        "TOW_Area_M",
+        "MEANHT",
+        "MINHT",
+        "MAXHT",
+        "LiDAR_Survey_Year",
+    ]
+    feature_count = 0
+
+    with output_path.open("w", encoding="utf-8") as file:
+        for layer in layers:
+            info = pyogrio.read_info(dataset_path, layer=layer)
+            print(f"\nLayer {layer}: {info.get('features', 0):,} features")
+            layer_features_seen = 0
+
+            with pyogrio.open_arrow(
+                dataset_path,
+                layer=layer,
+                columns=columns,
+                where=where,
+                batch_size=batch_size,
+                use_pyarrow=True,
+            ) as (_meta, reader):
+                for batch in reader:
+                    if max_features_per_layer is not None:
+                        remaining = max_features_per_layer - layer_features_seen
+                        if remaining <= 0:
+                            break
+                        if batch.num_rows > remaining:
+                            batch = batch.slice(0, remaining)
+
+                    layer_features_seen += batch.num_rows
+                    names = batch.schema.names
+                    area = np.asarray(
+                        batch.column(names.index("TOW_Area_M")).to_numpy(
+                            zero_copy_only=False
+                        ),
+                        dtype=np.float64,
+                    )
+                    # The geometry column is looked up by the name "SHAPE";
+                    # a batch without it raises ValueError here.
+                    geometry = np.asarray(
+                        batch.column(names.index("SHAPE")).to_numpy(
+                            zero_copy_only=False
+                        ),
+                        dtype=object,
+                    )
+                    # A null geometry would otherwise reach json.loads(None)
+                    # below and abort the whole export with a TypeError.
+                    has_geometry = np.fromiter(
+                        (wkb is not None for wkb in geometry),
+                        dtype=bool,
+                        count=len(geometry),
+                    )
+                    valid = np.isfinite(area) & (area > 0) & has_geometry
+                    if not valid.any():
+                        continue
+
+                    tow_id = _column_or_none(batch, names, "TOW_ID")
+                    woodland_type = _column_or_none(batch, names, "Woodland_Type")
+                    mean_height = _column_or_none(batch, names, "MEANHT")
+                    min_height = _column_or_none(batch, names, "MINHT")
+                    max_height = _column_or_none(batch, names, "MAXHT")
+                    lidar_year = _column_or_none(batch, names, "LiDAR_Survey_Year")
+
+                    geometries = shapely.from_wkb(geometry[valid])
+                    geometries = shapely.transform(
+                        geometries,
+                        to_wgs84.transform,
+                        interleaved=False,
+                    )
+                    geometries_json = shapely.to_geojson(geometries)
+                    # Positions of the kept rows inside the unfiltered batch, so
+                    # attribute arrays can be indexed without filtering each one.
+                    valid_indexes = np.flatnonzero(valid)
+
+                    for idx, geometry_json in zip(valid_indexes, geometries_json):
+                        properties = {
+                            "tow_id": (
+                                _text_or_empty(tow_id[idx])
+                                if tow_id is not None
+                                else ""
+                            ),
+                            "woodland_type": (
+                                _text_or_empty(woodland_type[idx])
+                                if woodland_type is not None
+                                else ""
+                            ),
+                            "area_sqm": _number_or_none(area[idx]),
+                            "mean_height_m": (
+                                _number_or_none(mean_height[idx])
+                                if mean_height is not None
+                                else None
+                            ),
+                            "min_height_m": (
+                                _number_or_none(min_height[idx])
+                                if min_height is not None
+                                else None
+                            ),
+                            "max_height_m": (
+                                _number_or_none(max_height[idx])
+                                if max_height is not None
+                                else None
+                            ),
+                            "lidar_year": (
+                                _number_or_none(lidar_year[idx])
+                                if lidar_year is not None
+                                else None
+                            ),
+                            "source_layer": layer,
+                        }
+                        feature = {
+                            "type": "Feature",
+                            "geometry": json.loads(geometry_json),
+                            "properties": properties,
+                        }
+                        file.write(json.dumps(feature, separators=(",", ":")) + "\n")
+                        feature_count += 1
+
+    return feature_count
+
+
+def _text_or_empty(value) -> str:
+    """Return ``str(value)``, mapping a null (``None``) value to ``""``.
+
+    Keeps null text attributes from being written as the literal "None".
+    """
+    return "" if value is None else str(value)
+
+
+def build_tree_overlay_tiles(
+    tow_zip: Path,
+    output_path: Path,
+    extract_dir: Path,
+    tow_types: tuple[str, ...],
+    batch_size: int,
+    layer_names: tuple[str, ...] | None,
+    max_features_per_layer: int | None,
+    min_zoom: int,
+    max_zoom: int,
+    force_extract: bool,
+    use_vsizip: bool,
+) -> None:
+    """Export TOW polygons to a temporary GeoJSON sequence and tile it with tippecanoe.
+
+    The tippecanoe executable is resolved first, so a missing install fails
+    before any data is read. The dataset path comes from ``_tow_dataset_path``,
+    the parent directory of ``output_path`` is created if needed, and the
+    intermediate file lives in a temporary directory under ``local_tmp_dir()``.
+
+    Raises:
+        RuntimeError: If tippecanoe is not installed.
+        subprocess.CalledProcessError: If tippecanoe exits with a non-zero status.
+    """
+    tippecanoe_bin = _require_tippecanoe()
+    source_dataset = _tow_dataset_path(
+        tow_zip, extract_dir, force_extract, use_vsizip
+    )
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch_dir:
+        geojsonseq_path = Path(scratch_dir) / "trees_outside_woodlands.geojsonseq"
+        written = _write_tree_geojsonseq(
+            dataset_path=source_dataset,
+            output_path=geojsonseq_path,
+            tow_types=tow_types,
+            batch_size=batch_size,
+            layer_names=layer_names,
+            max_features_per_layer=max_features_per_layer,
+        )
+        print(f"Writing {written:,} TOW polygon features")
+
+        # Argument list (no shell) so paths are passed to tippecanoe verbatim.
+        command = [
+            tippecanoe_bin,
+            "--force",
+            "--output",
+            str(output_path),
+            "--layer",
+            "trees_outside_woodlands",
+            "--minimum-zoom",
+            str(min_zoom),
+            "--maximum-zoom",
+            str(max_zoom),
+            "--drop-smallest-as-needed",
+            "--extend-zooms-if-still-dropping",
+            str(geojsonseq_path),
+        ]
+        subprocess.run(command, check=True)
+
+
+def main() -> None:
+    """Parse the command-line options and build the tree overlay tiles."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--tow-zip", type=Path, required=True)
+    cli.add_argument("--output", type=Path, required=True)
+    cli.add_argument(
+        "--extract-dir",
+        type=Path,
+        default=Path("property-data/fr_tow_v1_all"),
+        help="Directory used to extract the FileGDB",
+    )
+    cli.add_argument(
+        "--tow-type",
+        action="append",
+        dest="tow_types",
+        help="Woodland_Type to include; repeatable. Defaults to TOW outside-woodland classes.",
+    )
+    cli.add_argument("--batch-size", type=int, default=50_000)
+    cli.add_argument("--layer", action="append", dest="layers")
+    cli.add_argument("--max-features-per-layer", type=int)
+    cli.add_argument("--min-zoom", type=int, default=15)
+    cli.add_argument("--max-zoom", type=int, default=17)
+    cli.add_argument("--force-extract", action="store_true")
+    cli.add_argument("--use-vsizip", action="store_true")
+    options = cli.parse_args()
+
+    # Repeatable flags are None when never given: fall back to the default
+    # TOW types, and pass None for layers in that case.
+    selected_types = tuple(options.tow_types or DEFAULT_TOW_TYPES)
+    selected_layers = tuple(options.layers) if options.layers else None
+
+    build_tree_overlay_tiles(
+        tow_zip=options.tow_zip,
+        output_path=options.output,
+        extract_dir=options.extract_dir,
+        tow_types=selected_types,
+        batch_size=options.batch_size,
+        layer_names=selected_layers,
+        max_features_per_layer=options.max_features_per_layer,
+        min_zoom=options.min_zoom,
+        max_zoom=options.max_zoom,
+        force_extract=options.force_extract,
+        use_vsizip=options.use_vsizip,
+    )
+
+
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()