alright

2026-05-26 19:45:13 +01:00 · 2026-05-26 19:45:13 +01:00 · 39ef5c6646
commit 39ef5c6646
parent c645b0f1d4
79 changed files with 5660 additions and 2199 deletions
--- a/pipeline/download/map_assets.py
+++ b/pipeline/download/map_assets.py
@ -10,7 +10,11 @@ from pathlib import Path

 from PIL import Image, ImageDraw

-from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
+from pipeline.transform.transform_poi import (
+    NAPTAN_EMOJIS,
+    SCHOOL_ICON_CATEGORIES,
+    _CATEGORIES,
+)

 GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
 SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
@ -109,6 +113,9 @@ def collect_twemoji_codes() -> list[str]:
    for emoji in NAPTAN_EMOJIS.values():
        emojis.add(emoji)

+    for emoji in SCHOOL_ICON_CATEGORIES.values():
+        emojis.add(emoji)
+
    # First codepoint hex, matching frontend logic
    return sorted({f"{ord(e[0]):x}" for e in emojis})

--- a/pipeline/transform/crime_hotspot_tiles.py
+++ b/pipeline/transform/crime_hotspot_tiles.py
@ -124,6 +124,8 @@ def build_crime_hotspot_tiles(
                str(max_zoom),
                "--drop-densest-as-needed",
                "--extend-zooms-if-still-dropping",
+                "--temporary-directory",
+                tmp,
                str(ndjson_path),
            ],
            check=True,
--- a/pipeline/transform/enrich_actual_listings.py
+++ b/pipeline/transform/enrich_actual_listings.py
@ -0,0 +1,960 @@
+import argparse
+import math
+import re
+import tempfile
+from pathlib import Path
+
+import polars as pl
+from thefuzz import fuzz
+from tqdm import tqdm
+
+from pipeline.local_temp import local_tmp_dir
+from pipeline.transform.join_epc_pp import _scan_epc_certificates
+from pipeline.utils.fuzzy_join import normalize_address_key, normalize_postcode_key
+from pipeline.utils.postcode_mapping import build_postcode_mapping
+
+# Presumably the smallest plausible floor area in square metres.
+# NOTE(review): not referenced in the visible part of this module - confirm use.
+MIN_FLOOR_AREA_M2 = 10.0
+# Minimum total score for accepting a historical-property match. The lower
+# threshold applies when the listing address contains digits, the stricter
+# one when it does not.
+PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
+PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
+# Minimum lead the best property candidate needs over the runner-up.
+PROPERTY_MATCH_MIN_MARGIN = 4.0
+# Equivalent thresholds for direct listing-to-EPC certificate matching.
+EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
+EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
+EPC_MATCH_MIN_MARGIN = 4.0
+# Written to the "Actual listing enrichment version" output column.
+ENRICHMENT_VERSION = 1
+
+# Matches each run of digits in an address (used to compare address numbers).
+_NUMBER_RE = re.compile(r"\d+")
+
+# Columns the listings parquet must contain; `_read_listings` raises
+# ValueError when any of them is missing.
+LISTING_REQUIRED_COLUMNS = [
+    "Bedrooms",
+    "Bathrooms",
+    "Number of bedrooms & living rooms",
+    "lon",
+    "lat",
+    "Postcode",
+    "Address per Property Register",
+    "Leasehold/Freehold",
+    "Property type",
+    "Property sub-type",
+    "Price qualifier",
+    "Total floor area (sqm)",
+    "Listing URL",
+    "Listing features",
+    "Listing date",
+    "Listing status",
+    "Asking price",
+    "Asking price per sqm",
+]
+
+# Columns read from the historical properties parquet when present. Only
+# "Address per Property Register", "Postcode", "Property type" and
+# "Total floor area (sqm)" are mandatory (see `_load_property_candidates`).
+# Matched values are carried into the listings with a "_property_" prefix.
+PROPERTY_CANDIDATE_COLUMNS = [
+    "Address per Property Register",
+    "Postcode",
+    "Leasehold/Freehold",
+    "Last known price",
+    "Date of last transaction",
+    "Address per EPC",
+    "Current energy rating",
+    "Potential energy rating",
+    "Total floor area (sqm)",
+    "Number of bedrooms & living rooms",
+    "Interior height (m)",
+    "Construction year",
+    "Former council house",
+    "Is construction date approximate",
+    "Listed building",
+    "Estimated monthly rent",
+    "Street tree density percentile",
+    "Property type",
+    "Price per sqm",
+    "Estimated current price",
+    "Est. price per sqm",
+]
+
+# Subset of property columns that describe enrichment attributes.
+# NOTE(review): not referenced in the visible part of this module - confirm
+# whether it is still needed or used elsewhere.
+PROPERTY_ENRICHMENT_COLUMNS = [
+    "Address per EPC",
+    "Current energy rating",
+    "Potential energy rating",
+    "Interior height (m)",
+    "Construction year",
+    "Former council house",
+    "Is construction date approximate",
+    "Listed building",
+    "Estimated monthly rent",
+    "Street tree density percentile",
+    "Date of last transaction",
+]
+
+# Columns selected from EPC candidates in `_load_epc_candidates` and carried
+# into matched listings with an "_epc_" prefix (see `_epc_match_frame`).
+EPC_ENRICHMENT_COLUMNS = [
+    "Address per EPC",
+    "Current energy rating",
+    "Potential energy rating",
+    "Total floor area (sqm)",
+    "Number of bedrooms & living rooms",
+    "Interior height (m)",
+    "Construction year",
+    "Former council house",
+]
+
+# Energy ratings kept by `_load_epc_candidates`; any other value becomes null.
+EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
+# Listing tenure / property-type values accepted as-is by
+# `_coalesce_feature_columns`; other values fall back to the matched
+# historical property's value.
+TENURE_VALUES = ["Freehold", "Leasehold"]
+PROPERTY_TYPE_VALUES = [
+    "Detached",
+    "Semi-Detached",
+    "Terraced",
+    "Flats/Maisonettes",
+    "Other",
+]
+
+# Dtypes used by `_ensure_prefixed_columns` when it adds all-null placeholder
+# columns; columns not listed here default to pl.Utf8.
+COLUMN_DTYPES = {
+    "Address per EPC": pl.Utf8,
+    "Current energy rating": pl.Utf8,
+    "Potential energy rating": pl.Utf8,
+    "Total floor area (sqm)": pl.Float64,
+    "Number of bedrooms & living rooms": pl.Int32,
+    "Interior height (m)": pl.Float64,
+    "Construction year": pl.UInt16,
+    "Former council house": pl.Utf8,
+    "Is construction date approximate": pl.UInt8,
+    "Listed building": pl.Utf8,
+    "Estimated monthly rent": pl.Float32,
+    "Street tree density percentile": pl.Float32,
+    "Date of last transaction": pl.Datetime("us"),
+    "Property type": pl.Utf8,
+    "Leasehold/Freehold": pl.Utf8,
+}
+
+
+def _canonical_postcode_expr(column: str) -> pl.Expr:
+    compact = (
+        pl.col(column)
+        .cast(pl.Utf8)
+        .str.to_uppercase()
+        .str.replace_all(r"[^A-Z0-9]+", "")
+        .str.strip_chars()
+    )
+    return (
+        pl.when(compact.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$"))
+        .then(compact.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}"))
+        .otherwise(None)
+    )
+
+
+def _clean_string_expr(column: str) -> pl.Expr:
+    stripped = pl.col(column).cast(pl.Utf8).str.strip_chars()
+    return pl.when(stripped == "").then(None).otherwise(stripped)
+
+
+def _coalesce_non_empty(*columns: str) -> pl.Expr:
+    return pl.coalesce(
+        [
+            pl.when(pl.col(column).cast(pl.Utf8).str.strip_chars() == "")
+            .then(None)
+            .otherwise(pl.col(column).cast(pl.Utf8))
+            for column in columns
+        ]
+    )
+
+
+def _valid_number_expr(column: str) -> pl.Expr:
+    return pl.when(pl.col(column).is_finite()).then(pl.col(column)).otherwise(None)
+
+
+def _read_listings(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
+    schema = pl.scan_parquet(listings_path).collect_schema()
+    missing = sorted(set(LISTING_REQUIRED_COLUMNS) - set(schema.names()))
+    if missing:
+        raise ValueError(f"{listings_path} is missing listing columns: {missing}")
+
+    listings = (
+        pl.scan_parquet(listings_path)
+        .with_row_index("_listing_idx")
+        .with_columns(
+            _canonical_postcode_expr("Postcode").alias("_original_postcode"),
+            normalize_address_key(pl.col("Address per Property Register")).alias(
+                "_listing_match_address"
+            ),
+            normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
+        )
+        .collect(engine="streaming")
+    )
+
+    postcode_mapping = build_postcode_mapping(arcgis_path)
+    listings = (
+        listings.join(
+            postcode_mapping,
+            left_on="_original_postcode",
+            right_on="old_postcode",
+            how="left",
+        )
+        .with_columns(
+            pl.coalesce("new_postcode", "_original_postcode", "Postcode").alias(
+                "Postcode"
+            ),
+        )
+        .drop("new_postcode", strict=False)
+        .with_columns(
+            normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
+        )
+    )
+    return listings
+
+
+def _load_property_candidates(
+    properties_path: Path, listing_postcodes: list[str]
+) -> pl.DataFrame:
+    schema = pl.scan_parquet(properties_path).collect_schema()
+    columns = [
+        column for column in PROPERTY_CANDIDATE_COLUMNS if column in schema.names()
+    ]
+    missing = sorted(
+        set(
+            [
+                "Address per Property Register",
+                "Postcode",
+                "Property type",
+                "Total floor area (sqm)",
+            ]
+        )
+        - set(columns)
+    )
+    if missing:
+        raise ValueError(f"{properties_path} is missing property columns: {missing}")
+
+    return (
+        pl.scan_parquet(properties_path)
+        .select(columns)
+        .with_columns(
+            normalize_postcode_key(pl.col("Postcode")).alias("_match_postcode")
+        )
+        .filter(pl.col("_match_postcode").is_in(listing_postcodes))
+        .with_columns(
+            normalize_address_key(pl.col("Address per Property Register")).alias(
+                "_match_register_address"
+            ),
+            normalize_address_key(pl.col("Address per EPC")).alias("_match_epc_address")
+            if "Address per EPC" in columns
+            else pl.lit(None, dtype=pl.Utf8).alias("_match_epc_address"),
+        )
+        .filter(
+            pl.col("_match_register_address").is_not_null()
+            | pl.col("_match_epc_address").is_not_null()
+        )
+        .with_row_index("_property_row")
+        .collect(engine="streaming")
+    )
+
+
+def _property_candidates_by_postcode(
+    candidates: pl.DataFrame,
+) -> dict[str, list[dict]]:
+    buckets: dict[str, list[dict]] = {}
+    for row in candidates.iter_rows(named=True):
+        postcode = row.get("_match_postcode")
+        if postcode:
+            buckets.setdefault(postcode, []).append(row)
+    return buckets
+
+
+def _numbers_compatible(left: str | None, right: str | None) -> bool:
+    if not left or not right:
+        return False
+    left_nums = set(_NUMBER_RE.findall(left))
+    right_nums = set(_NUMBER_RE.findall(right))
+    smaller, larger = (
+        (left_nums, right_nums)
+        if len(left_nums) <= len(right_nums)
+        else (right_nums, left_nums)
+    )
+    if not smaller and larger:
+        return False
+    return smaller.issubset(larger)
+
+
+def _has_number(address: str | None) -> bool:
+    return bool(address and _NUMBER_RE.search(address))
+
+
+def _ratio_bonus(
+    left: float | int | None, right: float | int | None, pct: float, cap: float
+) -> float:
+    if left is None or right is None:
+        return 0.0
+    try:
+        left_f = float(left)
+        right_f = float(right)
+    except (TypeError, ValueError):
+        return 0.0
+    if left_f <= 0 or right_f <= 0:
+        return 0.0
+    rel = abs(left_f - right_f) / max(left_f, right_f)
+    if rel > pct:
+        return 0.0
+    return cap * (1.0 - rel / pct)
+
+
+def _rooms_bonus(left: int | None, right: int | None) -> float:
+    if left is None or right is None:
+        return 0.0
+    try:
+        diff = abs(int(left) - int(right))
+    except (TypeError, ValueError):
+        return 0.0
+    if diff == 0:
+        return 4.0
+    if diff == 1:
+        return 2.0
+    return 0.0
+
+
+def _enum_bonus(
+    left: str | None, right: str | None, *, exact: float, mismatch: float
+) -> float:
+    if not left or not right:
+        return 0.0
+    return exact if left == right else mismatch
+
+
+def _address_score(query: str, candidate: str | None) -> int:
+    if not candidate:
+        return 0
+    return max(
+        fuzz.token_set_ratio(query, candidate),
+        fuzz.token_sort_ratio(query, candidate),
+    )
+
+
+def _best_property_candidate(listing: dict, candidates: list[dict]) -> dict | None:
+    """Pick the best historical-property match for one listing.
+
+    Each candidate is scored by the better fuzzy address score (register or
+    EPC address) plus bonuses/penalties for property type, tenure, floor
+    area, room count and price.
+
+    Returns a match record for the top candidate, or None when the listing
+    has no normalised address, no candidate can be scored, the top score is
+    below the applicable threshold, or its lead over the runner-up is smaller
+    than PROPERTY_MATCH_MIN_MARGIN.
+    """
+    query = listing.get("_listing_match_address")
+    if not query:
+        return None
+
+    listing_has_numbers = _has_number(query)
+    # Tuples of (total score, address-only score, candidate row, matched field).
+    scored: list[tuple[float, int, dict, str]] = []
+    for candidate in candidates:
+        register_address = candidate.get("_match_register_address")
+        epc_address = candidate.get("_match_epc_address")
+        # When the listing has numbers, at least one candidate address must
+        # have compatible numbers.
+        if listing_has_numbers and not (
+            _numbers_compatible(query, register_address)
+            or _numbers_compatible(query, epc_address)
+        ):
+            continue
+
+        register_score = _address_score(query, register_address)
+        epc_score = _address_score(query, epc_address)
+        base_score = max(register_score, epc_score)
+        if base_score == 0:
+            continue
+
+        score = float(base_score)
+        score += _enum_bonus(
+            listing.get("Property type"),
+            candidate.get("Property type"),
+            exact=7.0,
+            mismatch=-8.0,
+        )
+        score += _enum_bonus(
+            listing.get("Leasehold/Freehold"),
+            candidate.get("Leasehold/Freehold"),
+            exact=3.0,
+            mismatch=-3.0,
+        )
+        score += _ratio_bonus(
+            listing.get("Total floor area (sqm)"),
+            candidate.get("Total floor area (sqm)"),
+            pct=0.15,
+            cap=8.0,
+        )
+        score += _rooms_bonus(
+            listing.get("Number of bedrooms & living rooms"),
+            candidate.get("Number of bedrooms & living rooms"),
+        )
+        # Compare the asking price with the estimated price, falling back to
+        # the last known price when the estimate is missing (or falsy).
+        score += _ratio_bonus(
+            listing.get("Asking price"),
+            candidate.get("Estimated current price")
+            or candidate.get("Last known price"),
+            pct=0.25,
+            cap=3.0,
+        )
+        # Record which address field produced the address score (ties go to
+        # the register address).
+        matched_address = (
+            "Address per Property Register"
+            if register_score >= epc_score
+            else "Address per EPC"
+        )
+        scored.append((score, base_score, candidate, matched_address))
+
+    if not scored:
+        return None
+    # Stable sort: candidates with equal scores keep their input order.
+    scored.sort(key=lambda item: item[0], reverse=True)
+    top = scored[0]
+    runner_up = scored[1][0] if len(scored) > 1 else None
+    # With a single candidate the margin is the top score itself.
+    margin = top[0] - runner_up if runner_up is not None else top[0]
+    threshold = (
+        PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
+        if listing_has_numbers
+        else PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
+    )
+    if top[0] < threshold or margin < PROPERTY_MATCH_MIN_MARGIN:
+        return None
+
+    return {
+        "_listing_idx": listing["_listing_idx"],
+        "_property_row": top[2]["_property_row"],
+        "Historical property match score": round(top[0], 1),
+        "Historical property address score": top[1],
+        "Historical property match margin": round(margin, 1),
+        "Historical property match field": top[3],
+        "Historical property match status": "matched",
+    }
+
+
+def _match_properties(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
+    schema = {
+        "_listing_idx": pl.UInt32,
+        "_property_row": pl.UInt32,
+        "Historical property match score": pl.Float32,
+        "Historical property address score": pl.Int32,
+        "Historical property match margin": pl.Float32,
+        "Historical property match field": pl.Utf8,
+        "Historical property match status": pl.Utf8,
+    }
+    if candidates.is_empty():
+        return pl.DataFrame(schema=schema)
+
+    buckets = _property_candidates_by_postcode(candidates)
+    matches = []
+    for listing in tqdm(
+        listings.iter_rows(named=True),
+        total=listings.height,
+        desc="Matching historical properties",
+    ):
+        postcode = listing.get("_listing_match_postcode")
+        if not postcode:
+            continue
+        match = _best_property_candidate(listing, buckets.get(postcode, []))
+        if match is not None:
+            matches.append(match)
+
+    if not matches:
+        return pl.DataFrame(schema=schema)
+    return pl.DataFrame(matches, schema=schema)
+
+
+def _prefix_columns(df: pl.DataFrame, columns: list[str], prefix: str) -> pl.DataFrame:
+    rename = {column: f"{prefix}{column}" for column in columns if column in df.columns}
+    return df.rename(rename)
+
+
+def _ensure_prefixed_columns(
+    df: pl.DataFrame, columns: list[str], prefix: str
+) -> pl.DataFrame:
+    missing_exprs = [
+        pl.lit(None, dtype=COLUMN_DTYPES.get(column, pl.Utf8)).alias(
+            f"{prefix}{column}"
+        )
+        for column in columns
+        if f"{prefix}{column}" not in df.columns
+    ]
+    if not missing_exprs:
+        return df
+    return df.with_columns(missing_exprs)
+
+
+def _property_match_frame(
+    matches: pl.DataFrame, candidates: pl.DataFrame
+) -> pl.DataFrame:
+    if matches.is_empty():
+        return matches
+    selected_columns = [
+        "_property_row",
+        *[
+            column
+            for column in PROPERTY_CANDIDATE_COLUMNS
+            if column in candidates.columns
+        ],
+    ]
+    matched = matches.join(
+        candidates.select(selected_columns), on="_property_row", how="left"
+    )
+    return _prefix_columns(
+        matched,
+        [column for column in PROPERTY_CANDIDATE_COLUMNS if column in matched.columns],
+        "_property_",
+    )
+
+
+def _canonical_epc_property_type_expr() -> pl.Expr:
+    bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
+        ["NO DATA!", "Not Recorded"]
+    )
+    has_epc = pl.col("epc_property_type").is_not_null()
+    is_house = pl.col("epc_property_type") == "House"
+    return (
+        pl.when(has_epc & is_house & ~bad_built_form)
+        .then(pl.col("built_form"))
+        .when(has_epc)
+        .then(pl.col("epc_property_type"))
+        .otherwise(None)
+        .replace(
+            {
+                "Flat": "Flats/Maisonettes",
+                "Maisonette": "Flats/Maisonettes",
+                "End-Terrace": "Terraced",
+                "Mid-Terrace": "Terraced",
+                "Enclosed End-Terrace": "Terraced",
+                "Enclosed Mid-Terrace": "Terraced",
+                "Bungalow": "Other",
+                "Park home": "Other",
+                "House": "Other",
+            }
+        )
+    )
+
+
+def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
+    return (
+        pl.col(column)
+        .cast(pl.Utf8)
+        .str.replace("England and Wales: ", "")
+        .str.replace(" onwards", "")
+        .str.extract(r"(\d{4})", 1)
+        .cast(pl.UInt16, strict=False)
+    )
+
+
+def _fractional_year_expr(column: str) -> pl.Expr:
+    return (
+        pl.col(column).dt.year().cast(pl.Float32)
+        + (pl.col(column).dt.month().cast(pl.Float32) - 1.0) / 12.0
+    )
+
+
+def _load_epc_candidates(
+    epc_path: Path, listing_postcodes: list[str], temp_dir: Path
+) -> pl.DataFrame:
+    """Load one EPC candidate row per (address, postcode) in listing postcodes.
+
+    Certificates come from `_scan_epc_certificates` (presumably a lazy frame;
+    its exact schema is defined elsewhere). For each normalised
+    address/postcode pair the first row after sorting by `inspection_date`
+    descending is kept, and its fields are renamed to the output column names
+    in EPC_ENRICHMENT_COLUMNS. "Former council house" is "Yes" when any
+    certificate for that pair has a tenure containing "social", else "No".
+    """
+    epc_base = _scan_epc_certificates(epc_path, temp_dir).with_columns(
+        normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
+        normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
+    )
+
+    # NOTE(review): with the default null ordering, rows with a null
+    # inspection_date may sort ahead of dated ones and be picked by first();
+    # confirm whether nulls can occur here.
+    epc = (
+        epc_base.filter(pl.col("_epc_match_postcode").is_in(listing_postcodes))
+        .sort("inspection_date", descending=True)
+        .group_by("_epc_match_address", "_epc_match_postcode")
+        .first()
+        .with_columns(
+            _canonical_epc_property_type_expr().alias("_epc_canonical_property_type"),
+            _construction_year_expr().alias("Construction year"),
+            # Ratings outside A-G are treated as missing.
+            pl.when(pl.col("current_energy_rating").is_in(EPC_RATING_VALUES))
+            .then(pl.col("current_energy_rating"))
+            .otherwise(None)
+            .alias("Current energy rating"),
+            pl.when(pl.col("potential_energy_rating").is_in(EPC_RATING_VALUES))
+            .then(pl.col("potential_energy_rating"))
+            .otherwise(None)
+            .alias("Potential energy rating"),
+            pl.col("total_floor_area").alias("Total floor area (sqm)"),
+            pl.col("number_habitable_rooms").alias("Number of bedrooms & living rooms"),
+            pl.col("floor_height").alias("Interior height (m)"),
+            pl.col("epc_address").alias("Address per EPC"),
+        )
+        .drop("tenure", strict=False)
+    )
+
+    # Address/postcode pairs with at least one "social" tenure certificate,
+    # looked up across all certificates rather than only the selected one.
+    social_tenure = (
+        epc_base.filter(pl.col("_epc_match_postcode").is_in(listing_postcodes))
+        .filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
+        .select("_epc_match_address", "_epc_match_postcode")
+        .unique()
+        .with_columns(pl.lit("Yes").alias("Former council house"))
+    )
+
+    return (
+        epc.join(
+            social_tenure,
+            on=["_epc_match_address", "_epc_match_postcode"],
+            how="left",
+        )
+        .with_columns(pl.col("Former council house").fill_null("No"))
+        .filter(pl.col("_epc_match_address").is_not_null())
+        .with_row_index("_epc_row")
+        .select(
+            "_epc_row",
+            "_epc_match_address",
+            "_epc_match_postcode",
+            "_epc_canonical_property_type",
+            *EPC_ENRICHMENT_COLUMNS,
+        )
+        .collect(engine="streaming")
+    )
+
+
+def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
+    buckets: dict[str, list[dict]] = {}
+    for row in candidates.iter_rows(named=True):
+        postcode = row.get("_epc_match_postcode")
+        if postcode:
+            buckets.setdefault(postcode, []).append(row)
+    return buckets
+
+
+def _best_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
+    """Pick the best EPC certificate match for one listing.
+
+    Each candidate is scored by its fuzzy address score plus bonuses or
+    penalties for property type, floor area and room count.
+
+    Returns a match record for the top candidate, or None when the listing
+    has no normalised address, no candidate can be scored, the top score is
+    below the applicable threshold, or its lead over the runner-up is smaller
+    than EPC_MATCH_MIN_MARGIN.
+    """
+    query = listing.get("_listing_match_address")
+    if not query:
+        return None
+
+    listing_has_numbers = _has_number(query)
+    # Tuples of (total score, address-only score, candidate row).
+    scored: list[tuple[float, int, dict]] = []
+    for candidate in candidates:
+        address = candidate.get("_epc_match_address")
+        # When the listing has numbers, the candidate's must be compatible.
+        if listing_has_numbers and not _numbers_compatible(query, address):
+            continue
+        base_score = _address_score(query, address)
+        if base_score == 0:
+            continue
+        score = float(base_score)
+        score += _enum_bonus(
+            listing.get("Property type"),
+            candidate.get("_epc_canonical_property_type"),
+            exact=6.0,
+            mismatch=-6.0,
+        )
+        score += _ratio_bonus(
+            listing.get("Total floor area (sqm)"),
+            candidate.get("Total floor area (sqm)"),
+            pct=0.12,
+            cap=8.0,
+        )
+        score += _rooms_bonus(
+            listing.get("Number of bedrooms & living rooms"),
+            candidate.get("Number of bedrooms & living rooms"),
+        )
+        scored.append((score, base_score, candidate))
+
+    if not scored:
+        return None
+    # Stable sort: candidates with equal scores keep their input order.
+    scored.sort(key=lambda item: item[0], reverse=True)
+    top = scored[0]
+    runner_up = scored[1][0] if len(scored) > 1 else None
+    # With a single candidate the margin is the top score itself.
+    margin = top[0] - runner_up if runner_up is not None else top[0]
+    threshold = (
+        EPC_MATCH_MIN_SCORE_WITH_NUMBERS
+        if listing_has_numbers
+        else EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
+    )
+    if top[0] < threshold or margin < EPC_MATCH_MIN_MARGIN:
+        return None
+    return {
+        "_listing_idx": listing["_listing_idx"],
+        "_epc_row": top[2]["_epc_row"],
+        "EPC match score": round(top[0], 1),
+        "EPC address score": top[1],
+        "EPC match margin": round(margin, 1),
+        "EPC match status": "matched",
+    }
+
+
+def _match_epc(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
+    schema = {
+        "_listing_idx": pl.UInt32,
+        "_epc_row": pl.UInt32,
+        "EPC match score": pl.Float32,
+        "EPC address score": pl.Int32,
+        "EPC match margin": pl.Float32,
+        "EPC match status": pl.Utf8,
+    }
+    if candidates.is_empty():
+        return pl.DataFrame(schema=schema)
+
+    buckets = _epc_candidates_by_postcode(candidates)
+    matches = []
+    for listing in tqdm(
+        listings.iter_rows(named=True),
+        total=listings.height,
+        desc="Matching EPC certificates",
+    ):
+        postcode = listing.get("_listing_match_postcode")
+        if not postcode:
+            continue
+        match = _best_epc_candidate(listing, buckets.get(postcode, []))
+        if match is not None:
+            matches.append(match)
+
+    if not matches:
+        return pl.DataFrame(schema=schema)
+    return pl.DataFrame(matches, schema=schema)
+
+
+def _epc_match_frame(matches: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
+    if matches.is_empty():
+        return matches
+    matched = matches.join(
+        candidates.select("_epc_row", *EPC_ENRICHMENT_COLUMNS),
+        on="_epc_row",
+        how="left",
+    )
+    return _prefix_columns(
+        matched,
+        [column for column in EPC_ENRICHMENT_COLUMNS if column in matched.columns],
+        "_epc_",
+    )
+
+
+def _join_postcode_features(
+    listings: pl.DataFrame, postcode_features_path: Path
+) -> pl.DataFrame:
+    postcode_features = pl.scan_parquet(postcode_features_path).collect(
+        engine="streaming"
+    )
+    return listings.join(
+        postcode_features, on="Postcode", how="left", suffix="_postcode"
+    )
+
+
+def _coalesce_feature_columns(df: pl.DataFrame) -> pl.DataFrame:
+    """Build the final feature columns from listing, EPC and property values.
+
+    Expects the "_epc_"/"_property_" prefixed columns referenced below to be
+    present in *df* (the caller adds null placeholders beforehand). Price
+    columns are derived from the asking price, and the per-sqm columns are
+    recomputed from the coalesced floor area in a second step.
+    """
+    with_columns: list[pl.Expr] = [
+        pl.lit(ENRICHMENT_VERSION, dtype=pl.UInt16).alias(
+            "Actual listing enrichment version"
+        ),
+        # Text attributes: prefer the direct EPC match over the property match.
+        _coalesce_non_empty(
+            "_epc_Address per EPC",
+            "_property_Address per EPC",
+        ).alias("Address per EPC"),
+        # Keep recognised listing values; otherwise use the matched property's.
+        pl.when(pl.col("Property type").is_in(PROPERTY_TYPE_VALUES))
+        .then(pl.col("Property type"))
+        .otherwise(pl.col("_property_Property type"))
+        .alias("Property type"),
+        pl.when(pl.col("Leasehold/Freehold").is_in(TENURE_VALUES))
+        .then(pl.col("Leasehold/Freehold"))
+        .otherwise(pl.col("_property_Leasehold/Freehold"))
+        .alias("Leasehold/Freehold"),
+        # Floor area: listing first, then EPC, then property (finite only).
+        pl.coalesce(
+            _valid_number_expr("Total floor area (sqm)"),
+            _valid_number_expr("_epc_Total floor area (sqm)"),
+            _valid_number_expr("_property_Total floor area (sqm)"),
+        ).alias("Total floor area (sqm)"),
+        # Room count: use the listing's value only when it is positive.
+        pl.when(pl.col("Number of bedrooms & living rooms") > 0)
+        .then(pl.col("Number of bedrooms & living rooms"))
+        .otherwise(
+            pl.coalesce(
+                pl.col("_epc_Number of bedrooms & living rooms"),
+                pl.col("_property_Number of bedrooms & living rooms"),
+            )
+        )
+        .cast(pl.Int32, strict=False)
+        .alias("Number of bedrooms & living rooms"),
+        # Both price columns are populated from the asking price.
+        pl.col("Asking price").alias("Estimated current price"),
+        pl.col("Asking price").alias("Last known price"),
+        _coalesce_non_empty(
+            "_epc_Current energy rating",
+            "_property_Current energy rating",
+        ).alias("Current energy rating"),
+        _coalesce_non_empty(
+            "_epc_Potential energy rating",
+            "_property_Potential energy rating",
+        ).alias("Potential energy rating"),
+        pl.coalesce(
+            _valid_number_expr("_epc_Interior height (m)"),
+            _valid_number_expr("_property_Interior height (m)"),
+        ).alias("Interior height (m)"),
+        pl.coalesce(
+            pl.col("_epc_Construction year"),
+            pl.col("_property_Construction year"),
+        )
+        .cast(pl.UInt16, strict=False)
+        .alias("Construction year"),
+        _coalesce_non_empty(
+            "_epc_Former council house",
+            "_property_Former council house",
+        )
+        .fill_null("No")
+        .alias("Former council house"),
+        # The remaining attributes only come from the historical property match.
+        pl.col("_property_Is construction date approximate").alias(
+            "Is construction date approximate"
+        ),
+        pl.col("_property_Listed building").fill_null("No").alias("Listed building"),
+        pl.col("_property_Estimated monthly rent").alias("Estimated monthly rent"),
+        pl.col("_property_Street tree density percentile").alias(
+            "Street tree density percentile"
+        ),
+        # Stored as a fractional year (year + (month - 1) / 12), not a date.
+        _fractional_year_expr("_property_Date of last transaction").alias(
+            "Date of last transaction"
+        ),
+    ]
+
+    df = df.with_columns(with_columns)
+    # Recompute price per sqm from the coalesced floor area; null when the
+    # price or a positive floor area is unavailable.
+    df = df.with_columns(
+        pl.when(
+            pl.col("Asking price").is_not_null()
+            & pl.col("Total floor area (sqm)").is_not_null()
+            & (pl.col("Total floor area (sqm)") > 0)
+        )
+        .then((pl.col("Asking price") / pl.col("Total floor area (sqm)")).round(0))
+        .otherwise(None)
+        .cast(pl.Int32, strict=False)
+        .alias("Asking price per sqm"),
+    ).with_columns(
+        pl.col("Asking price per sqm").alias("Est. price per sqm"),
+        pl.col("Asking price per sqm").alias("Price per sqm"),
+    )
+
+    return df
+
+
+def _drop_internal_columns(df: pl.DataFrame) -> pl.DataFrame:
+    internal_prefixes = ("_property_", "_epc_")
+    internal_exact = {
+        "_listing_idx",
+        "_listing_match_address",
+        "_listing_match_postcode",
+        "_original_postcode",
+        "_property_row",
+        "_epc_row",
+        "lat_postcode",
+        "lon_postcode",
+    }
+    drop_cols = [
+        column
+        for column in df.columns
+        if column in internal_exact or column.startswith(internal_prefixes)
+    ]
+    return df.drop(drop_cols, strict=False)
+
+
+def build_enriched_actual_listings(
+    listings_path: Path,
+    properties_path: Path,
+    postcode_features_path: Path,
+    arcgis_path: Path,
+    output_path: Path,
+    *,
+    epc_path: Path | None = None,
+) -> pl.DataFrame:
+    """Enrich scraped listings and write them to *output_path* as parquet.
+
+    Steps: load listings, fuzzy-match them to historical properties, join
+    postcode features, optionally fuzzy-match them to EPC certificates (when
+    *epc_path* is given), coalesce the feature columns and drop internal
+    helper columns. Progress is reported with `print`.
+
+    Returns the enriched frame that was written.
+    """
+    print(f"Loading listings from {listings_path}...")
+    listings = _read_listings(listings_path, arcgis_path)
+    # Distinct non-null postcode keys, used to pre-filter candidate sources.
+    listing_postcodes = (
+        listings.select("_listing_match_postcode")
+        .drop_nulls()
+        .unique()
+        .to_series()
+        .to_list()
+    )
+    print(f"Listings: {listings.height}; unique postcodes: {len(listing_postcodes)}")
+
+    print(f"Loading property candidates from {properties_path}...")
+    property_candidates = _load_property_candidates(properties_path, listing_postcodes)
+    print(f"Property candidates: {property_candidates.height}")
+    property_matches = _match_properties(listings, property_candidates)
+    print(f"Historical property matches: {property_matches.height}")
+    property_match_frame = _property_match_frame(property_matches, property_candidates)
+
+    enriched = _join_postcode_features(listings, postcode_features_path)
+    # NOTE(review): with no matches only the status column is added, so the
+    # other "Historical property match ..." columns are absent from the output.
+    if not property_match_frame.is_empty():
+        enriched = enriched.join(property_match_frame, on="_listing_idx", how="left")
+    else:
+        enriched = enriched.with_columns(
+            pl.lit(None, dtype=pl.Utf8).alias("Historical property match status")
+        )
+
+    if epc_path is not None:
+        # Scratch directory handed to the EPC loader; removed on exit.
+        with tempfile.TemporaryDirectory(
+            prefix="actual_listing_epc_", dir=local_tmp_dir()
+        ) as tmpdir:
+            print(f"Loading EPC candidates from {epc_path}...")
+            epc_candidates = _load_epc_candidates(
+                epc_path, listing_postcodes, Path(tmpdir)
+            )
+            print(f"EPC candidates: {epc_candidates.height}")
+            epc_matches = _match_epc(listings, epc_candidates)
+            print(f"EPC matches: {epc_matches.height}")
+            epc_match_frame = _epc_match_frame(epc_matches, epc_candidates)
+            if not epc_match_frame.is_empty():
+                enriched = enriched.join(epc_match_frame, on="_listing_idx", how="left")
+            else:
+                enriched = enriched.with_columns(
+                    pl.lit(None, dtype=pl.Utf8).alias("EPC match status")
+                )
+    else:
+        enriched = enriched.with_columns(
+            pl.lit(None, dtype=pl.Utf8).alias("EPC match status")
+        )
+
+    # Guarantee every prefixed column exists before the coalescing step.
+    enriched = _ensure_prefixed_columns(
+        enriched, PROPERTY_CANDIDATE_COLUMNS, "_property_"
+    )
+    enriched = _ensure_prefixed_columns(enriched, EPC_ENRICHMENT_COLUMNS, "_epc_")
+    enriched = _coalesce_feature_columns(enriched)
+    enriched = _drop_internal_columns(enriched)
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    enriched.write_parquet(output_path)
+    size_mb = output_path.stat().st_size / (1024 * 1024)
+    print(
+        f"Wrote {enriched.height} enriched listings to {output_path} ({size_mb:.1f} MB)"
+    )
+    return enriched
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Build a pre-enriched actual-listings parquet for the server"
+    )
+    parser.add_argument(
+        "--listings",
+        type=Path,
+        default=Path("finder/data/online_listings_buy.parquet"),
+        help="Input scraped listings parquet",
+    )
+    parser.add_argument(
+        "--properties",
+        type=Path,
+        default=Path("property-data/properties.parquet"),
+        help="Historical properties parquet",
+    )
+    parser.add_argument(
+        "--postcode-features",
+        type=Path,
+        default=Path("property-data/postcode.parquet"),
+        help="Postcode feature parquet",
+    )
+    parser.add_argument(
+        "--arcgis",
+        type=Path,
+        default=Path("property-data/arcgis_data.parquet"),
+        help="ArcGIS/NSPL postcode parquet used for terminated-postcode remapping",
+    )
+    parser.add_argument(
+        "--epc",
+        type=Path,
+        default=Path("manual-data/domestic-csv.zip"),
+        help="Optional EPC certificates CSV/zip for direct listing-to-EPC fuzzy matching",
+    )
+    parser.add_argument(
+        "--no-epc",
+        action="store_true",
+        help="Skip direct EPC matching even when --epc exists",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path("finder/data/online_listings_buy_enriched.parquet"),
+        help="Output enriched listings parquet",
+    )
+    args = parser.parse_args()
+
+    epc_path = None if args.no_epc else args.epc
+    if epc_path is not None and not epc_path.exists():
+        print(
+            f"EPC source not found at {epc_path}; continuing without direct EPC matching"
+        )
+        epc_path = None
+
+    build_enriched_actual_listings(
+        listings_path=args.listings,
+        properties_path=args.properties,
+        postcode_features_path=args.postcode_features,
+        arcgis_path=args.arcgis,
+        epc_path=epc_path,
+        output_path=args.output,
+    )
+
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -22,6 +22,7 @@ LISTED_BUILDING_FEATURE = "Listed building"
 LISTED_BUILDING_MATCH_RADIUS_M = 250.0
 LISTED_BUILDING_NEAREST_POSTCODES = 3
 LISTED_BUILDING_MIN_MATCH_SCORE = 95
+_UNPUBLISHED_CONSERVATION_AREA_PREFIX = "no data available for publication"

 _IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
@ -429,19 +430,38 @@ def _normalise_crs(crs: object | None) -> str:
    return str(crs) if crs else "EPSG:4326"


+def _is_unpublished_conservation_area_record(name: object) -> bool:
+    """Return True when ``name`` marks an unpublished placeholder record.
+
+    A record counts as a placeholder when ``name`` is a string that, after
+    stripping surrounding whitespace and case-folding, starts with
+    ``_UNPUBLISHED_CONSERVATION_AREA_PREFIX``. Non-string values (including
+    ``None``) are never treated as placeholders.
+    """
+    return (
+        isinstance(name, str)
+        and name.strip().casefold().startswith(_UNPUBLISHED_CONSERVATION_AREA_PREFIX)
+    )
+
+
 def _load_conservation_area_geometries(
     conservation_areas_path: Path,
 ) -> tuple[list[BaseGeometry], str]:
+    """Load usable conservation-area geometries and the layer's CRS.
+
+    Reads the ``NAME`` attribute and the geometry column with
+    ``pyogrio.read_arrow``. Records whose name is flagged by
+    ``_is_unpublished_conservation_area_record`` are skipped and counted;
+    null or empty geometries are dropped silently.
+
+    Returns:
+        ``(geometries, crs)`` where ``crs`` is the metadata CRS passed through
+        ``_normalise_crs``.
+
+    Raises:
+        ValueError: if no geometry survives the filtering.
+    """
-    metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=[])
+    metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=["NAME"])
+    # Fall back to the table's last column when the metadata does not name the
+    # geometry column.
     geometry_name = metadata.get("geometry_name") or table.column_names[-1]
+    names = table["NAME"].combine_chunks().to_pylist()
     geometries = []
-    for geom in from_wkb(table[geometry_name].combine_chunks().to_pylist()):
-        if geom is not None and not geom.is_empty:
+    skipped_unpublished = 0
+    # strict=True makes zip raise if names and geometries ever differ in length.
+    for name, geom in zip(
+        names, from_wkb(table[geometry_name].combine_chunks().to_pylist()), strict=True
+    ):
+        if _is_unpublished_conservation_area_record(name):
+            skipped_unpublished += 1
+        elif geom is not None and not geom.is_empty:
             geometries.append(geom)
     if not geometries:
         raise ValueError(
             f"{conservation_areas_path} does not contain any usable polygon geometries"
         )
+    # Only reported when at least one placeholder was skipped; not reached when
+    # the ValueError above is raised.
+    if skipped_unpublished:
+        print(
+            "Skipped "
+            f"{skipped_unpublished} Historic England unpublished conservation-area "
+            "placeholder polygons"
+        )
     return geometries, _normalise_crs(metadata.get("crs"))


--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@ -25,7 +25,7 @@ GREENSPACE_PARK_FUNCTIONS = {
 }

 GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
-DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
+DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure", "Health"}
 DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
 DYNAMIC_FILTER_EXCLUDED_CATEGORIES = {"Park"}

--- a/pipeline/transform/test_enrich_actual_listings.py
+++ b/pipeline/transform/test_enrich_actual_listings.py
@ -0,0 +1,143 @@
+from pathlib import Path
+
+import polars as pl
+
+from pipeline.transform.enrich_actual_listings import build_enriched_actual_listings
+
+
+def test_build_enriched_actual_listings_joins_postcode_and_property_features(
+    tmp_path: Path,
+) -> None:
+    """End-to-end check of ``build_enriched_actual_listings`` on one listing.
+
+    The listing sits on postcode "AA1 1AB", which the ArcGIS fixture marks as
+    terminated (non-null ``doterm``); the property and postcode-feature
+    fixtures only know the live postcode "AA1 1AA". The assertions pin that
+    the output row carries "AA1 1AA", is matched to the historical property,
+    and picks up postcode and property features. EPC matching is disabled
+    (``epc_path=None``).
+    """
+    listings_path = tmp_path / "listings.parquet"
+    properties_path = tmp_path / "properties.parquet"
+    postcode_path = tmp_path / "postcode.parquet"
+    arcgis_path = tmp_path / "arcgis.parquet"
+    output_path = tmp_path / "online_listings_buy_enriched.parquet"
+
+    # Scraped listing: tenure, floor area and price-per-sqm are deliberately
+    # null so the test can see them filled in by the enrichment.
+    pl.DataFrame(
+        {
+            "Bedrooms": [2],
+            "Bathrooms": [1],
+            "Number of bedrooms & living rooms": [3],
+            "lon": [-0.1],
+            "lat": [51.5],
+            "Postcode": ["AA1 1AB"],
+            "Address per Property Register": ["1 High Street"],
+            "Leasehold/Freehold": [None],
+            "Property type": ["Terraced"],
+            "Property sub-type": ["Terraced"],
+            "Price qualifier": [""],
+            "Total floor area (sqm)": [None],
+            "Listing URL": ["https://example.test/listing"],
+            "Listing features": [["Garden"]],
+            "Listing date": [None],
+            "Listing status": ["For sale"],
+            "Asking price": [300_000],
+            "Asking price per sqm": [None],
+        },
+        schema={
+            "Bedrooms": pl.Int32,
+            "Bathrooms": pl.Int32,
+            "Number of bedrooms & living rooms": pl.Int32,
+            "lon": pl.Float64,
+            "lat": pl.Float64,
+            "Postcode": pl.Utf8,
+            "Address per Property Register": pl.Utf8,
+            "Leasehold/Freehold": pl.Utf8,
+            "Property type": pl.Utf8,
+            "Property sub-type": pl.Utf8,
+            "Price qualifier": pl.Utf8,
+            "Total floor area (sqm)": pl.Float64,
+            "Listing URL": pl.Utf8,
+            "Listing features": pl.List(pl.Utf8),
+            "Listing date": pl.Datetime("us"),
+            "Listing status": pl.Utf8,
+            "Asking price": pl.Int64,
+            "Asking price per sqm": pl.Int32,
+        },
+    ).write_parquet(listings_path)
+
+    # Historical property at the live postcode; same address as the listing
+    # but upper-cased.
+    pl.DataFrame(
+        {
+            "Address per Property Register": ["1 HIGH STREET"],
+            "Postcode": ["AA1 1AA"],
+            "Leasehold/Freehold": ["Freehold"],
+            "Address per EPC": ["1 High Street"],
+            "Current energy rating": ["C"],
+            "Potential energy rating": ["B"],
+            "Total floor area (sqm)": [80.0],
+            "Number of bedrooms & living rooms": [4],
+            "Interior height (m)": [2.4],
+            "Construction year": [1935],
+            "Former council house": ["No"],
+            "Listed building": ["No"],
+            "Estimated monthly rent": [1200.0],
+            "Street tree density percentile": [75.0],
+            "Property type": ["Terraced"],
+            "Estimated current price": [310_000.0],
+        },
+        schema={
+            "Address per Property Register": pl.Utf8,
+            "Postcode": pl.Utf8,
+            "Leasehold/Freehold": pl.Utf8,
+            "Address per EPC": pl.Utf8,
+            "Current energy rating": pl.Utf8,
+            "Potential energy rating": pl.Utf8,
+            "Total floor area (sqm)": pl.Float64,
+            "Number of bedrooms & living rooms": pl.Int32,
+            "Interior height (m)": pl.Float64,
+            "Construction year": pl.UInt16,
+            "Former council house": pl.Utf8,
+            "Listed building": pl.Utf8,
+            "Estimated monthly rent": pl.Float32,
+            "Street tree density percentile": pl.Float32,
+            "Property type": pl.Utf8,
+            "Estimated current price": pl.Float64,
+        },
+    ).write_parquet(properties_path)
+
+    # Postcode-level features exist only for the live postcode.
+    pl.DataFrame(
+        {
+            "Postcode": ["AA1 1AA"],
+            "Income Score": [82.5],
+            "Within conservation area": ["Yes"],
+        }
+    ).write_parquet(postcode_path)
+
+    # ArcGIS/NSPL fixture: "AA1 1AA" is live (doterm null), "AA1 1AB" has a
+    # termination date and sits a few metres away.
+    pl.DataFrame(
+        {
+            "pcds": ["AA1 1AA", "AA1 1AB"],
+            "ctry25cd": ["E92000001", "E92000001"],
+            "doterm": [None, "202401"],
+            "east1m": [100.0, 105.0],
+            "north1m": [100.0, 105.0],
+        },
+        schema={
+            "pcds": pl.Utf8,
+            "ctry25cd": pl.Utf8,
+            "doterm": pl.Utf8,
+            "east1m": pl.Float64,
+            "north1m": pl.Float64,
+        },
+    ).write_parquet(arcgis_path)
+
+    result = build_enriched_actual_listings(
+        listings_path=listings_path,
+        properties_path=properties_path,
+        postcode_features_path=postcode_path,
+        arcgis_path=arcgis_path,
+        output_path=output_path,
+        epc_path=None,
+    )
+
+    row = result.row(0, named=True)
+    assert output_path.exists()
+    # Terminated listing postcode is expected to come back as the live one.
+    assert row["Postcode"] == "AA1 1AA"
+    assert row["Historical property match status"] == "matched"
+    assert row["Income Score"] == 82.5
+    assert row["Within conservation area"] == "Yes"
+    assert row["Leasehold/Freehold"] == "Freehold"
+    assert row["Total floor area (sqm)"] == 80.0
+    # 300_000 asking price / 80 sqm taken from the matched property.
+    assert row["Asking price per sqm"] == 3750
+    # Expected value equals the asking price, not the property fixture's
+    # 310_000 -- presumably the asking price overrides it; confirm against
+    # build_enriched_actual_listings.
+    assert row["Estimated current price"] == 300_000
+    assert row["Current energy rating"] == "C"
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -1,14 +1,17 @@
 import polars as pl
+import pyarrow as pa
 import pytest
-from shapely import box
+from shapely import box, to_wkb

 from pipeline.transform.merge import (
    _AREA_COLUMNS,
    CONSERVATION_AREA_FEATURE,
    LISTED_BUILDING_FEATURE,
    TREE_DENSITY_FEATURE,
+    _is_unpublished_conservation_area_record,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
+    _load_conservation_area_geometries,
    _matched_listed_building_flags,
    _postcode_conservation_area_flags,
    _postcode_listed_building_candidates,
@ -82,6 +85,45 @@ def test_postcode_conservation_area_flags_marks_point_membership() -> None:
    ]


+def test_unpublished_conservation_area_records_are_identified() -> None:
+    """The placeholder name is flagged; a real name and ``None`` are not."""
+    assert _is_unpublished_conservation_area_record(
+        "No data available for publication by HE"
+    )
+    assert not _is_unpublished_conservation_area_record("Bloomsbury")
+    assert not _is_unpublished_conservation_area_record(None)
+
+
+def test_load_conservation_area_geometries_skips_unpublished_placeholders(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path,
+) -> None:
+    """Only the named area survives; the placeholder polygon is dropped.
+
+    ``pyogrio.read_arrow`` is replaced with a fake returning a two-row Arrow
+    table, so no file is read from ``tmp_path``.
+    """
+    real_area = box(0, 0, 1, 1)
+    # Much larger box that fully contains ``real_area``.
+    placeholder_area = box(-100, -100, 100, 100)
+
+    def fake_read_arrow(path, columns):
+        # Also pins that the loader requests exactly the NAME column.
+        assert path == tmp_path / "conservation_areas.gpkg"
+        assert columns == ["NAME"]
+        table = pa.table(
+            {
+                "NAME": [
+                    "Central Village",
+                    "No data available for publication by HE",
+                ],
+                "SHAPE": to_wkb([real_area, placeholder_area]),
+            }
+        )
+        return {"geometry_name": "SHAPE", "crs": "EPSG:4326"}, table
+
+    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow)
+
+    geometries, crs = _load_conservation_area_geometries(
+        tmp_path / "conservation_areas.gpkg"
+    )
+
+    assert crs == "EPSG:4326"
+    assert geometries == [real_area]
+
+
 def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
    listed_points = pl.DataFrame(
        {
--- a/pipeline/transform/test_poi_proximity.py
+++ b/pipeline/transform/test_poi_proximity.py
@ -37,6 +37,7 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
    assert set(display_names.values()) == {
        "Bus stop",
        "Café",
+        "Pharmacy",
        "Rail station",
        "Restaurant",
        "Tesco",
@ -44,7 +45,6 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
    assert "poi_waitrose" not in groups
    assert "poi_park" not in groups
    assert "poi_school" not in groups
-    assert "poi_pharmacy" not in groups


 def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -1316,17 +1316,122 @@ def transform_grocery_retail_points(
    ).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")


-def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
-    """Convert the GIAS register parquet into POI rows with school metadata."""
-    return pl.scan_parquet(gias_path).select(
+# Icon-category label -> emoji for school POIs. The keys are exactly the labels
+# _school_icon_category_expr() can emit; transform_gias_schools() maps them
+# with replace_strict and no default, so every label needs an entry here.
+# pipeline/download/map_assets.py also iterates the values when collecting
+# twemoji codes.
+SCHOOL_ICON_CATEGORIES: dict[str, str] = {
+    "Nursery school": "🧸",
+    "Primary school": "🎒",
+    "Secondary school": "🏫",
+    "All-through school": "🏫",
+    "Sixth form": "📚",
+    "Further education college": "📚",
+    "University": "🎓",
+    "Special school": "🤝",
+    "School": "🏫",
+}
+
+
+def _school_icon_category_expr() -> pl.Expr:
+    """Build the expression that picks an icon category for a GIAS row.
+
+    Reads the ``type_group``, ``phase`` and ``age_range`` columns. Branches
+    are evaluated in order and the first match wins: type_group decides for
+    universities, special schools and colleges (which span multiple phases);
+    otherwise phase determines the bucket. For independent and other
+    non-statutory schools where GIAS leaves phase null, the age_range bounds
+    are used so they still split into the right pill. Rows matching nothing
+    get the generic "School" label.
+    """
+    # GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
+    # primary") so we normalise before matching.
+    phase = pl.col("phase").str.to_lowercase()
+    # age_range is "<min>–<max>" separated by an en dash (U+2013, not an ASCII
+    # hyphen); both ends may be missing. The non-strict casts turn a missing or
+    # non-numeric bound into null, so comparisons on it never match.
+    age_parts = pl.col("age_range").str.split_exact("–", 1)
+    min_age = age_parts.struct.field("field_0").cast(pl.Int32, strict=False)
+    max_age = age_parts.struct.field("field_1").cast(pl.Int32, strict=False)
+    return (
+        pl.when(pl.col("type_group") == "Universities")
+        .then(pl.lit("University"))
+        .when(pl.col("type_group") == "Special schools")
+        .then(pl.lit("Special school"))
+        .when(pl.col("type_group") == "Colleges")
+        .then(pl.lit("Further education college"))
+        .when(phase == "nursery")
+        .then(pl.lit("Nursery school"))
+        .when(phase.is_in(["primary", "middle deemed primary"]))
+        .then(pl.lit("Primary school"))
+        .when(phase.is_in(["secondary", "middle deemed secondary"]))
+        .then(pl.lit("Secondary school"))
+        .when(phase == "all-through")
+        .then(pl.lit("All-through school"))
+        .when(phase.is_in(["16 plus", "sixth form"]))
+        .then(pl.lit("Sixth form"))
+        # Age-range fallback for null-phase rows (≈3k Independents + Academies
+        # GIAS doesn't classify by phase). It is also reached by any row whose
+        # phase value matched none of the branches above.
+        .when(max_age <= 5)
+        .then(pl.lit("Nursery school"))
+        .when(min_age >= 16)
+        .then(pl.lit("Sixth form"))
+        .when((min_age <= 6) & (max_age >= 16))
+        .then(pl.lit("All-through school"))
+        .when(max_age <= 11)
+        .then(pl.lit("Primary school"))
+        .when(min_age >= 10)
+        .then(pl.lit("Secondary school"))
+        .otherwise(pl.lit("School"))
+    )
+
+
+# OEIF overall-effectiveness grade (as the string stored in the Ofsted parquet)
+# -> display label. Looked up key by key in _load_ofsted_ratings().
+OFSTED_OEIF_LABELS = {
+    "1": "Outstanding",
+    "2": "Good",
+    "3": "Requires improvement",
+    "4": "Inadequate",
+}
+
+
+def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
+    """Project the latest OEIF effectiveness grade to a human-readable label,
+    keyed by URN so it can be joined onto the GIAS register.
+
+    Grades "1"-"4" map to the labels in ``OFSTED_OEIF_LABELS``; "Not judged"
+    (post-2025 reform schools that only have a report card) is preserved
+    verbatim; null grades and any other value drop out.
+
+    Returns a LazyFrame with two columns: ``urn`` (Int64) and
+    ``ofsted_rating`` (never null).
+    """
+    grade_col = pl.col("Latest OEIF overall effectiveness")
+    # Grades are compared as strings; anything unrecognised falls through to
+    # the null in .otherwise() and is removed by the filter below.
+    label = (
+        pl.when(grade_col == "1")
+        .then(pl.lit(OFSTED_OEIF_LABELS["1"]))
+        .when(grade_col == "2")
+        .then(pl.lit(OFSTED_OEIF_LABELS["2"]))
+        .when(grade_col == "3")
+        .then(pl.lit(OFSTED_OEIF_LABELS["3"]))
+        .when(grade_col == "4")
+        .then(pl.lit(OFSTED_OEIF_LABELS["4"]))
+        .when(grade_col == "Not judged")
+        .then(pl.lit("Not judged"))
+        .otherwise(None)
+    )
+    return (
+        pl.scan_parquet(ofsted_path)
+        .select(
+            # Renamed to lower-case "urn" to serve as the join key; presumably
+            # the GIAS "urn" column is Int64 as well -- confirm against the
+            # GIAS parquet schema.
+            pl.col("URN").cast(pl.Int64).alias("urn"),
+            label.alias("ofsted_rating"),
+        )
+        .filter(pl.col("ofsted_rating").is_not_null())
+    )
+
+
+def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
+    """Convert the GIAS register parquet into POI rows with school metadata.
+    Ofsted ratings are joined by URN so each school carries its latest OEIF
+    overall effectiveness grade (Outstanding/Good/Requires improvement/
+    Inadequate/Not judged), surfaced in the map popup."""
+    icon_category_expr = _school_icon_category_expr()
+    emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
+    ofsted = _load_ofsted_ratings(ofsted_path)
+    # category mirrors icon_category so the dashboard renders one toggle per
+    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
+    # instead of bundling every GIAS row under a single "School" pill.
+    return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
        pl.col("name"),
-        pl.lit("School").alias("category"),
-        pl.lit("School").alias("icon_category"),
+        icon_category_expr.alias("category"),
+        icon_category_expr.alias("icon_category"),
        pl.lit("Education").alias("group"),
        pl.col("lat").cast(pl.Float64),
        pl.col("lng").cast(pl.Float64),
-        pl.lit("🏫").alias("emoji"),
+        emoji_expr.alias("emoji"),
        pl.col("phase").alias("school_phase"),
        pl.col("type").alias("school_type"),
        pl.col("type_group").alias("school_type_group"),
@ -1346,6 +1451,7 @@ def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
        pl.col("website").alias("school_website"),
        pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
        pl.col("head_name").alias("school_head_name"),
+        pl.col("ofsted_rating").alias("school_ofsted_rating"),
    )


@ -1355,6 +1461,7 @@ def transform(
    boundary_path: Path,
    grocery_retail_points_path: Path,
    gias_path: Path,
+    ofsted_path: Path,
 ) -> pl.LazyFrame:
    lf = pl.scan_parquet(input_path)

@ -1420,7 +1527,12 @@ def transform(

    grocery_df = pl.read_parquet(grocery_retail_points_path)
    grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
-    frames = [lf, naptan, grocery_pois.lazy(), transform_gias_schools(gias_path)]
+    frames = [
+        lf,
+        naptan,
+        grocery_pois.lazy(),
+        transform_gias_schools(gias_path, ofsted_path),
+    ]

    return pl.concat(frames, how="diagonal_relaxed")

@ -1453,6 +1565,12 @@ def main():
        required=True,
        help="GIAS schools register parquet (replaces OSM schools)",
    )
+    parser.add_argument(
+        "--ofsted",
+        type=Path,
+        required=True,
+        help="Ofsted latest-inspections parquet (provides per-URN ratings)",
+    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output filtered POIs parquet file"
    )
@ -1464,6 +1582,7 @@ def main():
        args.boundary,
        args.grocery_retail_points,
        args.gias,
+        args.ofsted,
    ).collect(engine="streaming")

    df.write_parquet(args.output)
--- a/pipeline/transform/tree_overlay_tiles.py
+++ b/pipeline/transform/tree_overlay_tiles.py
@ -219,6 +219,8 @@ def build_tree_overlay_tiles(
                str(max_zoom),
                "--drop-smallest-as-needed",
                "--extend-zooms-if-still-dropping",
+                "--temporary-directory",
+                tmp,
                str(ndjson_path),
            ],
            check=True,