Improve data pipeline

2026-06-01 20:10:03 +01:00 · 2026-06-01 20:10:03 +01:00 · f99bd4e5c9
commit f99bd4e5c9
parent e8345cbdc1
36 changed files with 966 additions and 129 deletions
--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@ -28,6 +28,17 @@ MINOR_CRIME_TYPES = (
    "Other crime",
 )

+# Legacy police.uk crime-type names (pre-2014 taxonomy) mapped to their closest
+# current equivalent. Without this, ~1.9M incidents from 2010-2013 ("Violent
+# crime", "Public disorder and weapons") are unrecognised and silently dropped,
+# which understates pre-2013 serious crime and creates an artificial 2012->2013
+# step in the by-year series. Applied with `.replace` (not `.replace_strict`) so
+# unmapped current types pass through unchanged.
+LEGACY_CRIME_TYPE_ALIASES = {
+    "Violent crime": "Violence and sexual offences",
+    "Public disorder and weapons": "Public order",
+}
+

 def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    csvs = sorted(crime_dir.rglob("*.csv"))
@ -96,6 +107,7 @@ def transform_crime(
            & pl.col("Crime type").is_not_null()
            & (pl.col("Crime type") != "")
        )
+        .with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
        .group_by("LSOA code", "Month", "Crime type")
        .agg((pl.col("_weight").first() * pl.len()).alias("count"))
        .group_by("LSOA code", "Crime type")
@ -147,7 +159,10 @@ def _write_crime_by_year(
        & (pl.col("LSOA code") != "")
        & pl.col("Crime type").is_not_null()
        & (pl.col("Crime type") != "")
-    ).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
+    ).with_columns(
+        pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
+        pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
+    )

    # Months observed *anywhere* in the dataset for each year (annualisation denominator).
    # Using crime-type-specific months would over-scale years where a rare type appears
--- a/pipeline/transform/crime_hotspot_tiles.py
+++ b/pipeline/transform/crime_hotspot_tiles.py
@ -17,7 +17,7 @@ from pathlib import Path
 import polars as pl

 from pipeline.local_temp import local_tmp_dir
-from pipeline.transform.crime import find_street_crime_csvs
+from pipeline.transform.crime import LEGACY_CRIME_TYPE_ALIASES, find_street_crime_csvs


 def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
@ -80,6 +80,10 @@ def _write_geojsonseq(csvs: list[Path], output_path: Path) -> tuple[int, int]:
        .drop_nulls(["lon", "lat"])
        .filter(pl.col("lon").is_between(-9.5, 5.0))
        .filter(pl.col("lat").is_between(49.0, 57.0))
+        # Canonicalise any legacy pre-2014 type names so the heatmap's crime_type
+        # values always match the frontend's canonical filter list (a no-op for
+        # the recent months this overlay normally covers).
+        .with_columns(pl.col("crime_type").replace(LEGACY_CRIME_TYPE_ALIASES))
        .group_by("lon", "lat", "month", "crime_type")
        .len()
        .rename({"len": "count"})
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@ -44,6 +44,7 @@ import shapely
 from pyproj import Transformer

 from pipeline.transform.crime import (
+    LEGACY_CRIME_TYPE_ALIASES,
    MINOR_CRIME_TYPES,
    SERIOUS_CRIME_TYPES,
    find_street_crime_csvs,
@ -150,6 +151,11 @@ def _accumulate_counts(
                & (pl.col("Crime type") != "")
                & pl.col("year").is_in(years)
            )
+            # Canonicalise legacy pre-2014 crime-type names ("Violent crime",
+            # "Public disorder and weapons") to their current equivalents before
+            # indexing, so ~1.9M historical incidents are counted instead of
+            # dropped. `.replace` leaves current types unchanged.
+            .with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
            # Map crime types to indices with default=None so an unrecognised
            # type yields a null index we can *report* rather than silently drop
            # (the legacy LSOA path surfaced unknown types via its dynamic pivot).
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -18,11 +18,49 @@ from ..utils import (
    normalize_postcode_key,
 )

-
 pl.Config.set_tbl_cols(-1)

 RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
 MIN_PRICE = 50_000
+
+# Plausible construction-year range; band-derived years outside it (e.g. OCR
+# noise like 1012 or 2202) are nulled rather than published.
+MIN_BUILD_YEAR = 1700
+MAX_BUILD_YEAR = 2030
+
+
+def epc_band_to_year(band: pl.Expr) -> pl.Expr:
+    """Map an EPC construction age band to a single representative build year.
+
+    EPC age bands are ranges (e.g. ``1950-1966``); we use the band MIDPOINT
+    (1958) rather than the lower bound, which previously biased every band-derived
+    year ~10-15 years too young. Open-ended lower bands (``before 1900``) are too
+    wide to pin to a year and return null. Single-year / ``... onwards`` bands use
+    that year. Already-numeric inputs (a year produced by an earlier call) pass
+    through unchanged. Years outside [MIN_BUILD_YEAR, MAX_BUILD_YEAR] are nulled.
+    """
+    text = (
+        band.cast(pl.Utf8)
+        .str.replace("England and Wales: ", "")
+        .str.replace(" onwards", "")
+    )
+    low = text.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
+    high = text.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(pl.Int32, strict=False)
+    year = (
+        pl.when(text.str.starts_with("before "))
+        .then(None)
+        .when(high.is_not_null())
+        .then(((low + high) / 2).round(0).cast(pl.Int32))
+        .otherwise(low)
+    )
+    return (
+        pl.when((year >= MIN_BUILD_YEAR) & (year <= MAX_BUILD_YEAR))
+        .then(year)
+        .otherwise(None)
+        .cast(pl.UInt16, strict=False)
+    )
+
+
 EPC_SOURCE_COLUMNS = [
    "address",
    "postcode",
@ -410,13 +448,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat

    # For new-builds (old_new == "Y"), use the first transaction date year as
    # the exact construction date; otherwise fall back to the EPC age band.
-    epc_band_year = (
-        pl.col("construction_age_band")
-        .str.replace("England and Wales: ", "")
-        .str.replace(" onwards", "")
-        .str.extract(r"(\d{4})", 1)
-        .cast(pl.UInt16, strict=False)
-    )
+    epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
    transfer_year = (
        pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
    )
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -17,7 +17,11 @@ from shapely.strtree import STRtree
 from thefuzz import fuzz

 from pipeline.local_temp import local_tmp_dir
-from pipeline.transform.join_epc_pp import _scan_epc_certificates
+from pipeline.transform.join_epc_pp import _scan_epc_certificates, epc_band_to_year
+from pipeline.transform.price_estimation.knn import (
+    MAX_COMPARABLE_PSM,
+    MIN_COMPARABLE_PSM,
+)
 from pipeline.utils.fuzzy_join import (
    normalize_address_key,
    normalize_postcode_key,
@ -59,7 +63,7 @@ _AREA_COLUMNS = [
    "Air Quality and Road Safety Score",
    # Ethnicity
    "% South Asian",
-    "% East Asian",
+    "% East/SE Asian",
    "% Black",
    "% Mixed",
    "% White",
@ -1060,14 +1064,10 @@ def _canonical_epc_property_type_expr() -> pl.Expr:


 def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
-    return (
-        pl.col(column)
-        .cast(pl.Utf8)
-        .str.replace("England and Wales: ", "")
-        .str.replace(" onwards", "")
-        .str.extract(r"(\d{4})", 1)
-        .cast(pl.UInt16, strict=False)
-    )
+    # Use the shared band->midpoint-year mapping so the direct-EPC / listings
+    # path matches join_epc_pp (band midpoint, not lower bound; 'before 1900' and
+    # implausible years -> null). Already-numeric inputs pass through unchanged.
+    return epc_band_to_year(pl.col(column))


 def _address_score(query: str, candidate: str | None) -> int:
@ -1956,7 +1956,9 @@ def _build(

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages.  Tiers: Gigabit ≥1000, UFBB ≥300,
-    # UFBB(100) ≥100, SFBB ≥30 Mbps.  Stored as string enum.
+    # UFBB(100) ≥100, SFBB ≥30 Mbps.  Stored as a numeric (UInt16) Mbps value so
+    # it sorts/filters correctly; null (not a fabricated 10) when no availability
+    # tier is present, so "no data" is distinguishable from a genuine 10 Mbps.
    broadband = (
        pl.scan_parquet(broadband_path)
        .select(
@ -1969,13 +1971,12 @@ def _build(
            .then(100)
            .when(pl.col("SFBB availability (% premises)") > 0)
            .then(30)
-            .otherwise(10)
+            .otherwise(None)
            .cast(pl.UInt16)
            .alias("max_download_speed"),
        )
        .group_by("bb_postcode")
        .agg(pl.col("max_download_speed").max())
-        .with_columns(pl.col("max_download_speed").cast(pl.Utf8))
    )
    area_side_tables = {
        "iod": iod,
@ -2052,9 +2053,20 @@ def _build(
        .otherwise(pl.col("current_energy_rating"))
        .alias("current_energy_rating"),
    ).with_columns(
-        (pl.col("latest_price") / pl.col("total_floor_area"))
-        .round(0)
-        .cast(pl.Int32)
+        # Null out implausible per-sqm values (outside the kNN comparable band):
+        # bulk/block transactions divided by a single unit's floor area otherwise
+        # produce figures up to ~£1.5M/sqm.
+        pl.when(
+            (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
+            & (
+                (pl.col("latest_price") / pl.col("total_floor_area"))
+                .is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
+            )
+        )
+        .then(
+            (pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
+        )
+        .otherwise(None)
        .alias("Price per sqm"),
    )
    wide = _finalize_merged_columns(wide)
--- a/pipeline/transform/postcode_boundaries/output.py
+++ b/pipeline/transform/postcode_boundaries/output.py
@ -5,6 +5,7 @@ from pathlib import Path

 from pyproj import Transformer
 from shapely import make_valid, set_precision
+from shapely.errors import GEOSException
 from shapely.geometry import MultiPolygon, Polygon, mapping, shape
 from shapely.ops import transform as transform_geometry
 from shapely.ops import unary_union
@ -43,7 +44,14 @@ def _largest_polygonal(geom) -> Polygon | None:
 def to_wgs84_geojson(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
 ) -> dict | None:
-    """Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
+    """Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
+
+    Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
+    just the intermediate Shapely object: coordinate snapping during
+    serialization can otherwise leave a self-intersecting ring that only shows up
+    once the feature is read back from disk. Any such geometry is repaired with
+    ``make_valid`` before returning so written features are always valid.
+    """
    geom = _largest_polygonal(geom)
    if geom is None:
        return None
@ -55,12 +63,28 @@ def to_wgs84_geojson(

    transformer = _get_to_wgs84()
    wgs84 = transform_geometry(transformer.transform, simplified)
-    wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
+    try:
+        wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
+    except GEOSException:
+        # Precision snapping can fail on pathological geometries; fall back to a
+        # plain validity repair without coordinate snapping.
+        wgs84 = make_valid(wgs84)
    wgs84 = _largest_polygonal(wgs84)
    if wgs84 is None:
        return None

-    return mapping(wgs84)
+    geojson_dict = mapping(wgs84)
+
+    # The geometry that actually reaches disk is the GeoJSON dict, so validate
+    # *that* (not the pre-serialization object) and repair if needed.
+    round_trip = shape(geojson_dict)
+    if round_trip.is_empty or not round_trip.is_valid:
+        round_trip = _largest_polygonal(make_valid(round_trip))
+        if round_trip is None or round_trip.is_empty:
+            return None
+        geojson_dict = mapping(round_trip)
+
+    return geojson_dict


 def _fill_holes(geom):
@ -119,7 +143,11 @@ def merge_fragments(
            pre_green = combined
            combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
            combined = _largest_polygon(combined)
-            combined = _fill_holes(combined)
+            # Do NOT _fill_holes here: interior holes carved by the greenspace
+            # subtraction (lakes, enclosed parks) are intentional, not artifacts.
+            # Filling them would re-add the removed area and negate the
+            # subtraction. Artifact holes from the INSPIRE+Voronoi+make_valid
+            # chain were already removed by the _fill_holes above (pre-subtraction).
            # Revert if subtraction + fragment selection lost >90% of area
            if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
                combined = pre_green
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@ -893,3 +893,54 @@ class TestSubtractGreenspace:
        result = subtract_greenspace(postcode, tree, geoms)
        # 80% < 90% cap, so subtraction should happen
        assert result.area == pytest.approx(2000, rel=0.01)
+
+
+class TestToWgs84GeojsonValidity:
+    """to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""
+
+    def test_geojson_round_trips_to_valid_geometry(self):
+        from shapely.geometry import shape
+
+        geojson = to_wgs84_geojson(box(530000, 180000, 530100, 180100))
+        assert geojson is not None
+        rt = shape(geojson)
+        assert not rt.is_empty
+        assert rt.is_valid
+
+    def test_written_district_features_are_all_valid(self, tmp_path):
+        from shapely.geometry import shape
+
+        postcodes = {
+            "AA1 1AA": box(530000, 180000, 530100, 180100),
+            "AA1 1AB": MultiPolygon(
+                [
+                    box(530200, 180000, 530250, 180050),
+                    box(530200, 180060, 530250, 180110),
+                ]
+            ),
+        }
+        assert write_district_geojson(postcodes, tmp_path) == 1
+        collection = json.loads((tmp_path / "units" / "AA1.geojson").read_text())
+        for feature in collection["features"]:
+            geom = shape(feature["geometry"])
+            assert geom.is_valid
+            assert not geom.is_empty
+
+
+class TestGreenspaceHolePreserved:
+    """Interior holes carved by greenspace subtraction must survive merge_fragments
+    (the post-subtraction _fill_holes that previously negated them was removed)."""
+
+    def test_interior_lake_hole_survives_merge_fragments(self):
+        from shapely.strtree import STRtree
+
+        postcode = box(0, 0, 100, 100)  # 10000 sqm
+        lake = box(30, 30, 70, 70)  # 1600 sqm fully-interior hole (16% removal)
+        result = merge_fragments(
+            [("TEST1", postcode)],
+            greenspace_tree=STRtree([lake]),
+            greenspace_geoms=[lake],
+        )
+        merged = result["TEST1"]
+        assert len(list(merged.interiors)) == 1
+        assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
--- a/pipeline/transform/price_estimation/backtest.py
+++ b/pipeline/transform/price_estimation/backtest.py
@ -67,6 +67,16 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
            .struct.field("price")
            .alias("input_price"),
        )
+        .with_columns(
+            # Date of the input (second-to-last) sale, used by the kNN leakage
+            # filter to exclude the target property's own prior sale from its
+            # comparables. Built from year+month (day defaults to the 1st).
+            pl.date(
+                pl.col("input_year").cast(pl.Int32),
+                pl.col("input_month").cast(pl.Int32),
+                1,
+            ).alias("input_date"),
+        )
        .with_columns(
            (
                pl.col("actual_year").cast(pl.Float64)
--- a/pipeline/transform/price_estimation/estimate.py
+++ b/pipeline/transform/price_estimation/estimate.py
@ -18,6 +18,8 @@ import polars as pl

 from pipeline.transform.price_estimation.knn import (
    KNN_BLEND_WEIGHT,
+    MAX_COMPARABLE_PSM,
+    MIN_COMPARABLE_PSM,
    build_knn_pool,
    knn_median_psm,
 )
@ -31,7 +33,13 @@ from pipeline.transform.price_estimation.utils import (

 MAX_KNN_TO_INDEX_RATIO = 2.0
 MIN_KNN_TO_INDEX_RATIO = 0.5
-MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0
+# Cap the final estimate at this multiple of the last known price as a guard
+# against data errors. Set to ~exp(MAX_LOG_ADJUSTMENT) (~20x) so it is
+# consistent with the log-index clip already applied to the index move: many
+# UK sectors legitimately grew >6x since the 1990s (e.g. parts of inner London
+# 12-14x), so the previous 6x cap truncated genuine appreciation rather than
+# only catching outliers.
+MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 20.0


 def guarded_blend_estimates(
@ -222,11 +230,22 @@ def main():
    ).height
    print(f"  kNN blended: {n_blended:,} of {n_estimated:,} estimates")

-    # Derive estimated price per sqm where both estimated price and floor area exist
+    # Derive estimated price per sqm where both estimated price and floor area
+    # exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
+    # MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
+    # from bulk/block transactions or floor-area errors and are not meaningful
+    # per-unit prices.
+    _est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
    df = df.with_columns(
-        (pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
-        .round(0)
-        .cast(pl.Int32, strict=False)
+        pl.when(
+            pl.col("Estimated current price").is_not_null()
+            & pl.col("Total floor area (sqm)").is_not_null()
+            & (pl.col("Total floor area (sqm)") > 0)
+            & (_est_psm >= MIN_COMPARABLE_PSM)
+            & (_est_psm <= MAX_COMPARABLE_PSM)
+        )
+        .then(_est_psm.round(0).cast(pl.Int32, strict=False))
+        .otherwise(None)
        .alias("Est. price per sqm"),
    )

--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -24,6 +24,7 @@ from pipeline.transform.price_estimation.shrinkage import (
 )
 from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
+    TEMPORAL_SMOOTHNESS_LAMBDA,
    TYPE_GROUPS,
    build_hedonic_features,
    extract_centroids,
@ -165,12 +166,50 @@ def solve_robust_index(
    cols_arr = np.concatenate([col2[mask2], col1[mask1]])
    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])

+    # Temporal smoothness prior: penalise curvature in the year betas with a
+    # second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
+    # encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
+    # zero target). This damps single-year index spikes without flattening
+    # genuine multi-year trends. Betas are ordered by calendar year; the baseline
+    # year (min_year, implicit beta=0) has no column, so the penalty spans the
+    # non-baseline years only. For cells with <3 betas there is no curvature to
+    # penalise and the solve is unchanged.
+    n_pen = 0
+    pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
+    pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
+    if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
+        sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
+        cols_by_year = [c for _, c in sorted(year_to_col.items())]
+        n_pen = n_cols - 2
+        pen_rows = np.repeat(n + np.arange(n_pen), 3)
+        pen_cols = np.empty(n_pen * 3, dtype=np.int64)
+        for k in range(n_pen):
+            pen_cols[3 * k : 3 * k + 3] = (
+                cols_by_year[k],
+                cols_by_year[k + 1],
+                cols_by_year[k + 2],
+            )
+        pen_rows_arr = pen_rows.astype(np.int64)
+        pen_cols_arr = pen_cols
+        pen_vals_arr = np.tile(
+            [sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
+        ).astype(np.float64)
+        pen_b = np.zeros(n_pen, dtype=np.float64)
+    n_total_rows = n + n_pen
+
    weights = base_weights.copy()

    for _ in range(IRLS_ITERATIONS):
        data = signs_arr * weights[rows_arr]
-        A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
-        b = log_ratios * weights
+        if n_pen:
+            all_data = np.concatenate([data, pen_vals_arr])
+            all_rows = np.concatenate([rows_arr, pen_rows_arr])
+            all_cols = np.concatenate([cols_arr, pen_cols_arr])
+            b = np.concatenate([log_ratios * weights, pen_b])
+        else:
+            all_data, all_rows, all_cols = data, rows_arr, cols_arr
+            b = log_ratios * weights
+        A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols))
        betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]

        # Residuals
--- a/pipeline/transform/price_estimation/shrinkage.py
+++ b/pipeline/transform/price_estimation/shrinkage.py
@ -96,8 +96,11 @@ def spatial_smooth(
    for i, sec in enumerate(sectors_with_coords):
        n = counts.get(sec, 0)
        self_w = n / (n + SPATIAL_BLEND_K)
-        if self_w > 0.95:
-            continue  # enough data, skip smoothing
+        if self_w > 0.90:
+            # Enough data, skip smoothing. Relaxed from 0.95 so higher-volume
+            # cells (n ~270-570) that still carry single-year noise get a light
+            # spatial blend, complementing the temporal smoothness prior.
+            continue

        dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
        # Skip self (index 0, distance ~0)
--- a/pipeline/transform/price_estimation/test_knn.py
+++ b/pipeline/transform/price_estimation/test_knn.py
@ -81,8 +81,21 @@ def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
        last_prices=np.array([100_000.0, 100_000.0]),
    )

+    # Property 0: unstable kNN (>2x index) is dropped, index estimate kept.
    assert blended[0] == 120_000.0
-    assert blended[1] == 600_000.0
+    # Property 1: a 10x uplift over the last price is legitimate appreciation and
+    # is no longer truncated (cap raised from 6x to 20x).
+    assert blended[1] == 1_000_000.0
+
+
+def test_guarded_blend_caps_uplift_at_20x_last_price():
+    # 50x index estimate over the last price is capped at the 20x ceiling.
+    blended = guarded_blend_estimates(
+        index_est=np.array([5_000_000.0]),
+        knn_est=np.array([np.nan]),
+        last_prices=np.array([100_000.0]),
+    )
+    assert blended[0] == 2_000_000.0  # 100_000 * 20


 def test_bungalow_is_not_a_dead_price_index_type_group():
@ -92,3 +105,50 @@ def test_bungalow_is_not_a_dead_price_index_type_group():

    assert "Bungalow" not in TYPE_GROUPS
    assert df["type_group"].to_list() == [None, None]
+
+
+def test_temporal_regularization_damps_curvature_without_breaking_solve():
+    """The second-difference prior reduces year-to-year curvature and keeps the
+    index well-formed (all years present, finite, contiguous)."""
+    from pipeline.transform.price_estimation import index as index_mod
+
+    years = np.arange(2010, 2021)
+    true = {y: 0.04 * (y - 2010) for y in years}
+    y1, y2, lr, w = [], [], [], []
+    for y in years[:-1]:  # adjacent-year pairs following a smooth trend
+        y1.append(y)
+        y2.append(y + 1)
+        lr.append(true[y + 1] - true[y])
+        w.append(1.0)
+    # A spurious single-year jump at 2015 (poorly identified curvature spike).
+    y1.append(2014)
+    y2.append(2015)
+    lr.append(0.5)
+    w.append(1.0)
+    y1, y2 = np.array(y1), np.array(y2)
+    lr, w = np.array(lr, float), np.array(w, float)
+
+    def solve(lmbda):
+        original = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
+        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = lmbda
+        try:
+            return index_mod.solve_robust_index(y1, y2, lr, w)
+        finally:
+            index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = original
+
+    unregularised = solve(0.0)
+    regularised = solve(0.2)
+
+    # Index is well-formed for both.
+    assert set(regularised) == set(range(2010, 2021))
+    assert all(np.isfinite(v) for v in regularised.values())
+    assert regularised[2010] == 0.0  # baseline year pinned to 0
+
+    def max_curvature(d):
+        betas = np.array([d[y] for y in sorted(d)])
+        return float(np.abs(np.diff(betas, 2)).max())
+
+    # Regularisation strictly reduces curvature, and never flattens the genuine
+    # uptrend (the index still rises end to end).
+    assert max_curvature(regularised) < max_curvature(unregularised)
+    assert regularised[2020] > regularised[2010]
--- a/pipeline/transform/price_estimation/utils.py
+++ b/pipeline/transform/price_estimation/utils.py
@ -22,6 +22,13 @@ FLAT_TYPES = ["Flats/Maisonettes"]
 TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
 SHRINKAGE_K = 50

+# Temporal regularization for the repeat-sales index: a second-difference
+# (curvature) penalty lambda * sum((beta_t - 2*beta_{t-1} + beta_{t-2})^2) added
+# to the IRLS solve. A mild penalty damps single-year index spikes (which would
+# otherwise distort the estimate of any property whose last sale landed on a
+# noisy year) without flattening genuine multi-year trends.
+TEMPORAL_SMOOTHNESS_LAMBDA = 0.05
+

 def type_group_expr():
    """Polars expression: Property type -> type_group."""
--- a/pipeline/transform/school_proximity.py
+++ b/pipeline/transform/school_proximity.py
@ -15,6 +15,66 @@ SCHOOL_GROUPS = {
 }


+def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
+    """Label good+/outstanding primary & secondary schools for proximity counts.
+
+    Derives a grade ("1" = outstanding, "2" = good) and a proximity ``category``,
+    returning a ``(postcode, category)`` frame.
+
+    Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
+    overall effectiveness" (OEIF = the previous Ofsted Education Inspection
+    Framework). A large and growing share of schools were last inspected under an
+    UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
+    that column is null/"Not judged" for them even when they are demonstrably
+    good — their status lives in "Ungraded inspection overall outcome" ("School
+    remains Good"/"School remains Outstanding", incl. "(Concerns)"/"(Improving)"
+    variants). Filtering on the graded column alone dropped ~7,000 genuinely
+    good/outstanding schools. We fall back to the ungraded outcome, but ONLY when
+    there is no usable graded result (null/"Not judged"), so a genuine grade 3/4
+    is never overridden.
+    """
+    # Cast to Utf8 so the string predicates below are well-defined even if a
+    # column happens to be entirely null (read back as a Null dtype).
+    oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
+    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
+    no_usable_grade = oeif.is_null() | (oeif == "Not judged")
+    graded = (
+        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
+        .with_columns(
+            pl.when(oeif.is_in(["1", "2"]))
+            .then(oeif)
+            .when(
+                no_usable_grade
+                & ungraded.str.starts_with("School remains Outstanding")
+            )
+            .then(pl.lit("1"))
+            .when(no_usable_grade & ungraded.str.starts_with("School remains Good"))
+            .then(pl.lit("2"))
+            .otherwise(None)
+            .alias("_ofsted_grade")
+        )
+        .filter(pl.col("_ofsted_grade").is_not_null())
+    )
+    # Good+ groups include both grade variants; outstanding groups count grade 1.
+    return graded.with_columns(
+        pl.when(pl.col("Ofsted phase") == "Primary")
+        .then(
+            pl.when(pl.col("_ofsted_grade") == "1")
+            .then(pl.lit("outstanding_primary"))
+            .otherwise(pl.lit("good_primary"))
+        )
+        .otherwise(
+            pl.when(pl.col("_ofsted_grade") == "1")
+            .then(pl.lit("outstanding_secondary"))
+            .otherwise(pl.lit("good_secondary"))
+        )
+        .alias("category")
+    ).select(
+        pl.col("Postcode").alias("postcode"),
+        "category",
+    )
+
+
 def main():
    parser = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
@ -30,42 +90,14 @@ def main():
    )
    args = parser.parse_args()

-    # Load Ofsted data: filter to good+ (1, 2) primary/secondary schools.
-    # Post-2025 reform the single "Overall effectiveness" grade was retired;
-    # the legacy 1–4 scale is now carried forward under "Latest OEIF overall
-    # effectiveness" (OEIF = the previous Ofsted Education Inspection
-    # Framework). The new report-card columns use text judgements instead.
-    ofsted = pl.read_parquet(args.ofsted).filter(
-        pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
-        & pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
-    )
+    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")

    print(f"Good+ schools: {len(ofsted):,}")
    print(
        "Outstanding schools: "
-        f"{ofsted.filter(pl.col('Latest OEIF overall effectiveness') == '1').height:,}"
-    )
-
-    # Assign category based on phase and rating. Good+ groups include both
-    # category variants; outstanding groups count grade 1 only.
-    ofsted = ofsted.with_columns(
-        pl.when(pl.col("Ofsted phase") == "Primary")
-        .then(
-            pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
-            .then(pl.lit("outstanding_primary"))
-            .otherwise(pl.lit("good_primary"))
-        )
-        .otherwise(
-            pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
-            .then(pl.lit("outstanding_secondary"))
-            .otherwise(pl.lit("good_secondary"))
-        )
-        .alias("category")
-    ).select(
-        pl.col("Postcode").alias("postcode"),
-        "category",
+        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
    )

    # Join with arcgis to get lat/lng for each school's postcode
--- a/pipeline/transform/test_crime.py
+++ b/pipeline/transform/test_crime.py
@ -226,3 +226,44 @@ def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
    assert burglaries["E01000050"] == [{"year": 2024, "count": 12.0}]
    assert burglaries["E01000051"] == [{"year": 2024, "count": 12.0}]
    assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
+
+
+def test_transform_crime_maps_legacy_crime_types(tmp_path):
+    """Pre-2014 police.uk type names are aliased to current equivalents instead
+    of being dropped."""
+    crime_dir = tmp_path / "crime"
+    month_dir = crime_dir / "2013-01"
+    month_dir.mkdir(parents=True)
+
+    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
+    (month_dir / "2013-01-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                header,
+                "1,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Violent crime,Under investigation,",
+                "2,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Public disorder and weapons,Under investigation,",
+                "3,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Burglary,Under investigation,",
+            ]
+        )
+        + "\n"
+    )
+
+    output = tmp_path / "crime.parquet"
+    by_year_output = tmp_path / "crime_by_year.parquet"
+    transform_crime(crime_dir, output, by_year_output)
+
+    row = pl.read_parquet(output).to_dicts()[0]
+    # Single month -> annualised x12. Legacy names mapped to current columns.
+    assert row["Violence and sexual offences (avg/yr)"] == 12.0
+    assert row["Public order (avg/yr)"] == 12.0
+    assert row["Burglary (avg/yr)"] == 12.0
+    # The legacy names must NOT survive as their own columns.
+    assert "Violent crime (avg/yr)" not in row
+    assert "Public disorder and weapons (avg/yr)" not in row
+
+    by_year = pl.read_parquet(by_year_output).row(0, named=True)
+    serious = {p["year"]: p["count"] for p in by_year["Serious crime (by year)"]}
+    # Serious = Violence and sexual offences (12) + Burglary (12) = 24
+    assert serious[2013] == 24.0
+    minor = {p["year"]: p["count"] for p in by_year["Minor crime (by year)"]}
+    assert minor[2013] == 12.0  # Public order
--- a/pipeline/transform/test_crime_spatial.py
+++ b/pipeline/transform/test_crime_spatial.py
@ -279,3 +279,37 @@ def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
    err = capsys.readouterr().err
    assert "Cyber fraud" in err
    assert "WARNING" in err
+
+
+def test_legacy_crime_types_are_mapped(tmp_path):
+    """Pre-2014 crime-type names are aliased to current equivalents in the
+    spatial transform instead of being dropped as unknown types."""
+    units = tmp_path / "units"
+    _write_boundaries(
+        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
+    )
+
+    crime = tmp_path / "crime"
+    _write_month(
+        crime,
+        "2013-01",
+        [
+            _crime_row("2013-01", 1005, 1005, "Violent crime"),
+            _crime_row("2013-01", 1005, 1005, "Public disorder and weapons"),
+        ],
+    )
+
+    output = tmp_path / "crime_by_postcode.parquet"
+    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
+    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
+
+    row = pl.read_parquet(output).to_dicts()[0]
+    # Single postcode -> area-norm factor 1.0; single month/year -> x12.
+    assert row["Violence and sexual offences (avg/yr)"] == 12.0
+    assert row["Public order (avg/yr)"] == 12.0
+
+    by_year_row = pl.read_parquet(by_year).row(0, named=True)
+    assert by_year_row["Violence and sexual offences (by year)"] == [
+        {"year": 2013, "count": 12.0}
+    ]
+    assert by_year_row["Public order (by year)"] == [{"year": 2013, "count": 12.0}]
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@ -169,7 +169,8 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
            "epc_address": "1 Example Street",
            "current_energy_rating": "C",
            "total_floor_area": 85.0,
-            "construction_age_band": 1950,
+            # Band midpoint of 1950-1966, not the lower bound.
+            "construction_age_band": 1958,
            "was_council_house": "Yes",
        }
    ]
@ -256,3 +257,26 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
            "current_energy_rating": None,
        }
    ]
+
+
+def test_epc_band_to_year_uses_midpoint_and_clamps():
+    import polars as pl
+
+    from pipeline.transform.join_epc_pp import epc_band_to_year
+
+    df = pl.DataFrame(
+        {
+            "b": [
+                "England and Wales: 1950-1966",  # midpoint 1958
+                "1900-1929",  # midpoint 1914
+                "England and Wales: before 1900",  # too wide -> null
+                "2012 onwards",  # single year
+                "1012",  # implausible -> null
+                "2202",  # implausible -> null
+                None,  # null -> null
+                "1958",  # already-numeric-as-string -> pass through
+            ]
+        }
+    )
+    years = df.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()
+    assert years == [1958, 1914, None, 2012, None, None, None, 1958]
--- a/pipeline/transform/test_school_proximity.py
+++ b/pipeline/transform/test_school_proximity.py
@ -0,0 +1,82 @@
+import polars as pl
+
+from pipeline.transform.school_proximity import classify_good_plus_schools
+
+
+def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
+    return {
+        "Postcode": postcode,
+        "Ofsted phase": phase,
+        "Latest OEIF overall effectiveness": oeif,
+        "Ungraded inspection overall outcome": ungraded,
+    }
+
+
+def _classify(rows):
+    result = classify_good_plus_schools(pl.DataFrame(rows))
+    return {(r["postcode"], r["category"]) for r in result.to_dicts()}
+
+
+def test_legacy_oeif_grades_1_and_2_are_kept():
+    rows = [
+        _school("Primary", "1", None, "AA1 1AA"),
+        _school("Primary", "2", None, "AA1 1AB"),
+        _school("Secondary", "1", None, "AA1 1AC"),
+        _school("Secondary", "2", None, "AA1 1AD"),
+    ]
+    assert _classify(rows) == {
+        ("AA1 1AA", "outstanding_primary"),
+        ("AA1 1AB", "good_primary"),
+        ("AA1 1AC", "outstanding_secondary"),
+        ("AA1 1AD", "good_secondary"),
+    }
+
+
+def test_grades_3_and_4_are_excluded():
+    rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
+    assert _classify(rows) == set()
+
+
+def test_ungraded_remains_good_is_recovered_when_no_graded_result():
+    # Null and "Not judged" OEIF fall back to the ungraded outcome.
+    rows = [
+        _school("Primary", None, "School remains Good", "AA1 1AA"),
+        _school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
+        # "(Concerns)"/"(Improving)" variants are still good+.
+        _school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
+        _school(
+            "Secondary",
+            None,
+            "School remains Outstanding (Concerns) - S5 Next",
+            "AA1 1AD",
+        ),
+    ]
+    assert _classify(rows) == {
+        ("AA1 1AA", "good_primary"),
+        ("AA1 1AB", "outstanding_secondary"),
+        ("AA1 1AC", "good_primary"),
+        ("AA1 1AD", "outstanding_secondary"),
+    }
+
+
+def test_ungraded_non_good_outcomes_are_excluded():
+    rows = [
+        _school("Primary", None, "Some aspects not as strong"),
+        _school("Primary", None, "Standards maintained"),
+        _school("Primary", None, None),
+    ]
+    assert _classify(rows) == set()
+
+
+def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
+    # A real grade 3 must not be promoted by an ungraded "remains Good".
+    rows = [_school("Primary", "3", "School remains Good")]
+    assert _classify(rows) == set()
+
+
+def test_non_primary_secondary_phases_excluded():
+    rows = [
+        _school("Nursery", "1", None),
+        _school("Not applicable", "2", None),
+    ]
+    assert _classify(rows) == set()