idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@ -273,6 +273,28 @@ def _write_avg_yr(
    for type_idx, name in enumerate(ALL_CRIME_TYPES):
        data[f"{name} (avg/yr)"] = avg[:, type_idx]

+    # Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
+    # bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
+    # average over the years in which ANY of those types occurred. This keeps the
+    # headline equal to the mean of the "Serious/Minor crime (by year)" bars.
+    # Summing the per-type avg/yr values instead (as the merge previously did)
+    # divides each type by its OWN years-present and overstates the rollup when a
+    # postcode's serious/minor types occur in disjoint years.
+    for rollup_name, rollup_types in (
+        ("Serious crime", SERIOUS_CRIME_TYPES),
+        ("Minor crime", MINOR_CRIME_TYPES),
+    ):
+        rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
+        rollup_counts = counts[:, rollup_idx, :].sum(axis=1)  # (n_postcodes, n_years)
+        rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
+        rollup_years_present = np.clip(
+            (rollup_counts > 0).sum(axis=1), 1, None
+        ).astype(np.float64)
+        rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
+        data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
+            np.float32
+        )
+
    output_path.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(data).write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -106,7 +106,14 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
            .alias("potential_energy_rating"),
            _clean_string("property_type").alias("epc_property_type"),
            _clean_string("built_form").alias("built_form"),
-            _clean_string("inspection_date").alias("inspection_date"),
+            # Parse to a real Date once (unparseable/blank -> null) so dedup can
+            # sort newest-first with nulls_last and _event_year can use dt.year();
+            # a lexicographic string sort would let a null/garbled date win under
+            # Polars' default nulls-first descending order. EPC inspection dates
+            # are ISO (YYYY-MM-DD).
+            _clean_string("inspection_date")
+            .str.to_date(format="%Y-%m-%d", strict=False)
+            .alias("inspection_date"),
            _clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
            _clean_number("number_habitable_rooms", pl.Int16).alias(
                "number_habitable_rooms"
@ -247,9 +254,11 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
    )

-    # Dedup fork: keep latest certificate per property (existing logic)
+    # Dedup fork: keep latest certificate per property. inspection_date is a typed
+    # Date (see _select_epc_columns); nulls_last keeps a real-dated cert ahead of a
+    # null/unparseable-dated one so the genuinely newest certificate is chosen.
    epc = (
-        epc_base.sort("inspection_date", descending=True)
+        epc_base.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_epc_match_address", "_epc_match_postcode")
        .first()
        .drop("tenure")
@ -303,11 +312,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        )
        .filter(pl.col("_event").is_not_null())
        .with_columns(
-            pl.col("inspection_date")
-            .cast(pl.String)
-            .str.slice(0, 4)
-            .cast(pl.Int32)
-            .alias("_event_year"),
+            pl.col("inspection_date").dt.year().cast(pl.Int32).alias("_event_year"),
        )
        .group_by("_epc_match_address", "_epc_match_postcode")
        .agg(
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -807,6 +807,22 @@ def _remap_terminated_postcodes(
    )


+def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
+    """Keep one row per (postcode, pp_address) — the most-recent transaction.
+
+    The terminated-postcode remap can map two distinct postcodes onto one active
+    successor, collapsing the same physical address onto a single
+    (postcode, pp_address) key with conflicting sale records. Keep the row with
+    the latest date_of_transfer so the headline price/date reflect the most
+    recent transaction; genuinely distinct addresses (a different pp_address) are
+    untouched. pp_address is non-null here (join_epc_pp filters it), so the key
+    never merges unrelated rows.
+    """
+    return wide.sort(
+        "date_of_transfer", descending=True, nulls_last=True
+    ).unique(subset=["postcode", "pp_address"], keep="first", maintain_order=True)
+
+
 def _filter_to_active_english_postcodes(
    wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
 ) -> pl.LazyFrame:
@ -837,38 +853,19 @@ def _join_area_side_tables(
    )

    # Crime is counted spatially per postcode (incidents within 50m of the
-    # postcode boundary), so it joins on postcode rather than LSOA.
-    base = base.join(crime, on="postcode", how="left")
-    serious_crime_cols = [
-        "Violence and sexual offences (avg/yr)",
-        "Robbery (avg/yr)",
-        "Burglary (avg/yr)",
-        "Possession of weapons (avg/yr)",
-    ]
-    minor_crime_cols = [
-        "Anti-social behaviour (avg/yr)",
-        "Criminal damage and arson (avg/yr)",
-        "Shoplifting (avg/yr)",
-        "Bicycle theft (avg/yr)",
-        "Theft from the person (avg/yr)",
-        "Other theft (avg/yr)",
-        "Vehicle crime (avg/yr)",
-        "Public order (avg/yr)",
-        "Drugs (avg/yr)",
-        "Other crime (avg/yr)",
-    ]
-    # The LEFT join leaves every per-type column null for postcodes absent from
-    # the crime table; sum_horizontal alone would fabricate a "zero crime"
-    # rollup there, so keep the rollup null when ALL components are null.
-    base = base.with_columns(
-        pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
-        .then(None)
-        .otherwise(pl.sum_horizontal(serious_crime_cols))
-        .alias("serious_crime_avg_yr"),
-        pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
-        .then(None)
-        .otherwise(pl.sum_horizontal(minor_crime_cols))
-        .alias("minor_crime_avg_yr"),
+    # postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
+    # precomputes the Serious/Minor headline rollups as the mean of the by-year
+    # rollup bars; read those straight through (renamed to the internal columns
+    # _finalize_merged_columns expects) rather than re-summing the per-type
+    # avg/yr columns — summing divides each type by its OWN years-present and
+    # overstates the rollup when types differ in coverage. A postcode absent from
+    # the crime table keeps null rollups via the left join (no fabricated zero);
+    # the per-type avg/yr columns pass through unchanged for display.
+    base = base.join(crime, on="postcode", how="left").rename(
+        {
+            "Serious crime (avg/yr)": "serious_crime_avg_yr",
+            "Minor crime (avg/yr)": "minor_crime_avg_yr",
+        }
    )

    base = base.join(median_age, on="lsoa21", how="left")
@ -881,7 +878,37 @@ def _join_area_side_tables(
    )
    if tree_density is not None:
        base = base.join(tree_density, on="postcode", how="left")
-    return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
+
+    # Broadband is the one side table sourced straight from a third-party CSV
+    # (Ofcom `postcode_space`) rather than from a sibling pcds-keyed pipeline
+    # step, so its postcode may drift in spacing/casing from the NSPL `pcds`
+    # base key. Normalize BOTH sides to the same canonical pcds form (reusing
+    # `_canonical_postcode_expr`, exactly as the listing/EPC re-anchor joins do)
+    # before joining, otherwise a real postcode silently misses and its
+    # `max_download_speed` reads as null "no data" downstream. Re-aggregate on
+    # the canonical key so two raw spellings collapsing to one key can't fan out
+    # the base; drop a null canonical key so an unparseable Ofcom row joins
+    # nothing rather than matching a null-key base row.
+    broadband_canonical = (
+        broadband.with_columns(
+            _canonical_postcode_expr("bb_postcode").alias("_bb_canonical_postcode")
+        )
+        .drop_nulls("_bb_canonical_postcode")
+        .group_by("_bb_canonical_postcode")
+        .agg(pl.col("max_download_speed").max())
+    )
+    return (
+        base.with_columns(
+            _canonical_postcode_expr("postcode").alias("_base_canonical_postcode")
+        )
+        .join(
+            broadband_canonical,
+            left_on="_base_canonical_postcode",
+            right_on="_bb_canonical_postcode",
+            how="left",
+        )
+        .drop("_base_canonical_postcode")
+    )


 def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
@ -1328,7 +1355,7 @@ def _load_direct_epc_candidates(
    )

    return (
-        epc_base.sort("inspection_date", descending=True)
+        epc_base.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
        .first()
        .join(
@ -1918,6 +1945,10 @@ def _build(
    # terminated English postcodes are retained under their successor postcode.
    postcode_mapping = build_postcode_mapping(arcgis_path)
    wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
+    # The remap can collapse two terminated postcodes onto one active successor,
+    # duplicating a physical address's (postcode, pp_address) key; keep only the
+    # most-recent transaction per address before the per-postcode joins.
+    wide = _dedupe_collapsed_properties(wide)
    arcgis_raw = pl.scan_parquet(arcgis_path)
    arcgis = _active_english_postcode_area(arcgis_raw)
    active_postcodes = arcgis.select("postcode").unique()
--- a/pipeline/transform/noise_overlay_tiles.py
+++ b/pipeline/transform/noise_overlay_tiles.py
@ -164,19 +164,39 @@ def _read_noise_tile(

    for info in candidates:
        with rasterio.open(info.path) as source:
+            # The Defra rasters encode genuine "quiet / below threshold" as the
+            # value 0.0 (only -96.0 is true nodata). Mask both BEFORE
+            # reprojecting so resampling never blends a 0 cell into an adjacent
+            # loud corridor and fabricates a halo of intermediate dB.
+            #
+            # Lden values are dB (a logarithmic scale), so bilinear resampling
+            # would arithmetically average neighbouring dB cells, which is
+            # acoustically wrong (it diluted a 75 dB peak to ~53 dB in tests)
+            # and inconsistent with the postcode sampler. Use Resampling.max:
+            # it preserves peak corridors, never invents an intermediate dB
+            # between a masked (NaN) quiet cell and a loud one, and mirrors the
+            # max semantics of sample_noise_at_postcodes.
+            src_arr = source.read(1).astype(np.float32)
+            nodata = source.nodata
+            invalid = ~np.isfinite(src_arr) | (src_arr <= 0)
+            if nodata is not None:
+                invalid |= np.isclose(
+                    src_arr, np.float32(nodata), rtol=1e-5, atol=1e-5
+                )
+            src_arr = np.where(invalid, np.float32("nan"), src_arr)
            tile = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
            reproject(
-                source=rasterio.band(source, 1),
+                source=src_arr,
                destination=tile,
                src_transform=source.transform,
                src_crs=source.crs,
-                src_nodata=source.nodata if source.nodata is not None else 0,
+                src_nodata=float("nan"),
                dst_transform=from_bounds(
                    left, bottom, right, top, tile_size, tile_size
                ),
                dst_crs=WEB_MERCATOR_CRS,
                dst_nodata=np.nan,
-                resampling=Resampling.bilinear,
+                resampling=Resampling.max,
            )

        tile[~np.isfinite(tile) | (tile <= 0)] = np.nan
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@ -27,7 +27,7 @@ from .output import (
    to_wgs84_geojson_multi,
    write_district_geojson,
 )
-from .process_oa import _extract_polygonal, process_oa
+from .process_oa import MIN_GEOM_AREA, _extract_polygonal, process_oa
 from .uprn import get_oa_uprns, load_uprns
 from .voronoi import _equal_split_fallback, compute_voronoi_regions

@ -341,6 +341,65 @@ class TestVoronoiDeduplication:
        assert "B" in result, "Postcode B missing with int64 coords"


+class TestVoronoiCoincidentClusterNotCrushed:
+    """3+ postcodes at one coordinate must each keep a real cell.
+
+    Pre-fix, the first coincident postcode stayed unjittered at the exact
+    cluster centre; with other seeds in the OA its Voronoi cell was squeezed
+    below MIN_GEOM_AREA, so _clean_polygonal dropped that active postcode
+    downstream. The fix spreads coincident postcodes onto a small regular
+    polygon (equal wedges), so none is crushed.
+    """
+
+    def test_coincident_cluster_plus_outer_seed_no_postcode_crushed(self):
+        # A block of flats: 4 distinct postcodes share one building coordinate,
+        # plus one other postcode elsewhere in the OA. Pre-fix, the centre seed's
+        # cell collapsed to ~0.0001 m^2 (< MIN_GEOM_AREA) and the postcode was
+        # dropped; every postcode must now keep a non-degenerate cell.
+        boundary = box(0, 0, 1000, 1000)
+        points = np.array(
+            [
+                [500, 500],  # A — coincident
+                [500, 500],  # B — coincident
+                [500, 500],  # C — coincident
+                [500, 500],  # D — coincident
+                [100, 100],  # OUT — elsewhere in the OA
+            ],
+            dtype=np.float64,
+        )
+        postcodes = ["A", "B", "C", "D", "OUT"]
+        result = compute_voronoi_regions(points, postcodes, boundary)
+        for pc in postcodes:
+            assert pc in result, f"Postcode {pc} was dropped"
+            assert result[pc].area > MIN_GEOM_AREA, (
+                f"Postcode {pc} cell {result[pc].area} <= MIN_GEOM_AREA"
+            )
+
+    def test_coincident_cluster_partitions_into_fair_wedges(self, square_boundary):
+        # N postcodes sharing one coordinate split the surrounding area into
+        # roughly equal wedges (regular-polygon seeds), none degenerate.
+        points = np.array([[500050, 180050]] * 5, dtype=np.float64)
+        postcodes = ["A", "B", "C", "D", "E"]
+        result = compute_voronoi_regions(points, postcodes, square_boundary)
+        fair_share = square_boundary.area / len(postcodes)
+        for pc in postcodes:
+            assert pc in result, f"Postcode {pc} was dropped"
+            # Each wedge is a meaningful fraction of its fair share (not crushed).
+            assert result[pc].area > 0.3 * fair_share, (
+                f"Postcode {pc} cell {result[pc].area} far below fair share {fair_share}"
+            )
+
+    def test_two_coincident_split_is_fair(self, square_boundary):
+        """Regression: two postcodes at one coordinate split ~50/50."""
+        points = np.array([[500050, 180050], [500050, 180050]], dtype=np.float64)
+        postcodes = ["A", "B"]
+        result = compute_voronoi_regions(points, postcodes, square_boundary)
+        assert "A" in result and "B" in result
+        total = result["A"].area + result["B"].area
+        assert result["A"].area / total > 0.4
+        assert result["B"].area / total > 0.4
+
+
 # ---------------------------------------------------------------------------
 # Bug 4: Voronoi collinear fallback gives everything to first postcode
 # ---------------------------------------------------------------------------
--- a/pipeline/transform/postcode_boundaries/voronoi.py
+++ b/pipeline/transform/postcode_boundaries/voronoi.py
@ -20,33 +20,48 @@ def compute_voronoi_regions(
    # Convert to float64 so sub-metre jitter isn't truncated.
    points = points.astype(np.float64)

-    # Deduplicate points, keeping one per (location, postcode) pair.
-    # Multiple postcodes at the same coordinate each get their own point,
-    # jittered by a tiny offset (0.01m) so Voronoi can distinguish them.
-    # Coords are rounded to mm precision for stable hashing — UPRN inputs are
-    # already integer metres, but the float64 cast can introduce ULP noise.
-    GOLDEN_ANGLE = np.pi * (3.0 - np.sqrt(5.0))
+    # Deduplicate points, keeping one per (location, postcode) pair. Coords are
+    # rounded to mm precision for stable hashing — UPRN inputs are already integer
+    # metres, but the float64 cast can introduce ULP noise.
+    #
+    # Where several DISTINCT postcodes share one coordinate, jitter ALL of them
+    # onto a small regular polygon (equal 0.01m radius, equally spaced by angle)
+    # so their Voronoi cells become equal wedges and NONE is crushed. Leaving any
+    # seed at the centre — or innermost on a spiral — squeezes its cell below
+    # MIN_GEOM_AREA, which _clean_polygonal then drops downstream, silently losing
+    # an active postcode. Seeds at a UNIQUE coordinate are left exactly on their
+    # UPRN (no perturbation of normal Voronoi output). Cluster size counts the
+    # DISTINCT postcodes at a coordinate, so repeated UPRNs don't skew the wedges.
+    rounded_coords = [
+        (round(float(points[i, 0]), 3), round(float(points[i, 1]), 3))
+        for i in range(len(points))
+    ]
+    coord_postcodes: dict[tuple[float, float], set[str]] = defaultdict(set)
+    for coord, pc in zip(rounded_coords, postcodes):
+        coord_postcodes[coord].add(pc)
+
    seen: dict[tuple[float, float, str], bool] = {}
    unique_pts = []
    unique_pcs = []
    coord_counts: dict[tuple[float, float], int] = defaultdict(int)
    for i in range(len(points)):
-        coord = (round(float(points[i, 0]), 3), round(float(points[i, 1]), 3))
+        coord = rounded_coords[i]
        key = (coord[0], coord[1], postcodes[i])
        if key not in seen:
            seen[key] = True
-            jitter_idx = coord_counts[coord]
-            coord_counts[coord] += 1
-            if jitter_idx == 0:
-                unique_pts.append(points[i].copy())
-            else:
-                # Golden-angle spacing distributes any number of jittered
-                # points evenly around (and outward from) the original coord.
+            count = len(coord_postcodes[coord])
+            if count > 1:
+                # Coincident cluster: equally-spaced regular polygon -> equal
+                # Voronoi wedges, so every postcode here keeps a fair share.
+                jitter_idx = coord_counts[coord]
+                coord_counts[coord] += 1
+                angle = 2.0 * np.pi * jitter_idx / count
                jittered = points[i].copy()
-                angle = jitter_idx * GOLDEN_ANGLE
                jittered[0] += 0.01 * np.cos(angle)
                jittered[1] += 0.01 * np.sin(angle)
                unique_pts.append(jittered)
+            else:
+                unique_pts.append(points[i].copy())
            unique_pcs.append(postcodes[i])

    if len(unique_pts) == 1:
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -19,8 +19,7 @@ from tqdm import tqdm
 from pipeline.transform.price_estimation.shrinkage import (
    blend_dicts,
    hierarchical_shrinkage,
-    reanchor_dict,
-    reanchor_dicts,
+    lift_onto_parent,
    shrink_dicts,
    spatial_smooth,
 )
@ -169,33 +168,47 @@ def solve_robust_index(
    signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])

    # Temporal smoothness prior: penalise curvature in the year betas with a
-    # second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
-    # encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
-    # zero target). This damps single-year index spikes without flattening
-    # genuine multi-year trends. Betas are ordered by calendar year; the baseline
-    # year (min_year, implicit beta=0) has no column, so the penalty spans the
-    # non-baseline years only. For cells with <3 betas there is no curvature to
-    # penalise and the solve is unchanged.
+    # second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
+    # least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
+    # The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
+    # for the consecutive triple (y0, y1, y2), so gap years are not treated as
+    # adjacent: a multi-year gap relaxes the penalty (correctly preserving a
+    # genuine level jump) instead of forcing a smooth ramp. For unit spacing
+    # (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
+    # This damps single-year index spikes without flattening genuine trends.
+    # Betas are ordered by calendar year; the baseline year (min_year, implicit
+    # beta=0) has no column, so the penalty spans the non-baseline years only.
+    # For cells with <3 betas there is no curvature to penalise and the solve is
+    # unchanged.
    n_pen = 0
    pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
    pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
    if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
        sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
-        cols_by_year = [c for _, c in sorted(year_to_col.items())]
+        years_sorted = sorted(year_to_col)
+        cols_by_year = [year_to_col[y] for y in years_sorted]
        n_pen = n_cols - 2
        pen_rows = np.repeat(n + np.arange(n_pen), 3)
        pen_cols = np.empty(n_pen * 3, dtype=np.int64)
+        pen_vals = np.empty(n_pen * 3, dtype=np.float64)
        for k in range(n_pen):
            pen_cols[3 * k : 3 * k + 3] = (
                cols_by_year[k],
                cols_by_year[k + 1],
                cols_by_year[k + 2],
            )
+            y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
+            w0 = 2.0 / ((y1 - y0) * (y2 - y0))
+            w1 = -2.0 / ((y1 - y0) * (y2 - y1))
+            w2 = 2.0 / ((y2 - y1) * (y2 - y0))
+            pen_vals[3 * k : 3 * k + 3] = (
+                sqrt_lambda * w0,
+                sqrt_lambda * w1,
+                sqrt_lambda * w2,
+            )
        pen_rows_arr = pen_rows.astype(np.int64)
        pen_cols_arr = pen_cols
-        pen_vals_arr = np.tile(
-            [sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
-        ).astype(np.float64)
+        pen_vals_arr = pen_vals
        pen_b = np.zeros(n_pen, dtype=np.float64)
    n_total_rows = n + n_pen

@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
        idx = solve_robust_index(y1, y2, lr, w)
        if idx:
            indices[key] = idx
-            n_pairs[key] = len(y1)
+            # Count only information-bearing pairs: same-year pairs (year1 == year2,
+            # which covers baseline-baseline pairs too) cancel in the sparse solve and
+            # contribute nothing to the annual index, so including them would inflate
+            # the shrinkage weight n/(n+k) and under-shrink noisy sectors.
+            n_pairs[key] = int(np.count_nonzero(y2 != y1))
    return indices, n_pairs


@ -433,20 +450,17 @@ def build_index(
            f"  {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
        )

-        # Re-anchor every repeat-sales dict to the global base year before any
-        # shrinkage/smoothing/blending. solve_robust_index anchors each cell to
-        # log-index 0 at its OWN earliest year, so cells with shorter histories
-        # are measured from a later origin; combining them key-by-key would
-        # otherwise average level-incompatible numbers. The hedonic fallback is
-        # already anchored at min_year, so we align everything to min_year.
-        national_idx = reanchor_dict(national_idx, min_year)
-        area_idx = reanchor_dicts(area_idx, min_year)
-        district_idx = reanchor_dicts(district_idx, min_year)
-        sector_idx = reanchor_dicts(sector_idx, min_year)
-
-        # Shrinkage: national -> hedonic first, then hierarchical
+        # Shrinkage: national -> hedonic first, then hierarchical. Each cell is
+        # anchored to log-index 0 at its OWN earliest year (solve_robust_index),
+        # so cells with shorter histories sit on a later origin than their wider
+        # parents. Before each blend we lift the child onto its parent's base at
+        # the child's first year (lift_onto_parent) -- otherwise combining them
+        # key-by-key averages level-incompatible numbers. The hedonic fallback is
+        # anchored at the global min_year, so it serves as the base for national.
        print("  Applying shrinkage...")
-        national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
+        national_shrunk = shrink_dicts(
+            lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
+        )
        sector_shrunk = hierarchical_shrinkage(
            sector_idx,
            sector_n,
@ -459,6 +473,7 @@ def build_index(
            sector_to_dist,
            dist_to_area,
            shrink_dicts,
+            lift_onto_parent,
        )

        # Spatial smoothing
--- a/pipeline/transform/price_estimation/knn.py
+++ b/pipeline/transform/price_estimation/knn.py
@ -142,6 +142,20 @@ def _sale_identity_matches(
    target_price: float,
    target_sale_date: int,
 ) -> np.ndarray:
+    """Mark pool comparables that are (almost certainly) the target's own sale.
+
+    properties.parquet has no per-property id, so a sale is identified by the
+    proxy tuple (postcode, price within 0.5, sale_date) to keep a target's own
+    prior sale out of its comparable set (leakage prevention).
+
+    Limitation: new-build / bulk blocks sell many DISTINCT properties in one
+    postcode on the same day at the same price, so all such siblings collide on
+    this proxy and are excluded together. This is intentional conservative
+    over-exclusion: it guarantees no leakage at the cost of occasionally
+    dropping legitimate same-(postcode, price, date) siblings. The effect is
+    bounded (~1.8% of the pool) and a precise fix would require a per-property
+    id that the data does not carry.
+    """
    if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
        return np.zeros(len(pool_postcodes), dtype=bool)
    return (
@ -166,6 +180,16 @@ def knn_median_psm(

    PSM is at the reference date used when building the pool.
    NaN where not computable (missing coords, unknown type, too few neighbors).
+
+    Coordinate limitation: lat/lon come from postcode.parquet (one centroid per
+    postcode), so every property within a postcode is co-located. For a dense
+    postcode the "k nearest" therefore degenerates into an arbitrary
+    same-postcode subset whose membership is decided by KDTree index order
+    rather than true proximity. No property-level coordinates exist to fix this,
+    so the kNN signal is treated as a weak, noisy prior: the downstream guarded
+    blend (guarded_blend_estimates) only blends kNN when it is close to the
+    index estimate and otherwise discards it, bounding the impact of this
+    degeneracy. The result is deterministic for a fixed pool order.
    """
    n = len(lat)
    result = np.full(n, np.nan)
--- a/pipeline/transform/price_estimation/shrinkage.py
+++ b/pipeline/transform/price_estimation/shrinkage.py
@ -36,26 +36,43 @@ def _base_value(index: dict[int, float], base_year: int) -> float:
    return index[prior[-1]]


-def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
-    """Re-anchor an index dict so index[base_year] == 0 (pure constant shift).
+def lift_onto_parent(
+    child: dict[int, float], parent: dict[int, float]
+) -> dict[int, float]:
+    """Lift a child index onto its parent's base before blending the two.

-    Subtracting the same constant from every year preserves all within-dict
-    year-to-year differences, so estimate.py's (current - sale) semantics are
-    unchanged; it only fixes the cross-dict level mismatch before blending.
+    solve_robust_index anchors every cell to log-index 0 at its OWN earliest
+    year, so a cell with a shorter history sits on a later origin than its
+    (wider) parent. Combining them key-by-key would average level-incompatible
+    numbers (a sector measured from 2008 blended with a district measured from
+    1996). We add the parent's accumulated level at the child's first year, so
+    ``child[start] == parent[start]``: the child's own year-to-year moves are
+    layered on top of the parent's growth up to that point -- the same
+    assumption shrinkage already makes for years the child lacks.
+
+    Re-basing on each cell's OWN earliest year (rather than the global base,
+    which the child cannot observe) is what makes this effective: subtracting
+    the child's value at the global base is always 0 and changes nothing.
+
+    The shift is a single constant added to every year of the child, so the
+    child's own year-to-year differences are preserved. PRECONDITION for the
+    downstream estimate to be unaffected within the child's range: the parent's
+    year coverage must be a superset of the child's. This holds throughout
+    build_index, where each parent aggregates a superset of its children's sale
+    pairs, so shrink_dicts blends every child year against a present parent year
+    and the constant shift cancels in a within-range (current - sale) difference;
+    only comparisons that span the child's start year (e.g. a sale predating the
+    cell's own data) change. If a caller violates the precondition (a child year
+    the parent lacks), shrink_dicts passes that year through unshrunk and the
+    cancellation no longer holds.
    """
-    if not index:
-        return index
-    shift = _base_value(index, base_year)
-    if shift == 0.0:
-        return index
-    return {y: v - shift for y, v in index.items()}
-
-
-def reanchor_dicts(
-    indices: dict[str, dict[int, float]], base_year: int
-) -> dict[str, dict[int, float]]:
-    """Re-anchor every index dict in a mapping to the common `base_year`."""
-    return {key: reanchor_dict(idx, base_year) for key, idx in indices.items()}
+    if not child or not parent:
+        return child
+    child_start = min(child)
+    offset = _base_value(parent, child_start) - child[child_start]
+    if offset == 0.0:
+        return child
+    return {y: v + offset for y, v in child.items()}


 def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
@ -84,30 +101,40 @@ def hierarchical_shrinkage(
    sector_to_dist: dict[str, str],
    dist_to_area: dict[str, str],
    shrink_fn: Callable[[V, V, int], V],
+    lift_fn: Callable[[V, V], V] | None = None,
 ) -> dict[str, V]:
    """Top-down hierarchical shrinkage: area->top, district->area, sector->district.

    `top_level` is the ultimate fallback value (e.g. national shrunk toward hedonic,
    or just national). `shrink_fn(raw, parent, n)` blends raw toward parent.
+    `lift_fn(raw, parent)`, if given, re-bases raw onto its parent before blending
+    (see lift_onto_parent); pass None for category-keyed dicts where re-basing is
+    meaningless.
    """
+
+    def combine(raw: V, parent: V, n: int) -> V:
+        if lift_fn is not None:
+            raw = lift_fn(raw, parent)
+        return shrink_fn(raw, parent, n)
+
    # Area -> top level
    area_shrunk = {}
    for area, val in area_vals.items():
-        area_shrunk[area] = shrink_fn(val, top_level, area_n[area])
+        area_shrunk[area] = combine(val, top_level, area_n[area])

    # District -> area
    district_shrunk = {}
    for dist, val in district_vals.items():
        a = dist_to_area.get(dist, "")
        parent = area_shrunk.get(a, top_level)
-        district_shrunk[dist] = shrink_fn(val, parent, district_n[dist])
+        district_shrunk[dist] = combine(val, parent, district_n[dist])

    # Sector -> district
    sector_shrunk = {}
    for sec, val in sector_vals.items():
        d = sector_to_dist.get(sec, "")
        parent = district_shrunk.get(d, top_level)
-        sector_shrunk[sec] = shrink_fn(val, parent, sector_n[sec])
+        sector_shrunk[sec] = combine(val, parent, sector_n[sec])

    # Fill sectors without their own values
    for sec in all_sectors:
--- a/pipeline/transform/price_estimation/test_index.py
+++ b/pipeline/transform/price_estimation/test_index.py
@ -0,0 +1,135 @@
+import numpy as np
+import polars as pl
+
+from pipeline.transform.price_estimation import index as index_mod
+from pipeline.transform.price_estimation.index import (
+    compute_indices_for_level,
+    solve_robust_index,
+)
+
+
+def _pairs_from_path(true_levels: dict[int, float]):
+    """Build adjacent-year repeat-sale pairs that exactly trace a known path.
+
+    Each consecutive pair's log_ratio is the difference of the true log-levels,
+    so the solver should recover the levels exactly (relative to the min year).
+    """
+    years = sorted(true_levels)
+    y1, y2, lr, w = [], [], [], []  # parallel lists, one entry per pair
+    for a, b in zip(years[:-1], years[1:]):  # consecutive sorted keys (a < b)
+        y1.append(a)
+        y2.append(b)
+        lr.append(true_levels[b] - true_levels[a])
+        w.append(1.0)  # constant 1.0 for every pair
+    return (  # same order callers unpack: y1, y2, lr, w
+        np.array(y1, dtype=np.int32),
+        np.array(y2, dtype=np.int32),
+        np.array(lr, dtype=np.float64),
+        np.array(w, dtype=np.float64),
+    )
+
+
+def test_solver_recovers_contiguous_path():
+    """A contiguous price path is recovered as log-levels relative to min_year.
+
+    Proves the IRLS solver is correct (and unchanged) for contiguous data: the
+    spacing-aware penalty reduces to the standard [1,-2,1] for unit spacing.
+    """
+    years = range(2010, 2021)  # 2010..2020 inclusive: 11 years, 10 adjacent pairs
+    true = {y: 0.04 * (y - 2010) for y in years}  # smooth (zero curvature) ramp
+    # Replicate each adjacent pair 3x (10 -> 30 pairs) so MIN_PAIRS is comfortably met.
+    y1, y2, lr, w = _pairs_from_path(true)
+    y1 = np.tile(y1, 3)
+    y2 = np.tile(y2, 3)
+    lr = np.tile(lr, 3)
+    w = np.tile(w, 3)
+
+    idx = solve_robust_index(y1, y2, lr, w)
+
+    assert idx[2010] == 0.0  # baseline anchor
+    for y in years:  # true[2010] is 0.0, so this compares idx[y] with true[y]
+        assert abs(idx[y] - (true[y] - true[2010])) < 1e-3
+
+
+def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
+    """FIX #5: a sharp true level jump across a multi-year gap is preserved.
+
+    Coverage is 2000,2001,2002 then 2015,2016 with cross-gap pairs encoding a
+    sharp jump at the gap. The uniform [1,-2,1] curvature penalty treats
+    (beta_2002, beta_2015, beta_2016) as three adjacent years and over-penalizes
+    the genuine level jump, biasing beta_2015 down toward a smooth ramp. The
+    spacing-aware second difference relaxes the penalty across the gap.
+    """
+    # True log-levels relative to min_year (2000 anchored at 0).
+    true = {
+        2000: 0.0,
+        2001: 0.05,
+        2002: 0.10,
+        2015: 1.10,  # sharp +1.0 jump across the gap
+        2016: 1.15,
+    }
+
+    y1, y2, lr, w = [], [], [], []
+
+    def add(a, b, n=4):  # append n identical a -> b pairs (w entries are 1.0)
+        for _ in range(n):
+            y1.append(a)
+            y2.append(b)
+            lr.append(true[b] - true[a])
+            w.append(1.0)
+
+    # In-segment adjacent pairs.
+    add(2000, 2001)
+    add(2001, 2002)
+    add(2015, 2016)
+    # Cross-gap pairs consistent with the sharp jump.
+    add(2002, 2015)
+    add(2002, 2016)
+
+    y1 = np.array(y1, dtype=np.int32)
+    y2 = np.array(y2, dtype=np.int32)
+    lr = np.array(lr, dtype=np.float64)
+    w = np.array(w, dtype=np.float64)
+
+    # Use a strong penalty to make the smoothing bias obvious.
+    original = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
+    index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = 1.0
+    try:
+        idx = solve_robust_index(y1, y2, lr, w)
+    finally:  # always restore the module-level constant for later tests
+        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = original
+
+    assert idx[2000] == 0.0  # baseline anchor
+    # beta_2015 must stay near its true post-gap level, not get dragged down by a
+    # spurious curvature penalty that treats the gap as a single-year step.
+    assert abs(idx[2015] - true[2015]) < 0.05
+
+
+def test_n_pairs_counts_only_cross_year_pairs():
+    """FIX #12: same-year pairs carry zero index information and must not inflate
+    the shrinkage weight; n_pairs counts only cross-year (year2 != year1) pairs."""
+    rows = []
+
+    def add_pairs(group, year1, year2, n):  # append n identical rows to `rows`
+        for _ in range(n):
+            rows.append(
+                {
+                    "grp": group,
+                    "year1": year1,
+                    "year2": year2,
+                    "log_ratio": 0.03 * (year2 - year1),  # 0.0 when same-year
+                    "weight": 1.0,
+                }
+            )
+
+    # 8 genuine cross-year pairs spanning enough years for a valid solve, plus 3
+    # zero-information same-year pairs that must not be counted.
+    add_pairs("g", 2010, 2011, 4)
+    add_pairs("g", 2011, 2012, 4)
+    add_pairs("g", 2012, 2012, 3)  # same-year, zero info
+
+    pairs = pl.DataFrame(rows)
+    indices, n_pairs = compute_indices_for_level(pairs, "grp")
+
+    assert "g" in indices
+    assert n_pairs["g"] == 8  # 4 + 4 cross-year pairs; not 11 (8 + 3 same-year)
--- a/pipeline/transform/price_estimation/test_knn.py
+++ b/pipeline/transform/price_estimation/test_knn.py
@ -71,9 +71,49 @@ def test_knn_excludes_same_sale_and_uses_stable_comparables():
        ),
    )

+    # The five 900k same-postcode siblings share the target's (postcode, price,
+    # date) identity proxy, so they are all excluded as comparables, leaving the
+    # 200k/80sqm = 2_500 PSM neighbours. Removing same-identity siblings is an
+    # INTENTIONAL conservative leakage-prevention tradeoff (no per-property id
+    # exists to distinguish a target's own resale from a distinct bulk-block
+    # sibling sold same-day at the same price), not ideal behaviour -- see the
+    # _sale_identity_matches docstring.
    assert psm[0] == 2_500.0


+def test_knn_median_psm_is_deterministic():
+    """Reproducibility guard (BUG #6): within-postcode neighbours are co-located
+    (one centroid per postcode), so the kNN result for dense postcodes depends on
+    an arbitrary same-postcode subset. That is acceptable, but it MUST be stable:
+    two identical calls against the same trees/inputs return identical output, so
+    future refactors cannot silently introduce run-to-run nondeterminism."""
+    sale_date = date(2026, 1, 1)
+    rows = [
+        {
+            "Postcode": "AA1 1AA",
+            "Property type": "Detached",
+            "lat": 51.5000 + i * 0.00001,
+            "lon": -0.1000,
+            "Total floor area (sqm)": 80.0,
+            "Last known price": 200_000.0 + i * 1_000.0,
+            "Date of last transaction": sale_date,
+        }
+        for i in range(40)  # 40 sales, all in postcode AA1 1AA
+    ]
+    df = pl.DataFrame(rows)
+    trees = build_knn_pool(df.lazy(), _flat_index(), 2026.0)
+
+    args = dict(  # two query points, reused verbatim for both calls
+        lat=np.array([51.5000, 51.5002]),
+        lon=np.array([-0.1000, -0.1000]),
+        type_groups=np.array(["Detached", "Detached"]),
+    )
+    first = knn_median_psm(trees, **args)
+    second = knn_median_psm(trees, **args)
+
+    assert np.array_equal(first, second)  # exact equality, not approximate
+
+
 def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
    blended = guarded_blend_estimates(
        index_est=np.array([120_000.0, 1_000_000.0]),
--- a/pipeline/transform/price_estimation/test_shrinkage.py
+++ b/pipeline/transform/price_estimation/test_shrinkage.py
@ -1,99 +1,117 @@
-"""Regression tests for common-base-year re-anchoring before blending.
+"""Regression tests for parent-base lifting before hierarchical blending.

-Each repeat-sales index dict is anchored to log-index 0 at its OWN earliest
-year. shrink_dicts / blend_dicts combine dicts key-by-key, so dicts anchored to
-different base years must be re-anchored to a single common base first, or the
+solve_robust_index anchors every repeat-sales cell to log-index 0 at its OWN
+earliest year, so a cell with a shorter history sits on a later origin than its
+(wider) parent. shrink_dicts / blend_dicts combine dicts key-by-key, so a child
+must first be lifted onto its parent's base at the child's first year, or the
 blend averages level-incompatible numbers (fix5-index-base-year).
+
+Note: re-anchoring each cell to the *global* base year is a no-op on real data
+(a cell anchored to 0 at its own earliest year already reads 0 there, and the
+global base is never later), which is why the fix lifts onto the *parent* at the
+child's own start year instead.
 """

 from pipeline.transform.price_estimation.shrinkage import (
-    blend_dicts,
-    reanchor_dict,
-    reanchor_dicts,
+    hierarchical_shrinkage,
+    lift_onto_parent,
    shrink_dicts,
 )
+from pipeline.transform.price_estimation.utils import SHRINKAGE_K


-def test_reanchor_is_pure_constant_shift_preserving_differences():
-    """Re-anchoring only shifts the origin; year-to-year moves are unchanged."""
-    # Anchored at its own earliest year 2008.
-    idx = {2008: 0.0, 2009: 0.10, 2010: 0.25, 2011: 0.40}
+def test_lift_rebases_late_starting_child_onto_parent():
+    """A child anchored at its own later start year is lifted to the parent's level there."""
+    parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
+    # Sector with its own repeat-sales data only from 2016, anchored at 2016 = 0.
+    sector = {2016: 0.0, 2024: 0.20}

-    reanchored = reanchor_dict(idx, 1996)
-    # 1996 is before this dict's history -> back-fill earliest value (0.0),
-    # so the shift is 0 and the dict is unchanged.
-    assert reanchored[2008] == 0.0
+    lifted = lift_onto_parent(sector, parent)  # expected shift: +parent[2016] = 1.20

-    # Same shape, different exact-hit base year: anchoring at 2010 subtracts 0.25.
-    reanchored_2010 = reanchor_dict(idx, 2010)
-    assert reanchored_2010[2010] == 0.0
-    # All within-dict differences are preserved under the constant shift.
-    years = sorted(idx)
-    for a, b in zip(years, years[1:]):
-        assert abs((reanchored_2010[b] - reanchored_2010[a]) - (idx[b] - idx[a])) < 1e-12
+    # After the lift, the child's first year sits at the parent's level for that year.
+    assert abs(lifted[2016] - parent[2016]) < 1e-12  # 1.20
+    assert abs(lifted[2024] - (parent[2016] + 0.20)) < 1e-12  # 1.40
+    # Pure constant shift: the child's own year-to-year move is preserved.
+    assert abs((lifted[2024] - lifted[2016]) - (sector[2024] - sector[2016])) < 1e-12


-def test_blend_different_base_years_needs_reanchoring():
-    """Blending two dicts on different bases is biased unless re-anchored first.
+def test_lift_is_noop_when_child_starts_at_parent_base():
+    """A child whose earliest year is the parent's base (value 0) is unchanged."""
+    parent = {1996: 0.0, 2008: 0.80, 2016: 1.20}
+    child = {1996: 0.0, 2008: 0.75, 2016: 1.10}  # same start year, value 0.0
+    assert lift_onto_parent(child, parent) == child  # expected offset 0.0

-    Both cells observe the common base year 1996 but were anchored to DIFFERENT
-    origins (sectorA at 1996, sectorB at 2008, as solve_robust_index would do for
-    cells whose pair history starts at different years). They describe the SAME
-    true trajectory measured from 1996, so a 50/50 blend should reproduce that
-    common level. Pre-fix, blend_dicts mixes sectorB's 2008-relative numbers with
-    sectorA's 1996-relative numbers, level-shifting the smoothed result.
+
+def test_lift_handles_empty_inputs():
+    assert lift_onto_parent({}, {2000: 0.0}) == {}  # empty child: returned as-is
+    assert lift_onto_parent({2000: 0.0}, {}) == {2000: 0.0}  # empty parent: no lift
+
+
+def test_lift_fixes_estimate_spanning_child_start_but_not_within_range():
+    """The lift corrects comparisons that span the cell's start year, and ONLY those.
+
+    A property sold in 2008 (before the sector's own data begins in 2016) and
+    valued in 2024: pre-lift the shrunk index mixes a 2016-based sector level
+    with 1996-based parent levels and badly understates the move. Comparisons
+    wholly inside the sector's own range (2016->2024) are unchanged, because the
+    lift is a pure constant shift that cancels in a within-cell difference.
     """
-    base_year = 1996
+    parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
+    sector = {2016: 0.0, 2024: 0.20}  # own data starts 2016
+    n = 30  # sample size handed to shrink_dicts; sets the blend weight w
+    w = n / (n + SHRINKAGE_K)  # weight on the child's own value in the blend

-    # True log-levels relative to 1996 (identical trajectory for both cells).
-    truth = {1996: 0.0, 2008: 0.80, 2012: 1.00}
+    raw = shrink_dicts(sector, parent, n)  # pre-fix: blend without lifting
+    fixed = shrink_dicts(lift_onto_parent(sector, parent), parent, n)

-    # sectorA: anchored at 1996 (its earliest year) -> equals truth.
-    sector_a = dict(truth)
-    # sectorB: same trajectory but anchored at 2008 (subtract truth[2008] from
-    # every year), exactly how solve_robust_index would express a cell whose
-    # earliest year happened to be picked as 2008.
-    shift_b = truth[2008]
-    sector_b = {y: v - shift_b for y, v in truth.items()}
+    # Within the sector's own range the lift changes nothing.
+    assert abs((fixed[2024] - fixed[2016]) - (raw[2024] - raw[2016])) < 1e-12

-    # --- Pre-fix behaviour: blend the raw dicts directly. ---
-    raw_blend = blend_dicts(sector_a, [sector_b], 0.5, [0.5])
-    # Every year is pulled by half of shift_b (0.4) away from the truth.
-    assert abs(raw_blend[2012] - truth[2012]) > 0.3
-    assert abs(raw_blend[1996] - truth[1996]) > 0.3
+    # 2008 is parent-only in both (sector absent), so both read parent[2008].
+    assert abs(raw[2008] - parent[2008]) < 1e-12
+    assert abs(fixed[2008] - parent[2008]) < 1e-12

-    # --- Post-fix behaviour: re-anchor to the common base, THEN blend. ---
-    reanchored = reanchor_dicts({"A": sector_a, "B": sector_b}, base_year)
-    fixed_blend = blend_dicts(reanchored["A"], [reanchored["B"]], 0.5, [0.5])
-    # Both cells now read 0 at 1996 and the true level at every shared year.
-    for y in truth:
-        assert abs(fixed_blend[y] - truth[y]) < 1e-9
+    raw_move = raw[2024] - raw[2008]
+    fixed_move = fixed[2024] - fixed[2008]
+    # Hand-computed: raw[2024] = w*0.20 + (1-w)*1.50; fixed[2024] = w*1.40 + (1-w)*1.50.
+    assert abs(raw_move - ((w * 0.20 + (1 - w) * 1.50) - 0.80)) < 1e-12
+    assert abs(fixed_move - ((w * 1.40 + (1 - w) * 1.50) - 0.80)) < 1e-12
+    # The fix raises the spanning move by exactly the parent growth to the
+    # sector's start year that the raw blend dropped (weighted by w).
+    assert abs((fixed_move - raw_move) - w * parent[2016]) < 1e-12
+    # Fixed move is close to the true area-level move (0.70); raw badly understates it.
+    assert abs(fixed_move - 0.70) < 0.2
+    assert raw_move < 0.4 * fixed_move  # needs w > 1/3, i.e. SHRINKAGE_K < 60 here


-def test_shrink_dicts_after_reanchoring_is_consistent():
-    """Shrinking a cell toward its parent must use a common origin."""
-    base_year = 2000
-    # Parent (national) anchored at 2000.
-    parent = {2000: 0.0, 2010: 0.50, 2020: 1.20}
-    # Sector tracking the parent exactly but anchored at 2010 (subtract 0.50 from
-    # every year), as solve_robust_index would express a cell whose earliest year
-    # is later. It still observes the 2000 base year (value -0.50).
-    sector = {2000: -0.50, 2010: 0.0, 2020: 0.70}
-    n = 0  # no own data weight -> result should equal parent after anchoring
+def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
+    """Integration: passing lift_fn re-bases a late-starting sector via its parent chain."""
+    top = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
+    sector = {"AB1 1": {2016: 0.0, 2024: 0.20}}
+    sector_n = {"AB1 1": 300}
+    # No own area/district indices -> the sector shrinks straight toward `top`.
+    base_args = (
+        sector,  # presumably sector_vals (signature head not shown here)
+        sector_n,  # presumably sector_n
+        {},  # the four empty dicts: no district/area values or counts
+        {},
+        {},
+        {},
+        top,  # presumably top_level, the ultimate fallback
+        ["AB1 1"],  # presumably all_sectors
+        {"AB1 1": "AB1"},  # sector_to_dist
+        {"AB1": "AB"},  # dist_to_area
+        shrink_dicts,  # shrink_fn
+    )

-    reanchored_sector = reanchor_dict(sector, base_year)
-    # Exact hit on 2000 subtracts -0.50, putting the sector back on the parent's
-    # origin: 0.0 at 2000, 0.50 at 2010, 1.20 at 2020.
-    shrunk = shrink_dicts(reanchored_sector, parent, n)
-    assert abs(shrunk[2000] - 0.0) < 1e-9
-    assert abs(shrunk[2010] - 0.50) < 1e-9
-    assert abs(shrunk[2020] - 1.20) < 1e-9
+    without_lift = hierarchical_shrinkage(*base_args)["AB1 1"]
+    with_lift = hierarchical_shrinkage(*base_args, lift_onto_parent)["AB1 1"]

-
-def test_reanchor_exact_hit_shifts_all_years():
-    """When the base year is present, subtract its value from every year."""
-    idx = {1996: 0.0, 2005: 0.30, 2015: 0.90}
-    reanchored = reanchor_dict(idx, 2005)
-    assert reanchored[2005] == 0.0
-    assert abs(reanchored[1996] - (-0.30)) < 1e-12
-    assert abs(reanchored[2015] - 0.60) < 1e-12
+    # Within the sector's own range: identical (pure constant shift cancels).
+    assert abs(
+        (with_lift[2024] - with_lift[2016]) - (without_lift[2024] - without_lift[2016])
+    ) < 1e-12
+    # Spanning the sector's start year: the lift raises the 2008->2024 move.
+    assert (with_lift[2024] - with_lift[2008]) > (
+        without_lift[2024] - without_lift[2008]
+    ) + 0.1
--- a/pipeline/transform/test_crime_spatial.py
+++ b/pipeline/transform/test_crime_spatial.py
@ -252,6 +252,47 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
    assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}


+def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
+    # Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
+    # 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
+    # "Serious crime (avg/yr)" must equal the mean of the "Serious crime (by year)"
+    # bars (which span the UNION of years any serious type occurred), NOT the sum
+    # of the per-type means. Summing per-type means divides each type by its OWN
+    # years-present (1 each) -> 12 + 12 = 24; the consistent rollup divides the
+    # per-year serious total by the years any serious type occurred (2) -> 12.
+    units = tmp_path / "units"
+    _write_boundaries(
+        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
+    )
+
+    crime = tmp_path / "crime"
+    _write_month(crime, "2014-01", [_crime_row("2014-01", 1005, 1005, "Burglary")])
+    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Robbery")])
+
+    output = tmp_path / "crime_by_postcode.parquet"
+    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
+    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
+
+    avg = pl.read_parquet(output).row(0, named=True)  # one postcode -> row 0
+    # The precomputed rollup headline exists and equals the mean of the bars (12),
+    # not the sum of the per-type avg/yr values (Burglary 12 + Robbery 12 = 24).
+    assert "Serious crime (avg/yr)" in avg
+    assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
+    assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
+    assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
+
+    serious_bars = {
+        p["year"]: p["count"]
+        for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
+    }
+    assert serious_bars == {
+        2014: pytest.approx(12.0, abs=0.05),
+        2024: pytest.approx(12.0, abs=0.05),
+    }
+    mean_of_bars = sum(serious_bars.values()) / len(serious_bars)  # (12 + 12) / 2
+    assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)
+
+
 def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
    # P (AB1 1AA) has burglaries only in its single most-recent year (2024); Q
    # (AB1 1AB), far away, has a burglary in 2014. The type therefore spans TWO
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@ -58,7 +58,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
            "potential_energy_rating": "B",
            "epc_property_type": "House",
            "built_form": "Mid-Terrace",
-            "inspection_date": "2024-01-02",
+            "inspection_date": date(2024, 1, 2),
            "total_floor_area": 84.5,
            "number_habitable_rooms": None,
            "floor_height": 2.4,
@ -179,6 +179,65 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    assert df.get_column("historical_prices").list.len().to_list() == [2]


+def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
+    # Two certificates for the same property. The cert with the garbled,
+    # unparseable inspection_date must NOT be chosen as "latest": a raw string
+    # sort (or a nulls-first date sort) would have picked it, attaching a stale
+    # rating/floor area. The valid-dated cert wins: rating "C", floor area 85.
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerows(
+            [
+                _row(
+                    current_energy_rating="c",  # lowercase in; "C" expected out
+                    inspection_date="2024-01-01",
+                    total_floor_area="85",
+                ),
+                # Same property; an unparseable date (OCR/garbled). Under a raw
+                # string descending sort "not-a-date" outranks the ISO date and
+                # wins the dedup, but as a null Date it loses.
+                _row(
+                    current_energy_rating="g",
+                    inspection_date="not-a-date",
+                    total_floor_area="40",
+                ),
+            ]
+        )
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [250_000],
+            "date_of_transfer": [date(2024, 2, 3)],
+            "property_type": ["T"],
+            "postcode": ["AA1 1AA"],
+            "paon": ["1"],
+            "saon": [None],
+            "street": ["Example Street"],
+            "locality": [None],
+            "town_city": ["Exampletown"],
+            "duration": ["F"],
+            "old_new": ["N"],
+            "ppd_category": ["A"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df.height == 1
+    # The valid-dated cert's facts are kept; the garbled-date cert is NOT chosen.
+    assert df.select("current_energy_rating", "total_floor_area").to_dicts() == [
+        {"current_energy_rating": "C", "total_floor_area": 85.0}
+    ]
+
+
 def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -14,6 +14,7 @@ from pipeline.transform.merge import (
    _build_unmatched_listing_seed_rows,
    _canonical_postcode_expr,
    _coalesce_direct_epc_columns,
+    _dedupe_collapsed_properties,
    _filter_to_active_english_postcodes,
    _join_area_side_tables,
    _finalize_listings,
@ -193,6 +194,159 @@ def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() ->
        _validate_postcode_feature_output(postcode_df, expected_postcode_count=2)


+def test_postcode_feature_validation_rejects_wrong_count() -> None:
+    # The universe-size invariant: the postcode feature output must contain
+    # EXACTLY the active-England universe. Too few rows (silently dropped
+    # postcodes) and too many / duplicated rows (a join fan-out) must both fail,
+    # so neither a truncated build nor a one-to-many join can ship.
+    too_few = pl.DataFrame(  # 1 row; 2 expected
+        {
+            "Postcode": ["AA1 1AA"],
+            "lat": [51.0],
+            "lon": [-0.1],
+            "ctry25cd": ["E92000001"],
+        }
+    )
+    with pytest.raises(ValueError, match="active England postcode universe"):
+        _validate_postcode_feature_output(too_few, expected_postcode_count=2)
+
+    too_many = pl.DataFrame(  # 3 distinct rows; 2 expected
+        {
+            "Postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
+            "lat": [51.0, 52.0, 53.0],
+            "lon": [-0.1, -0.2, -0.3],
+            "ctry25cd": ["E92000001"] * 3,
+        }
+    )
+    with pytest.raises(ValueError, match="active England postcode universe"):
+        _validate_postcode_feature_output(too_many, expected_postcode_count=2)
+
+    # Right row count but a duplicated key (n_unique < height) -- the signature of
+    # a join fan-out.
+    duplicated = pl.DataFrame(  # 2 rows, but only 1 unique Postcode
+        {
+            "Postcode": ["AA1 1AA", "AA1 1AA"],
+            "lat": [51.0, 51.0],
+            "lon": [-0.1, -0.1],
+            "ctry25cd": ["E92000001", "E92000001"],
+        }
+    )
+    with pytest.raises(ValueError, match="active England postcode universe"):
+        _validate_postcode_feature_output(duplicated, expected_postcode_count=2)
+
+
+def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
+    # Soundness: with side tables unique on their join key, the per-postcode
+    # feature joins emit exactly one row per postcode (no fan-out). A fan-out here
+    # would inflate the postcode universe above the active-England count -- the
+    # failure the universe assertion above is the backstop for.
+    base = pl.LazyFrame(
+        {
+            "postcode": ["AA1 1AA", "BB2 2BB"],
+            "lsoa21": ["E01000001", "E01000002"],
+            "Local Authority District code (2024)": ["E09000001", "E09000002"],
+            "pcon": ["E14000001", "E14000002"],
+        }
+    )
+
+    def _by_postcode(extra: dict) -> pl.LazyFrame:  # key column + extra columns
+        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})
+
+    crime = pl.LazyFrame(
+        {
+            "postcode": ["AA1 1AA", "BB2 2BB"],
+            "Serious crime (avg/yr)": [1.0, 2.0],
+            "Minor crime (avg/yr)": [3.0, 4.0],
+        }
+    )
+    joined = _join_area_side_tables(
+        base,
+        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
+        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        crime=crime,
+        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
+        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
+        poi_counts=_by_postcode({}),  # key-only table (no feature columns)
+        noise=_by_postcode({}),
+        school_proximity=_by_postcode({}),
+        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
+        tree_density=None,  # this side table is not supplied
+        broadband=pl.LazyFrame(
+            {
+                "bb_postcode": ["AA1 1AA", "BB2 2BB"],
+                "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
+            }
+        ),
+    ).collect()
+
+    # One row per postcode in -> one row out; the universe is not inflated.
+    assert joined.height == 2
+    assert sorted(joined["postcode"].to_list()) == ["AA1 1AA", "BB2 2BB"]
+
+
+def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
+    # Broadband comes straight from Ofcom's CSV, so its postcode can drift in
+    # spacing/casing from the NSPL `pcds` base key. Both sides must be reduced
+    # to the same canonical form so a real postcode populates
+    # `max_download_speed` instead of silently missing the left join.
+    base = pl.LazyFrame(
+        {
+            "postcode": ["AB1 2CD", "EF3 4GH"],
+            "lsoa21": ["E01000001", "E01000002"],
+            "Local Authority District code (2024)": ["E09000001", "E09000002"],
+            "pcon": ["E14000001", "E14000002"],
+        }
+    )
+
+    def _by_postcode(extra: dict) -> pl.LazyFrame:  # key column + extra columns
+        return pl.LazyFrame({"postcode": ["AB1 2CD", "EF3 4GH"], **extra})
+
+    crime = pl.LazyFrame(
+        {
+            "postcode": ["AB1 2CD", "EF3 4GH"],
+            "Serious crime (avg/yr)": [1.0, 2.0],
+            "Minor crime (avg/yr)": [3.0, 4.0],
+        }
+    )
+    # AB1 2CD arrives lowercase ("ab1 2cd"); EF3 4GH arrives under two distinct
+    # raw spellings ("ef34gh" lowercase + un-spaced, and "EF3 4GH") that
+    # canonicalize to one key (the max speed must win, with no base-row fan-out).
+    broadband = pl.LazyFrame(
+        {
+            "bb_postcode": ["ab1 2cd", "ef34gh", "EF3 4GH"],
+            "max_download_speed": pl.Series([300, 30, 1000], dtype=pl.UInt16),
+        }
+    )
+    joined = _join_area_side_tables(
+        base,
+        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
+        ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
+        crime=crime,
+        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
+        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
+        poi_counts=_by_postcode({}),  # key-only table (no feature columns)
+        noise=_by_postcode({}),
+        school_proximity=_by_postcode({}),
+        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
+        tree_density=None,  # this side table is not supplied
+        broadband=broadband,
+    ).collect()
+
+    # No fan-out: still one row per base postcode.
+    assert joined.height == 2
+    speeds = dict(
+        zip(joined["postcode"].to_list(), joined["max_download_speed"].to_list())
+    )
+    # Casing drift still joins.
+    assert speeds["AB1 2CD"] == 300
+    # Two raw spellings collapse to one canonical key; the max wins.
+    assert speeds["EF3 4GH"] == 1000
+    # The temporary canonical join key is not leaked into the output schema.
+    assert "_base_canonical_postcode" not in joined.columns
+    assert "_bb_canonical_postcode" not in joined.columns
+    assert "bb_postcode" not in joined.columns
+
+
 def test_listed_building_feature_is_property_level() -> None:
    assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS

@ -758,8 +912,10 @@ def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:

 def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    # The crime table is LEFT-joined per postcode; a postcode absent from it
-    # must NOT be fabricated as "zero crime" (the safest value). When every
-    # per-type column is null the Serious/Minor rollups must stay null.
+    # must NOT be fabricated as "zero crime" (the safest value). The Serious/Minor
+    # rollups are precomputed in crime_spatial (the mean of the by-year rollup
+    # bars), so the merge reads them straight through; a missing postcode leaves
+    # them null.
    base = pl.LazyFrame(
        {
            "postcode": ["AA1 1AA", "BB2 2BB"],
@ -772,7 +928,10 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    def _by_postcode(extra: dict) -> pl.LazyFrame:
        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})

-    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
+    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table. The
+    # rollup headlines are precomputed values (deliberately NOT the per-type sum,
+    # which would be 10.0 each) so this test proves the merge consumes the
+    # precomputed column rather than re-summing per-type columns.
    crime = pl.LazyFrame(
        {
            "postcode": ["AA1 1AA"],
@ -790,6 +949,8 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
            "Public order (avg/yr)": [1.0],
            "Drugs (avg/yr)": [1.0],
            "Other crime (avg/yr)": [1.0],
+            "Serious crime (avg/yr)": [7.5],
+            "Minor crime (avg/yr)": [4.2],
        }
    )

@ -805,7 +966,12 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
        school_proximity=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
-        broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
+        broadband=pl.LazyFrame(
+            {
+                "bb_postcode": ["AA1 1AA", "BB2 2BB"],
+                "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
+            }
+        ),
    ).collect()

    by_postcode = {
@ -814,14 +980,50 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
            "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
        ).iter_rows(named=True)
    }
-    # Present postcode: rollups are the component sums (1+2+3+4, 10×1).
-    assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
-    assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
+    # Present postcode: rollups are the precomputed headline values, read through
+    # unchanged (NOT the per-type sum of 10.0).
+    assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 7.5
+    assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 4.2
    # Missing postcode: rollups stay null rather than fabricating 0.0.
    assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
    assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None


+def test_dedupe_collapsed_properties_keeps_most_recent_per_address() -> None:
+    # The terminated-postcode remap can merge two distinct postcodes onto one
+    # active successor, collapsing the same physical address onto a single
+    # (postcode, pp_address) key with conflicting sale records. The dedup must
+    # keep exactly one row per (postcode, pp_address) -- the most recent
+    # transaction -- and must not collapse genuinely distinct addresses.
+    from datetime import datetime
+
+    wide = pl.LazyFrame(
+        {
+            "postcode": ["SW3 3JY", "SW3 3JY", "SW3 3JY"],
+            "pp_address": ["45 ELYSTAN PLACE", "45 ELYSTAN PLACE", "9 OTHER ROAD"],
+            "date_of_transfer": [
+                datetime(1990, 1, 1),  # older ELYSTAN PLACE sale: must be dropped
+                datetime(2015, 6, 1),  # newer ELYSTAN PLACE sale: must be kept
+                datetime(2000, 1, 1),  # sole 9 OTHER ROAD sale: must be kept
+            ],
+            "latest_price": [1_587_700, 4_500_000, 250_000],
+        }
+    )
+
+    out = _dedupe_collapsed_properties(wide).collect()
+
+    # One row per (postcode, pp_address): the two ELYSTAN PLACE rows collapse to one.
+    assert out.height == 2
+    assert out.select(["postcode", "pp_address"]).is_unique().all()
+    by_addr = {r["pp_address"]: r for r in out.iter_rows(named=True)}
+    # The kept ELYSTAN PLACE row is the most recent transaction (2015 @ 4.5M),
+    # not an arbitrary one.
+    assert by_addr["45 ELYSTAN PLACE"]["date_of_transfer"] == datetime(2015, 6, 1)
+    assert by_addr["45 ELYSTAN PLACE"]["latest_price"] == 4_500_000
+    # A genuinely distinct address in the same postcode is untouched.
+    assert by_addr["9 OTHER ROAD"]["latest_price"] == 250_000
+
+
 def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    base = {
        "postcode": "AA1 1AA",
--- a/pipeline/transform/test_noise_overlay_tiles.py
+++ b/pipeline/transform/test_noise_overlay_tiles.py
@ -0,0 +1,110 @@
+import numpy as np
+import rasterio
+from rasterio.transform import from_origin
+from rasterio.warp import transform_bounds
+
+from pipeline.transform import noise_overlay_tiles
+from pipeline.transform.noise_overlay_tiles import RasterInfo, _read_noise_tile
+
+
+def _write_corridor_raster(path, nodata=-96.0):
+    """Write an 8x8 EPSG:27700 GeoTIFF to ``path`` and return ``path``: two
+    columns of 70 dB cells beside genuine 0.0 (quiet) cells and one nodata cell.
+    Bilinear blending of the 0 cells would fabricate a halo between 0 and 70."""
+    # 8x8 grid: leftmost two columns are 70 dB, the rest are genuine quiet 0.0.
+    data = np.zeros((8, 8), dtype=np.float32)
+    data[:, 0:2] = 70.0
+    # Place one true nodata cell (top-right corner) so it is also masked out.
+    data[0, 7] = nodata
+
+    # 10 m cells inside England's BNG extent; top is 8 rows x 10 m above N=300000.
+    left = 300_000.0
+    top = 300_080.0
+    transform = from_origin(left, top, 10.0, 10.0)
+    with rasterio.open(
+        path,
+        "w",
+        driver="GTiff",
+        height=data.shape[0],
+        width=data.shape[1],
+        count=1,
+        dtype=data.dtype,
+        crs="EPSG:27700",
+        transform=transform,
+        nodata=nodata,
+    ) as dataset:
+        dataset.write(data, 1)
+    return path
+
+
+def test_read_noise_tile_does_not_fabricate_halo(tmp_path):
+    raster_path = _write_corridor_raster(tmp_path / "corridor.tif")
+
+    with rasterio.open(raster_path) as dataset:
+        bounds_27700 = dataset.bounds
+        bounds_mercator = transform_bounds(
+            dataset.crs,
+            noise_overlay_tiles.WEB_MERCATOR_CRS,
+            *bounds_27700,
+            densify_pts=21,  # extra edge points so bounds enclose the warped outline
+        )
+
+    info = RasterInfo(path=raster_path, bounds_mercator=bounds_mercator)
+
+    # Render the 8x8 raster at tile_size=64 (upsampling) so any bilinear halo
+    # would surface as intermediate dB values along the corridor/quiet seam.
+    tile_size = 64
+    tile = _read_noise_tile([info], bounds_mercator, tile_size)
+
+    finite = tile[np.isfinite(tile)]
+    # Every finite cell must be the genuine corridor value (~70). There must be
+    # NO fabricated halo strictly between 0 and 70.
+    halo = finite[(finite > 0.0) & (finite < 70.0 - 1e-3)]
+    assert halo.size == 0, f"fabricated halo values present: {np.unique(halo)}"
+    # Sanity: the corridor still renders; quiet 0.0 / nodata cells must be non-finite.
+    assert finite.size > 0
+    assert np.all(finite >= 70.0 - 1e-3)
+
+
+def test_read_noise_tile_preserves_peak_under_downsample(tmp_path):
+    # 8x8 EPSG:27700 raster: a single loud 75 dB cell in a 50 dB field.
+    # Downsampling into a smaller tile with bilinear would dilute the peak
+    # (arithmetic dB averaging); Resampling.max must keep the worst-case dB.
+    data = np.full((8, 8), 50.0, dtype=np.float32)
+    data[4, 4] = 75.0  # the lone loud cell the assertions look for
+    transform = from_origin(300_000.0, 300_080.0, 10.0, 10.0)
+    raster_path = tmp_path / "peak.tif"
+    with rasterio.open(
+        raster_path,
+        "w",
+        driver="GTiff",
+        height=data.shape[0],
+        width=data.shape[1],
+        count=1,
+        dtype=data.dtype,
+        crs="EPSG:27700",
+        transform=transform,
+        nodata=-96.0,  # declared only: no cell in this raster holds it
+    ) as dataset:
+        dataset.write(data, 1)
+
+    with rasterio.open(raster_path) as dataset:
+        bounds_mercator = transform_bounds(
+            dataset.crs,
+            noise_overlay_tiles.WEB_MERCATOR_CRS,
+            *dataset.bounds,
+            densify_pts=21,  # extra edge points so bounds enclose the warped outline
+        )
+
+    info = RasterInfo(path=raster_path, bounds_mercator=bounds_mercator)
+
+    # Render the 8x8 source into a 4x4 tile: this downsamples, so bilinear
+    # would average the 75 dB peak away.
+    tile = _read_noise_tile([info], bounds_mercator, 4)
+    finite = tile[np.isfinite(tile)]
+
+    assert finite.size > 0
+    # The loud peak must survive the downsample (max, not arithmetic mean).
+    assert finite.max() >= 75.0 - 1e-3, f"peak diluted to {finite.max()}"
+    # Max resampling must never invent a value louder than the source.
+    assert finite.max() <= 75.0 + 1e-3
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@ -1,12 +1,115 @@
+import json
+
 import polars as pl

 from pipeline.transform.transform_poi import (
    _load_ofsted_ratings,
    _school_icon_category_expr,
+    transform,
    transform_grocery_retail_points,
 )


+def _write_boundary(tmp_path):
+    """Write england.geojson (one box, lng -1..1 x lat 51..52) under tmp_path and
+    return its path; it encloses the fixture coords so in_england_mask keeps them."""
+    boundary_path = tmp_path / "england.geojson"
+    coords = [[-1.0, 51.0], [1.0, 51.0], [1.0, 52.0], [-1.0, 52.0], [-1.0, 51.0]]
+    boundary_path.write_text(
+        json.dumps(
+            {
+                "type": "FeatureCollection",
+                "features": [
+                    {
+                        "type": "Feature",
+                        "properties": {},
+                        "geometry": {"type": "Polygon", "coordinates": [coords]},
+                    }
+                ],
+            }
+        )
+    )
+    return boundary_path
+
+
+def _write_transform_inputs(tmp_path, raw_pois: pl.DataFrame):
+    """Write raw_pois plus minimal NaPTAN / grocery / GIAS / Ofsted parquet files
+    and a boundary GeoJSON under tmp_path; return their paths as transform() kwargs."""
+    input_path = tmp_path / "pois.parquet"
+    raw_pois.write_parquet(input_path)
+
+    naptan_path = tmp_path / "naptan.parquet"
+    pl.DataFrame(
+        {
+            "id": ["naptan-1"],
+            "name": ["Test Rail Station"],
+            "category": ["Rail station"],
+            "lat": [51.51],
+            "lng": [-0.13],
+        }
+    ).write_parquet(naptan_path)
+
+    grocery_path = tmp_path / "grocery.parquet"
+    pl.DataFrame(
+        {
+            "id": list(range(1, 6)),  # five stores
+            "retailer": ["Tesco"] * 5,
+            "fascia": ["Tesco"] * 5,
+            "store_name": [f"Tesco Test {i}" for i in range(1, 6)],
+            "long_wgs": [-0.14] * 5,
+            "lat_wgs": [51.52] * 5,  # all five share one location
+        }
+    ).write_parquet(grocery_path)
+
+    gias_path = tmp_path / "gias.parquet"
+    pl.DataFrame(
+        {
+            "urn": [1001],  # same URN as the Ofsted row below
+            "name": ["Test Primary School"],
+            "phase": ["Primary"],
+            "type": ["Community school"],
+            "type_group": ["Local authority maintained schools"],
+            "age_range": ["4–11"],
+            "gender": ["Mixed"],
+            "religious_character": [None],
+            "admissions_policy": ["Comprehensive"],
+            "nursery_provision": ["No"],
+            "sixth_form": ["No"],
+            "capacity": [200],
+            "pupils": [180],
+            "fsm_percent": [12.5],
+            "trust": [None],
+            "address": ["1 Test Street"],
+            "postcode": ["E1 1AA"],
+            "local_authority": ["Test LA"],
+            "website": [None],
+            "telephone": ["02012345678"],
+            "head_name": ["Jane Doe"],
+            "lat": [51.53],
+            "lng": [-0.12],
+        }
+    ).write_parquet(gias_path)
+
+    ofsted_path = tmp_path / "ofsted.parquet"
+    pl.DataFrame(
+        {
+            "URN": [1001],  # same URN as the GIAS row above
+            "Latest OEIF overall effectiveness": ["2"],
+            "Ungraded inspection overall outcome": [None],
+        }
+    ).write_parquet(ofsted_path)
+
+    boundary_path = _write_boundary(tmp_path)
+    return {
+        "input_path": input_path,
+        "naptan_path": naptan_path,
+        "boundary_path": boundary_path,
+        "grocery_retail_points_path": grocery_path,
+        "gias_path": gias_path,
+        "ofsted_path": ofsted_path,
+    }
+
+
 def test_transform_grocery_retail_points_outputs_chain_categories():
    raw = pl.DataFrame(
        {
@ -292,3 +395,79 @@ def test_school_icon_category_handles_one_sided_age_ranges():
        "Primary school",
        "School",
    ]
+
+
+def test_transform_dedupes_multi_tag_pois(tmp_path):
+    # One OSM object can carry several tag keys that map to the SAME friendly
+    # category, so pois.py emits one raw row per key with the SAME id.
+    # "amenity/pharmacy" and "shop/chemist" both map to "Pharmacy".
+    raw = pl.DataFrame(
+        {
+            "id": ["n42", "n42"],  # one OSM object, two raw rows
+            "name": ["Boots", "Boots"],
+            "category": ["amenity/pharmacy", "shop/chemist"],
+            "lat": [51.50, 51.50],
+            "lng": [-0.10, -0.10],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    # No (id, category) pair appears more than once.
+    assert out.group_by("id", "category").len()["len"].max() == 1
+    # The single physical pharmacy is present exactly once.
+    pharmacies = out.filter(
+        (pl.col("id") == "n42") & (pl.col("category") == "Pharmacy")
+    )
+    assert pharmacies.height == 1
+
+
+def test_osm_supermarkets_dropped(tmp_path):
+    # GEOLYTIX is authoritative for supermarkets; an OSM "shop/supermarket" row
+    # must not flow through as a second Groceries/Supermarket pin. A
+    # complementary grocery category (Convenience Store) must still survive.
+    raw = pl.DataFrame(
+        {
+            "id": ["n1", "n2"],
+            "name": ["Some Supermarket", "Corner Shop"],
+            "category": ["shop/supermarket", "shop/convenience"],  # dropped / kept
+            "lat": [51.50, 51.51],
+            "lng": [-0.10, -0.11],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    osm_supermarkets = out.filter(
+        (pl.col("group") == "Groceries") & (pl.col("category") == "Supermarket")
+    )
+    assert osm_supermarkets.height == 0  # filter is not source-specific: none at all
+    # Complementary OSM grocery category survives.
+    convenience = out.filter(pl.col("category") == "Convenience Store")
+    assert convenience.height == 1
+
+
+def test_transform_output_unique_per_id_category(tmp_path):
+    # Soundness: the full transform() output has at most one row per
+    # (id, category) overall, across every source.
+    raw = pl.DataFrame(
+        {
+            "id": ["n42", "n42", "n7", "n8"],  # only n42 repeats; n7/n8 are distinct
+            "name": ["Boots", "Boots", "St Mary's", "St Mary's"],
+            "category": [
+                "amenity/pharmacy",
+                "shop/chemist",
+                "amenity/place_of_worship",
+                "building/church",
+            ],
+            "lat": [51.50, 51.50, 51.55, 51.55],
+            "lng": [-0.10, -0.10, -0.15, -0.15],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.group_by("id", "category").len()["len"].max() == 1
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -6,6 +6,10 @@ import polars as pl
 from pipeline.utils.england_geometry import in_england_mask

 DROP_CATEGORIES = {
+    # GEOLYTIX Grocery Retail Points is the authoritative supermarket source
+    # (transform_grocery_retail_points), so drop OSM supermarkets to avoid
+    # double-counting each store as both a GEOLYTIX brand and an OSM "Supermarket".
+    "shop/supermarket",
    # Street furniture & infrastructure
    "amenity/advice",
    "amenity/atm",
@ -364,14 +368,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "leisure/yes",
        ],
    ),
-    (
-        "Groceries",
-        "Supermarket",
-        "🛒",
-        [
-            "shop/supermarket",
-        ],
-    ),
    (
        "Groceries",
        "Convenience Store",
@ -1534,6 +1530,14 @@ def transform(
        pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
    )

+    # A single OSM object can carry several tag keys that map to the same
+    # friendly category (e.g. amenity/pharmacy + shop/chemist -> "Pharmacy"),
+    # which pois.py emits as multiple raw rows sharing one id. Collapse those
+    # duplicates so they don't inflate downstream proximity counts; rows sharing
+    # an id with DIFFERENT categories are preserved. Other sources are
+    # pre-deduplicated.
+    lf = lf.unique(subset=["id", "category"], keep="first", maintain_order=True)
+
    naptan_df = pl.scan_parquet(naptan_path).collect()
    mask = in_england_mask(
        boundary_path,