try

2026-06-04 22:34:26 +01:00 · 2026-06-04 22:34:26 +01:00 · c938b71904
commit c938b71904
parent 843d14b7ba
13 changed files with 698 additions and 109 deletions
--- a/analyses/online_listings_buy.ipynb
+++ b/analyses/online_listings_buy.ipynb
@ -434,7 +434,8 @@
    "                color=\"white\" if v > 55 else \"black\")\n",
    "fig.colorbar(im, ax=ax, label=\"% missing\", fraction=0.025, pad=0.01)\n",
    "ax.set_title(\"Missing (null or empty-string) % by provider\")\n",
-    "plt.tight_layout(); plt.show()"
+    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
@ -640,9 +641,11 @@
    "    ax.axvline(np.log10(v), color=\"red\", ls=\"--\", lw=1)\n",
    "    ax.text(np.log10(v), ax.get_ylim()[1] * 0.92, lab, rotation=90, fontsize=7,\n",
    "            color=\"red\", va=\"top\")\n",
-    "ax.set_xlabel(\"log10(Asking price)\"); ax.set_ylabel(\"count\")\n",
+    "ax.set_xlabel(\"log10(Asking price)\")\n",
    "ax.set_ylabel(\"count\")\n",
    "ax.set_title(\"Asking price distribution (log scale)\")\n",
-    "plt.tight_layout(); plt.show()"
+    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
@ -764,9 +767,12 @@
    "ax.scatter(x[impossible], y[impossible], s=14, alpha=0.8, color=\"red\", label=\"impossible (area < beds×8)\")\n",
    "xs = np.linspace(0, x.max(), 60)\n",
    "ax.plot(xs, np.maximum(10, xs * 8), \"k--\", lw=1, label=\"area = beds × 8 m²\")\n",
-    "ax.set_xlabel(\"Bedrooms\"); ax.set_ylabel(\"Total floor area (m²)\")\n",
+    "ax.set_xlabel(\"Bedrooms\")\n",
-    "ax.set_title(\"Floor area vs bedrooms (≤160 m²)\"); ax.legend(fontsize=8)\n",
+    "ax.set_ylabel(\"Total floor area (m²)\")\n",
-    "plt.tight_layout(); plt.show()"
+    "ax.set_title(\"Floor area vs bedrooms (≤160 m²)\")\n",
    "ax.legend(fontsize=8)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
@ -947,9 +953,13 @@
    "for prov, c in colors.items():\n",
    "    gp = g.filter(pl.col(\"provider\") == prov)\n",
    "    ax.scatter(gp[\"lon\"], gp[\"lat\"], s=2, alpha=0.30, color=c, label=prov)\n",
-    "ax.set_xlabel(\"lon\"); ax.set_ylabel(\"lat\"); ax.set_aspect(\"equal\", adjustable=\"datalim\")\n",
+    "ax.set_xlabel(\"lon\")\n",
-    "ax.set_title(\"Listing coordinates by provider (25k sample)\"); ax.legend(markerscale=4, fontsize=8)\n",
+    "ax.set_ylabel(\"lat\")\n",
-    "plt.tight_layout(); plt.show()"
+    "ax.set_aspect(\"equal\", adjustable=\"datalim\")\n",
    "ax.set_title(\"Listing coordinates by provider (25k sample)\")\n",
    "ax.legend(markerscale=4, fontsize=8)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
@ -2040,16 +2050,21 @@
    "    clean.filter((pl.col(\"Bedrooms\") == 0) & (pl.col(\"Property sub-type\") != \"Studio\")).height,\n",
    "    0,\n",
    "]\n",
-    "x = np.arange(len(labels)); w = 0.38\n",
+    "x = np.arange(len(labels))\n",
    "w = 0.38\n",
    "fig, ax = plt.subplots(figsize=(10, 3.6))\n",
    "b1 = ax.bar(x - w / 2, before, w, label=\"raw\", color=\"#dc2626\")\n",
    "b2 = ax.bar(x + w / 2, after, w, label=\"clean\", color=\"#16a34a\")\n",
-    "ax.set_xticks(x); ax.set_xticklabels(labels, fontsize=8)\n",
+    "ax.set_xticks(x)\n",
-    "ax.set_yscale(\"symlog\"); ax.set_ylabel(\"value (symlog)\")\n",
+    "ax.set_xticklabels(labels, fontsize=8)\n",
-    "ax.set_title(\"Before / after cleanup\"); ax.legend()\n",
+    "ax.set_yscale(\"symlog\")\n",
    "ax.set_ylabel(\"value (symlog)\")\n",
    "ax.set_title(\"Before / after cleanup\")\n",
    "ax.legend()\n",
    "for bars in (b1, b2):\n",
    "    ax.bar_label(bars, fmt=\"%.0f\", fontsize=7, padding=2)\n",
-    "plt.tight_layout(); plt.show()"
+    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
--- a/pipeline/download/noise.py
+++ b/pipeline/download/noise.py
@ -1,7 +1,7 @@
 """Download Defra Round 4 (2022) strategic noise data for England.
 Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via
-WCS, then samples the local maximum around each postcode representative point.
+WCS, then samples the 10m cell at each postcode representative point.
 Outputs a parquet file with postcode-level noise in dB for each source.
 Uses smaller 20km tiles at native 10m resolution so values are not understated
@ -98,15 +98,21 @@ NOISE_NODATA_SENTINEL = np.float32(-96.0)
 # NOISE_COLOR_STOPS[0]) — a rendering threshold, not the data's reporting floor.
 NOISE_QUIET_FLOOR_DB = np.float32(40.0)
-# The pipeline has postcode representative points rather than complete unit
+# Sample noise at the postcode representative point itself (no neighbourhood
-# polygons here. Use a small local footprint and take the maximum 10m cell so
+# window). A 50m MAX-of-window grabbed the single loudest 10m cell within ~1.2 ha
-# postcode-level noise is not understated by centroid rounding.
+# of every postcode; because Defra road contours hug every modelled road and
-POSTCODE_NOISE_RADIUS_M = 50
+# representative points sit on/near streets, that inflated postcode noise by
 # roughly +9 dB (log scale) — making ~94% of England read >=55 dB Lden and
 # collapsing the metric's discrimination at the quiet end. Radius 0 ->
 # filter_size 1 -> the maximum_filter is skipped and each postcode reads the
 # 10m cell it actually sits in.
 POSTCODE_NOISE_RADIUS_M = 0
-# Adjacent download tiles must overlap by at least the sampling radius so every
+# Adjacent download tiles overlap by the sampling radius so every postcode's
-# postcode's 50m max-window is fully contained in at least one tile. Without
+# sampling footprint is fully contained in at least one tile. With point
-# this, a loud pixel just across a tile seam is invisible to a postcode on the
+# sampling (radius 0) this is 0 — a representative point falls inside exactly
-# far side, under-reporting noise near seams.
+# one tile — but the relationship is kept so any future non-zero radius keeps
 # its window seam-safe.
 TILE_OVERLAP_M = POSTCODE_NOISE_RADIUS_M
 # Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
@ -413,8 +419,13 @@ def sample_noise_at_postcodes(
    label: str,
    col_name: str,
 ) -> pl.Series:
-    """Sample max noise values from 10m tiles around postcode representative points."""
+    """Sample noise from 10m tiles at postcode representative points.
-    print(f"[{label}] Sampling max noise values from {len(tile_paths)} tiles...")
+
    With POSTCODE_NOISE_RADIUS_M == 0 (the default) each postcode reads the
    single 10m cell it sits in; a larger radius reduces to a max over the
    surrounding window.
    """
    print(f"[{label}] Sampling noise values from {len(tile_paths)} tiles...")
    noise_db = np.full(len(easting), np.nan, dtype=np.float32)
    radius_cells = max(0, math.ceil(POSTCODE_NOISE_RADIUS_M / RESOLUTION))
    filter_size = radius_cells * 2 + 1
--- a/pipeline/download/test_noise.py
+++ b/pipeline/download/test_noise.py
@ -126,19 +126,23 @@ def test_download_raster_raises_on_missing_strict_tiles(monkeypatch, tmp_path):
 def test_generate_tiles_neighbours_overlap_by_radius():
    # Use an explicit non-zero overlap so the assertion verifies a real positive
    # overlap. The production radius is 0 (point sampling), which would make this
    # a vacuous ">= 0" check; this keeps the seam-safety guard meaningful for any
    # future non-zero sampling radius.
    tile_size = 20_000
-    overlap = noise.POSTCODE_NOISE_RADIUS_M
+    overlap = 50
-    tiles = noise._generate_tiles(
+    tiles = noise._generate_tiles(0, 60_000, 0, 60_000, tile_size, overlap, tile_size)
        0, 60_000, 0, 60_000, tile_size, overlap, tile_size
    )
    by_origin = {(min_e, min_n): (max_e, max_n) for min_e, min_n, max_e, max_n in tiles}
    saw_horizontal_overlap = False
    # Horizontally adjacent tiles must overlap by >= overlap.
    for (min_e, min_n), (max_e, _max_n) in by_origin.items():
        right_origin = (min_e + tile_size, min_n)
        if right_origin in by_origin:
            assert max_e - right_origin[0] >= overlap
            saw_horizontal_overlap = True
    # Vertically adjacent tiles must overlap by >= overlap.
    for (min_e, min_n), (_max_e, max_n) in by_origin.items():
@ -146,6 +150,8 @@ def test_generate_tiles_neighbours_overlap_by_radius():
        if up_origin in by_origin:
            assert max_n - up_origin[1] >= overlap
    assert saw_horizontal_overlap  # the fixture actually has adjacent tiles
 def test_generate_tiles_clamps_to_grid_extent():
    tile_size = 20_000
@ -193,9 +199,7 @@ def test_sample_noise_recovers_value_across_overlapping_seam(monkeypatch, tmp_pa
    tile_size = 100
    overlap = noise.POSTCODE_NOISE_RADIUS_M
    tiles = noise._generate_tiles(0, 200, 0, 100, tile_size, overlap, tile_size)
-    by_origin = {
+    by_origin = {(min_e, min_n): (max_e, max_n) for min_e, min_n, max_e, max_n in tiles}
        (min_e, min_n): (max_e, max_n) for min_e, min_n, max_e, max_n in tiles
    }
    left_min_e, left_min_n = 0, 0
    left_max_e, left_max_n = by_origin[(left_min_e, left_min_n)]
    # Overlap fix is what makes the left tile reach across the seam.
@ -269,7 +273,9 @@ def test_sample_noise_distinguishes_nodata_from_in_coverage_quiet(
    assert result.to_list() == [None, float(noise.NOISE_QUIET_FLOOR_DB), 65.0]
-def test_sample_noise_preserves_genuine_reading_above_quiet_floor(monkeypatch, tmp_path):
+def test_sample_noise_preserves_genuine_reading_above_quiet_floor(
    monkeypatch, tmp_path
 ):
    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 0)
    monkeypatch.setattr(noise, "RESOLUTION", 10)
@ -324,6 +330,30 @@ def test_sample_noise_nodata_window_stays_null(monkeypatch, tmp_path):
    assert result.to_list() == [None]
 def test_sample_noise_default_radius_samples_at_point_not_window(monkeypatch, tmp_path):
    # Regression: production samples noise at the postcode's own 10m cell
    # (POSTCODE_NOISE_RADIUS_M == 0), NOT a max-of-window that would grab the
    # loudest nearby road cell and inflate every postcode's noise by ~+9 dB.
    monkeypatch.setattr(noise, "RESOLUTION", 10)
    assert noise.POSTCODE_NOISE_RADIUS_M == 0
    # Cell 0 = quiet (at the 40 dB floor), cell 1 = loud road (70), adjacent.
    data = np.array([[40.0, 70.0]], dtype=np.float32)
    _write_geotiff(tmp_path / "noise.tif", data, 0, 10, 10, nodata=-96.0)
    result = noise.sample_noise_at_postcodes(
        [tmp_path / "noise.tif"],
        # Cell centres: easting 5 -> quiet cell 0; the loud cell 1 is at 15.
        easting=np.array([5.0]),
        northing=np.array([5.0]),
        label="Road",
        col_name="road_noise_lden_db",
    )
    # Point sampling reads the quiet own-cell (40), not the loud neighbour (70).
    assert result.to_list() == [40.0]
 def test_sample_noise_at_postcodes_uses_local_maximum(monkeypatch, tmp_path):
    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
    monkeypatch.setattr(noise, "RESOLUTION", 10)
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -21,7 +21,15 @@ from ..utils import (
 pl.Config.set_tbl_cols(-1)
 RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
-MIN_PRICE = 50_000
+# Value-quality floor for price aggregations. A flat nominal floor is a blunt
 # tool against a deflating threshold — £50k was completely normal for a 1990s
 # house, so a 50k floor wrongly discarded ~a third of legitimate 1990s
 # open-market sales (and deleted properties whose only sales were old/cheap),
 # biasing early-year price history upward. 10k recovers the large [10k,50k)
 # band of genuine cheaper sales while still excluding the nominal/junk transfers
 # (£1 etc.). A small tail of real sub-10k sales is still dropped — a deliberate
 # conservative tradeoff to keep clearly-implausible transfers out.
 MIN_PRICE = 10_000
 # Plausible construction-year range; band-derived years outside it (e.g. OCR
 # noise like 1012 or 2202) are nulled rather than published.
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -30,6 +30,7 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
 MIN_FLOOR_AREA_M2 = 10
 CONSERVATION_AREA_FEATURE = "Within conservation area"
 TREE_DENSITY_FEATURE = "Street tree density percentile"
 LISTED_BUILDING_FEATURE = "Listed building"
 LISTED_BUILDING_MATCH_RADIUS_M = 250.0
 LISTED_BUILDING_NEAREST_POSTCODES = 3
@ -92,6 +93,10 @@ _AREA_COLUMNS = [
    "Noise (dB)",
    "Max available download speed (Mbps)",
    CONSERVATION_AREA_FEATURE,
    # Tree canopy is a 50m-radius percentile around the postcode centroid, so it
    # is postcode-grain: it belongs in the area output (one value per postcode,
    # covering property-less postcodes too) rather than duplicated per property.
    TREE_DENSITY_FEATURE,
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
@ -116,7 +121,6 @@ _AREA_COLUMNS = [
 _DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
 _DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
 TREE_DENSITY_FEATURE = "Street tree density percentile"
 _POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
    r"^Tree canopy density percentile within \d+m$"
 )
@ -818,9 +822,9 @@ def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
    untouched. pp_address is non-null here (join_epc_pp filters it), so the key
    never merges unrelated rows.
    """
-    return wide.sort(
+    return wide.sort("date_of_transfer", descending=True, nulls_last=True).unique(
-        "date_of_transfer", descending=True, nulls_last=True
+        subset=["postcode", "pp_address"], keep="first", maintain_order=True
-    ).unique(subset=["postcode", "pp_address"], keep="first", maintain_order=True)
+    )
 def _filter_to_active_english_postcodes(
@ -1108,13 +1112,26 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    return epc_band_to_year(pl.col(column))
-def _address_score(query: str, candidate: str | None) -> int:
+def _address_score(query: str, candidate: str | None, *, allow_token_set: bool) -> int:
    if not candidate:
        return 0
-    return max(
+    # token_set_ratio returns 100 whenever the shorter token set is a subset of
-        fuzz.token_set_ratio(query, candidate),
+    # the longer. For a NUMBER-LESS query that is unsafe — a single locality
-        fuzz.token_sort_ratio(query, candidate),
+    # token (e.g. "KINGSWOOD") subsets to 100 against any long address that
-    )
+    # merely contains it — so number-less queries score with token_sort_ratio
    # only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
    # query the unconditional _numbers_compatible gate has already guaranteed the
    # candidate carries compatible house numbers, so token_set cannot inflate
    # across different addresses; allowing it recovers genuine matches where the
    # scraped listing appends trailing town/county tokens the bare register
    # address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
    # DRIVE").
    if allow_token_set:
        return max(
            fuzz.token_set_ratio(query, candidate),
            fuzz.token_sort_ratio(query, candidate),
        )
    return fuzz.token_sort_ratio(query, candidate)
 def _has_number(address: str | None) -> bool:
@ -1153,9 +1170,12 @@ def _best_listing_match(
    ``uprn_index`` (postcode-independent, so it is robust even when the
    listing's postcode is slightly off); (2) failing that, the highest
    fuzzy street-address similarity within the listing's own postcode bucket.
-    No property-attribute heuristics are used — a house number in the listing
+    No property-attribute heuristics are used — `_numbers_compatible` gates
-    address gates the fuzzy match (`_numbers_compatible`) and lowers the score
+    every fuzzy match unconditionally (so a number-less listing can never match
-    threshold; a number-less address must match the street almost exactly.
+    a numbered property, and vice versa), as in the canonical
    `fuzzy_join._score_bucket`. A house number additionally lowers the score
    threshold and (via `_address_score`) permits token_set scoring; a number-less
    address scores on token_sort only and must match the street almost exactly.
    ``addressed_fields`` names the candidate columns to fuzzy-match against (a
    candidate may carry both a register and an EPC address). Returns
@ -1180,9 +1200,11 @@ def _best_listing_match(
            address = candidate.get(field)
            if not address:
                continue
-            if listing_has_numbers and not _numbers_compatible(query, address):
+            # Unconditional number gate (matches fuzzy_join): a number-less
            # listing cannot match a numbered candidate and vice versa.
            if not _numbers_compatible(query, address):
                continue
-            score = _address_score(query, address)
+            score = _address_score(query, address, allow_token_set=listing_has_numbers)
            if score > best_score:
                best_score = score
                best = candidate
@ -1675,7 +1697,9 @@ def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
        # "Yes". "Former council house" should fire if EITHER side says so.
        if raw_column == "was_council_house":
            return (
-                pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
+                pl.when(
                    (pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes")
                )
                .then(pl.lit("Yes"))
                .otherwise(coalesce)
                .alias(raw_column)
@ -1716,9 +1740,13 @@ def _build_unmatched_listing_seed_rows(
        "total_floor_area": pl.coalesce(
            pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area")
        ),
        # Prefer the direct-EPC habitable-room count over the listing's value:
        # the scraped room count is bedrooms + bathrooms (upstream storage.py
        # defect), so it over-counts. Fall back to the listing value only when
        # the direct-EPC match has no count.
        "number_habitable_rooms": pl.coalesce(
            pl.col("_actual_number_habitable_rooms"),
            pl.col("_direct_number_habitable_rooms"),
            pl.col("_actual_number_habitable_rooms"),
        ),
        "latest_price": pl.col("_actual_asking_price"),
    }
@ -1836,14 +1864,19 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
        # Listing coordinates win over the postcode centroid.
        pl.coalesce(pl.col("_actual_lat").cast(pl.Float64), pl.col("lat")).alias("lat"),
        pl.coalesce(pl.col("_actual_lon").cast(pl.Float64), pl.col("lon")).alias("lon"),
-        # Listing's floor area / rooms override any EPC/PP value when present.
+        # Listing's floor area overrides any EPC/PP value when present.
        pl.coalesce(
            pl.col("_actual_total_floor_area").cast(pl.Float64),
            pl.col("Total floor area (sqm)"),
        ).alias("Total floor area (sqm)"),
        # Rooms: prefer the EPC habitable-room count and fall back to the listing
        # value only when no EPC count exists. The scraped "Number of bedrooms &
        # living rooms" is actually bedrooms + bathrooms (an upstream storage.py
        # defect), so preferring it would inflate the room count and overwrite a
        # correct EPC value.
        pl.coalesce(
            pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
            pl.col("Number of bedrooms & living rooms"),
            pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
        ).alias("Number of bedrooms & living rooms"),
        pl.when(pl.col("_actual_property_type").is_in(_PROPERTY_TYPE_VALUES))
        .then(pl.col("_actual_property_type"))
@ -2130,12 +2163,15 @@ def _build(
        pl.when(
            (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
            & (
-                (pl.col("latest_price") / pl.col("total_floor_area"))
+                (pl.col("latest_price") / pl.col("total_floor_area")).is_between(
-                .is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
+                    MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM
                )
            )
        )
        .then(
-            (pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
+            (pl.col("latest_price") / pl.col("total_floor_area"))
            .round(0)
            .cast(pl.Int32)
        )
        .otherwise(None)
        .alias("Price per sqm"),
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@ -378,7 +378,10 @@ def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: P
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(
        {
-            "price": [30_000, 300_000],
+            # 5_000 is below MIN_PRICE (10_000) — a nominal/junk transfer that
            # must still anchor the construction year but stay out of the price
            # aggregations.
            "price": [5_000, 300_000],
            "date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
            "property_type": ["T", "T"],
            "postcode": ["AA1 1AA", "AA1 1AA"],
@ -408,6 +411,48 @@ def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: P
    assert df.get_column("historical_prices").list.len().to_list() == [1]
 def test_run_keeps_sale_above_lowered_min_price(tmp_path: Path):
    # A genuine cheap sale of 30_000 sits between the OLD floor (50k) and the
    # NEW floor (10k): it must now be RETAINED in the price aggregations. This
    # pins the 50k->10k change — it fails on the pre-fix 50k floor (where 30k was
    # excluded, giving historical_prices length 1 / latest_price 250_000).
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        csv_buffer = io.StringIO()
        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
        writer.writeheader()
        writer.writerow(_row())
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(
        {
            "price": [250_000, 30_000],
            "date_of_transfer": [date(2018, 2, 3), date(2022, 2, 3)],
            "property_type": ["T", "T"],
            "postcode": ["AA1 1AA", "AA1 1AA"],
            "paon": ["1", "1"],
            "saon": [None, None],
            "street": ["Example Street", "Example Street"],
            "locality": [None, None],
            "town_city": ["Exampletown", "Exampletown"],
            "duration": ["F", "F"],
            "old_new": ["N", "N"],
            "ppd_category": ["A", "A"],
        }
    ).write_parquet(price_paid_path)
    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)
    df = pl.read_parquet(output_path)
    assert df.height == 1
    # Both sales now survive the 10k floor; the 30_000 (2022) is the most recent.
    assert df.get_column("historical_prices").list.len().to_list() == [2]
    assert df.get_column("latest_price").to_list() == [30_000]
 def test_epc_band_to_year_uses_midpoint_and_clamps():
    import polars as pl
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -13,6 +13,7 @@ from pipeline.transform.merge import (
    _active_english_postcode_area,
    _build_unmatched_listing_seed_rows,
    _canonical_postcode_expr,
    _best_listing_match,
    _coalesce_direct_epc_columns,
    _dedupe_collapsed_properties,
    _filter_to_active_english_postcodes,
@ -78,6 +79,40 @@ def test_conservation_area_feature_is_area_level() -> None:
    assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
 def test_tree_density_is_area_level_and_survives_the_split() -> None:
    # Street tree density is a postcode-centroid percentile (constant per
    # postcode), so it must route to the postcode/area output -- not be stripped
    # by _area_columns_from -- and must NOT be duplicated into the property
    # output. Regression for the drift where it landed only in properties.parquet
    # and was lost for the ~308k property-less postcodes.
    assert TREE_DENSITY_FEATURE in _AREA_COLUMNS
    df = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Last known price": [250_000],
            TREE_DENSITY_FEATURE: [42.0],
        }
    )
    postcode_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            TREE_DENSITY_FEATURE: [42.0, 7.0],
        }
    )
    postcode_df, properties_df = _split_normal_outputs(
        df, postcode_features, expected_postcode_count=2
    )
    assert TREE_DENSITY_FEATURE in postcode_df.columns
    assert postcode_df[TREE_DENSITY_FEATURE].to_list() == [42.0, 7.0]
    assert TREE_DENSITY_FEATURE not in properties_df.columns
 def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    # Crime is now a raw spatial count per postcode; the per-1k-residents
    # variants were dropped along with the LSOA population denominator.
@ -767,6 +802,41 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    assert seed["was_council_house"].to_list() == ["No"]
 def test_build_unmatched_listing_seed_rows_prefers_direct_epc_rooms_over_listing(
    tmp_path,
 ) -> None:
    # When BOTH the listing room count and a direct-EPC count exist, the EPC
    # value must win: the scraped "Number of bedrooms & living rooms" is actually
    # bedrooms + bathrooms (upstream defect), so preferring it would inflate the
    # count. This pins the coalesce direction (direct-EPC before listing).
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().with_columns(
        # The corrupt listing room count (beds + baths).
        pl.lit(5, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
    ).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        # The genuine EPC habitable-room count.
        pl.lit(3, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
    )
    template_schema = pl.Schema(
        {
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "number_habitable_rooms": pl.Int16,
            **{dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES},
        }
    )
    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )
    assert seed["number_habitable_rooms"].to_list() == [3]
 _DIRECT_EPC_CANDIDATE_SCHEMA = {
    "_direct_epc_row": pl.UInt32,
    "_direct_epc_match_address": pl.Utf8,
@ -1249,6 +1319,98 @@ def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> Non
    assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
 def test_best_listing_match_rejects_numberless_listing_against_numbered_property() -> (
    None
 ):
    # Regression: a number-less listing (street/locality only) must NOT match a
    # numbered property. The number gate is unconditional (like fuzzy_join), and
    # the score is token_sort_ratio only, so a single locality token can no
    # longer subset-inflate to 100 against a long numbered address.
    candidates = [{"pp_address": "FLAT A3 CHESHAM HEIGHTS ST MONICAS ROAD"}]
    result = _best_listing_match(
        listing_uprn=None,
        query="KINGSWOOD",
        uprn_index={},
        bucket_candidates=candidates,
        addressed_fields=["pp_address"],
    )
    assert result is None
 def test_best_listing_match_allows_numberless_to_numberless_named_house() -> None:
    # A number-less listing CAN still match a number-less (named-house) property
    # when the street/name matches almost exactly.
    candidates = [{"pp_address": "WOODLANDS HOUSE OAK LANE"}]
    result = _best_listing_match(
        listing_uprn=None,
        query="WOODLANDS HOUSE OAK LANE",
        uprn_index={},
        bucket_candidates=candidates,
        addressed_fields=["pp_address"],
    )
    assert result is not None
    candidate, score, method, field = result
    assert method == "address"
    assert score >= 90.0
 def test_best_listing_match_still_matches_numbered_listing_to_numbered_property() -> (
    None
 ):
    # No regression for numbered listings: the number gate still permits a
    # compatible house number and the lower with-numbers threshold applies.
    candidates = [{"pp_address": "10 OAK LANE"}]
    result = _best_listing_match(
        listing_uprn=None,
        query="10 OAK LANE",
        uprn_index={},
        bucket_candidates=candidates,
        addressed_fields=["pp_address"],
    )
    assert result is not None
    _candidate, score, method, _field = result
    assert method == "address"
    assert score >= 82.0
 def test_best_listing_match_numbered_listing_with_trailing_locality_still_matches() -> (
    None
 ):
    # A scraped numbered listing often appends town/county tokens that the bare
    # Price-Paid register address omits. token_sort alone would score this ~73
    # (below 82) and drop a correct match; token_set (allowed for numbered
    # queries, where the number gate makes it safe) recovers it.
    candidates = [{"pp_address": "105 RIDGEWAY DRIVE"}]
    result = _best_listing_match(
        listing_uprn=None,
        query="105 RIDGEWAY DRIVE BROMLEY KENT",
        uprn_index={},
        bucket_candidates=candidates,
        addressed_fields=["pp_address"],
    )
    assert result is not None
    candidate, score, _method, _field = result
    assert candidate["pp_address"] == "105 RIDGEWAY DRIVE"
    assert score >= 82.0
 def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers() -> (
    None
 ):
    # token_set for numbered queries is safe only because the number gate runs
    # first: a query and candidate with incompatible house numbers never reach
    # scoring, so token_set cannot inflate "10 OAK LANE" onto "12 OAK LANE".
    candidates = [{"pp_address": "12 OAK LANE KINGSTON"}]
    result = _best_listing_match(
        listing_uprn=None,
        query="10 OAK LANE",
        uprn_index={},
        bucket_candidates=candidates,
        addressed_fields=["pp_address"],
    )
    assert result is None
 def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
    None
 ):
@ -1325,9 +1487,12 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
    assert finalized["Est. price per sqm"].to_list() == [5_000, 7_368]
    assert finalized["Estimated current price"].to_list() == [600_000, 700_000]
    assert finalized["Last known price"].to_list() == [500_000, 700_000]
-    # Listing's preferred floor area / rooms / property type / tenure.
+    # Listing's preferred floor area / property type / tenure.
    assert finalized["Total floor area (sqm)"].to_list() == [110.0, 95.0]
-    assert finalized["Number of bedrooms & living rooms"].to_list() == [4, 3]
+    # Rooms prefer the EPC habitable-room count over the listing's beds+baths
    # value: row 0 keeps the EPC 3 (not the listing's _actual 4); row 1 has no
    # EPC count so it falls back to the listing's 3.
    assert finalized["Number of bedrooms & living rooms"].to_list() == [3, 3]
    assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
    assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
    # Postcode-level feature carried through to both matched and unmatched rows.
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@ -5,11 +5,72 @@ import polars as pl
 from pipeline.transform.transform_poi import (
    _load_ofsted_ratings,
    _school_icon_category_expr,
    osm_groceries_colocated_with_geolytix,
    transform,
    transform_grocery_retail_points,
 )
 def test_osm_groceries_colocated_with_geolytix_drops_only_brand_matched_duplicates():
    # GEOLYTIX is authoritative for its chains. An OSM grocery that sits on top
    # of a GEOLYTIX store AND carries its brand is the same physical store and
    # must be dropped; an independent shop at the same spot, and a same-brand
    # store far from any GEOLYTIX point, must be kept.
    geolytix = pl.DataFrame(
        {
            "category": ["Tesco"],
            "lat": [51.5000],
            "lng": [-0.1000],
        }
    )
    osm = pl.DataFrame(
        {
            "id": ["dup-brand", "independent", "far-brand"],
            "name": ["Tesco Express", "Bob's Corner Shop", "Tesco Express"],
            # ~1 m, ~2 m, and ~55 km from the GEOLYTIX Tesco.
            "lat": [51.50001, 51.50002, 52.0],
            "lng": [-0.10001, -0.1000, -1.0],
        }
    )
    drop_ids = osm_groceries_colocated_with_geolytix(osm, geolytix, radius_m=50.0)
    assert drop_ids == ["dup-brand"]
 def test_osm_groceries_colocated_with_geolytix_dedupes_cooperative_spelling():
    # GEOLYTIX brand "Co-op" tokenises to "coop"; OSM commonly spells it
    # "The Co-operative Food" -> "cooperative". The alias folds them so the
    # genuine duplicate is still dropped.
    geolytix = pl.DataFrame({"category": ["Co-op"], "lat": [53.0], "lng": [-1.5]})
    osm = pl.DataFrame(
        {
            "id": ["coop-dup"],
            "name": ["The Co-operative Food"],
            "lat": [53.00001],
            "lng": [-1.5],
        }
    )
    assert osm_groceries_colocated_with_geolytix(osm, geolytix, radius_m=50.0) == [
        "coop-dup"
    ]
 def test_osm_groceries_colocated_with_geolytix_handles_empty_inputs():
    geolytix = pl.DataFrame({"category": ["Tesco"], "lat": [51.5], "lng": [-0.1]})
    empty = pl.DataFrame(
        schema={"id": pl.Utf8, "name": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
    )
    assert osm_groceries_colocated_with_geolytix(empty, geolytix) == []
    osm = pl.DataFrame(
        {"id": ["x"], "name": ["Tesco Express"], "lat": [51.5], "lng": [-0.1]}
    )
    empty_glx = pl.DataFrame(
        schema={"category": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
    )
    assert osm_groceries_colocated_with_geolytix(osm, empty_glx) == []
 def _write_boundary(tmp_path):
    """A FeatureCollection whose single feature covers the London-area test
    coords used by the transform() fixtures, so in_england_mask keeps them."""
@ -345,12 +406,7 @@ def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
        }
    ).write_parquet(ofsted_path)
-    ratings = (
+    ratings = _load_ofsted_ratings(ofsted_path).collect().sort("urn").to_dicts()
        _load_ofsted_ratings(ofsted_path)
        .collect()
        .sort("urn")
        .to_dicts()
    )
    assert ratings == [
        {"urn": 1, "ofsted_rating": "Outstanding"},
@ -384,9 +440,9 @@ def test_school_icon_category_handles_one_sided_age_ranges():
        },
    )
-    categories = df.select(
+    categories = df.select(_school_icon_category_expr().alias("category"))[
-        _school_icon_category_expr().alias("category")
+        "category"
-    )["category"].to_list()
+    ].to_list()
    assert categories == [
        "Nursery school",
@ -449,6 +505,45 @@ def test_osm_supermarkets_dropped(tmp_path):
    assert convenience.height == 1
 def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
    # The _write_transform_inputs fixture seeds 5 GEOLYTIX "Tesco" points at
    # (51.52, -0.14). An OSM object colocated there carrying "Tesco" in its name
    # is the same physical store, so its Convenience Store (Groceries) row is a
    # duplicate and must be dropped — but its NON-grocery aspect (a Post Office
    # sharing the same OSM id) must survive. An independent shop away from the
    # GEOLYTIX point keeps its grocery row.
    raw = pl.DataFrame(
        {
            "id": ["n1", "n1", "n2"],
            "name": ["Tesco Express", "Tesco Express", "Corner Shop"],
            "category": [
                "shop/convenience",
                "amenity/post_office",
                "shop/convenience",
            ],
            "lat": [51.52, 51.52, 51.40],
            "lng": [-0.14, -0.14, -0.05],
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)
    out = transform(**inputs).collect()
    # The colocated, brand-matched grocery row is dropped.
    n1_grocery = out.filter((pl.col("id") == "n1") & (pl.col("group") == "Groceries"))
    assert n1_grocery.height == 0
    # Its non-grocery aspect (Post Office) survives.
    n1_post_office = out.filter(
        (pl.col("id") == "n1") & (pl.col("category") == "Post Office")
    )
    assert n1_post_office.height == 1
    # The independent corner shop (no brand, far away) keeps its grocery row.
    n2_grocery = out.filter(
        (pl.col("id") == "n2") & (pl.col("category") == "Convenience Store")
    )
    assert n2_grocery.height == 1
 def test_transform_output_unique_per_id_category(tmp_path):
    # Soundness: the full transform() output has at most one row per
    # (id, category) overall, across every source.
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -1,6 +1,7 @@
 import argparse
 from pathlib import Path
 import numpy as np
 import polars as pl
 from pipeline.utils.england_geometry import in_england_mask
@ -955,7 +956,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
    # Note: schools come from the GIAS register (see transform_gias_schools).
    # Niche/tertiary education amenities that GIAS does not cover are dropped
    # rather than mixed in with state-funded schools.
    (
        "Local Businesses",
        "Hotel",
@ -1441,38 +1441,128 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
    # category mirrors icon_category so the dashboard renders one toggle per
    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
    # instead of bundling every GIAS row under a single "School" pill.
-    return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
+    return (
-        pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
+        pl.scan_parquet(gias_path)
-        pl.col("name"),
+        .join(ofsted, on="urn", how="left")
-        icon_category_expr.alias("category"),
+        .select(
-        icon_category_expr.alias("icon_category"),
+            pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
-        pl.lit("Education").alias("group"),
+            pl.col("name"),
-        pl.col("lat").cast(pl.Float64),
+            icon_category_expr.alias("category"),
-        pl.col("lng").cast(pl.Float64),
+            icon_category_expr.alias("icon_category"),
-        emoji_expr.alias("emoji"),
+            pl.lit("Education").alias("group"),
-        pl.col("phase").alias("school_phase"),
+            pl.col("lat").cast(pl.Float64),
-        pl.col("type").alias("school_type"),
+            pl.col("lng").cast(pl.Float64),
-        pl.col("type_group").alias("school_type_group"),
+            emoji_expr.alias("emoji"),
-        pl.col("age_range").alias("school_age_range"),
+            pl.col("phase").alias("school_phase"),
-        pl.col("gender").alias("school_gender"),
+            pl.col("type").alias("school_type"),
-        pl.col("religious_character").alias("school_religious_character"),
+            pl.col("type_group").alias("school_type_group"),
-        pl.col("admissions_policy").alias("school_admissions_policy"),
+            pl.col("age_range").alias("school_age_range"),
-        pl.col("nursery_provision").alias("school_nursery_provision"),
+            pl.col("gender").alias("school_gender"),
-        pl.col("sixth_form").alias("school_sixth_form"),
+            pl.col("religious_character").alias("school_religious_character"),
-        pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
+            pl.col("admissions_policy").alias("school_admissions_policy"),
-        pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
+            pl.col("nursery_provision").alias("school_nursery_provision"),
-        pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
+            pl.col("sixth_form").alias("school_sixth_form"),
-        pl.col("trust").alias("school_trust"),
+            pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
-        pl.col("address").alias("school_address"),
+            pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
-        pl.col("postcode").alias("school_postcode"),
+            pl.col("fsm_percent")
-        pl.col("local_authority").alias("school_local_authority"),
+            .cast(pl.Float32, strict=False)
-        pl.col("website").alias("school_website"),
+            .alias("school_fsm_percent"),
-        pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
+            pl.col("trust").alias("school_trust"),
-        pl.col("head_name").alias("school_head_name"),
+            pl.col("address").alias("school_address"),
-        pl.col("ofsted_rating").alias("school_ofsted_rating"),
+            pl.col("postcode").alias("school_postcode"),
            pl.col("local_authority").alias("school_local_authority"),
            pl.col("website").alias("school_website"),
            pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
            pl.col("head_name").alias("school_head_name"),
            pl.col("ofsted_rating").alias("school_ofsted_rating"),
        )
    )
 # OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
 # Sainsbury's Local, Co-op Food, Morrisons Daily, Spar, ...) would otherwise be
 # counted twice: once as a GEOLYTIX brand row and once as an OSM "Convenience
 # Store". GEOLYTIX is authoritative for its chains, so an OSM grocery row that
 # sits on top of a GEOLYTIX point AND carries that point's brand name is the
 # same physical store and is dropped. Independent corner shops never carry a
 # chain brand, so they are kept.
 GROCERY_DEDUP_RADIUS_M = 50.0
 # Brand-token aliases so an OSM name spelt differently from the GEOLYTIX brand
 # still matches. GEOLYTIX's "Co-op" tokenises to "coop", but OSM frequently
 # spells it "The Co-operative Food" -> "cooperative"; without this, ~300+ genuine
 # Co-op duplicates would survive. Keys/values are post-strip (alnum-only) tokens.
 _GROCERY_TOKEN_ALIASES = {
    "cooperative": "coop",
    "cooperatives": "coop",
 }
 def _significant_tokens(name: str | None) -> set[str]:
    """Lower-case alphanumeric tokens of length >= 3 from a POI name (aliased)."""
    if not name:
        return set()
    tokens: set[str] = set()
    for raw in str(name).lower().split():
        token = "".join(ch for ch in raw if ch.isalnum())
        if len(token) >= 3:
            tokens.add(_GROCERY_TOKEN_ALIASES.get(token, token))
    return tokens
 def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
    radius_m: float = GROCERY_DEDUP_RADIUS_M,
 ) -> list[str]:
    """Return OSM grocery ids that duplicate a GEOLYTIX store.
    An OSM Groceries row is a duplicate when a GEOLYTIX point lies within
    ``radius_m`` metres AND that point's brand tokens (its ``category``, e.g.
    "Tesco", "Co-op") are all present in the OSM row's name — i.e. the same
    physical branded store. Brands with no token >= 3 chars (e.g. "M&S") never
    match, so they are conservatively kept rather than risk a false drop.
    ``osm_groceries`` needs columns ``id``, ``name``, ``lat``, ``lng``;
    ``geolytix`` needs ``category`` (the brand), ``lat``, ``lng``.
    """
    if osm_groceries.is_empty() or geolytix.is_empty():
        return []
    from scipy.spatial import cKDTree
    glx_lat = geolytix["lat"].to_numpy().astype(float)
    glx_lng = geolytix["lng"].to_numpy().astype(float)
    glx_brand_tokens = [_significant_tokens(b) for b in geolytix["category"].to_list()]
    osm_lat = osm_groceries["lat"].to_numpy().astype(float)
    osm_lng = osm_groceries["lng"].to_numpy().astype(float)
    osm_ids = osm_groceries["id"].to_list()
    osm_name_tokens = [_significant_tokens(n) for n in osm_groceries["name"].to_list()]
    # Equirectangular projection to metres around the shared mean latitude — at
    # England's scale this is accurate to well under the dedup radius.
    mean_lat = float(np.mean(np.concatenate([glx_lat, osm_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))
    glx_xy = np.column_stack([glx_lng * cos_lat * 111_320.0, glx_lat * 110_540.0])
    osm_xy = np.column_stack([osm_lng * cos_lat * 111_320.0, osm_lat * 110_540.0])
    tree = cKDTree(glx_xy)
    neighbours = tree.query_ball_point(osm_xy, r=radius_m)
    drop_ids: list[str] = []
    for osm_idx, glx_indices in enumerate(neighbours):
        tokens = osm_name_tokens[osm_idx]
        if not tokens:
            continue
        for glx_idx in glx_indices:
            brand = glx_brand_tokens[glx_idx]
            if brand and brand.issubset(tokens):
                drop_ids.append(osm_ids[osm_idx])
                break
    return drop_ids
 def transform(
    input_path: Path,
    naptan_path: Path,
@ -1553,6 +1643,27 @@ def transform(
    grocery_df = pl.read_parquet(grocery_retail_points_path)
    grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
    # Drop OSM grocery rows that duplicate a GEOLYTIX store (same brand,
    # colocated) so a Tesco Express / Co-op / Spar isn't counted twice.
    osm_groceries = (
        lf.filter(pl.col("group") == "Groceries")
        .select("id", "name", "lat", "lng")
        .collect(engine="streaming")
    )
    duplicate_ids = osm_groceries_colocated_with_geolytix(osm_groceries, grocery_pois)
    if duplicate_ids:
        print(
            f"Dropping {len(duplicate_ids):,} OSM grocery POIs that duplicate a "
            "GEOLYTIX store"
        )
        # Scope the drop to the Groceries group: a single OSM object can also
        # carry a non-grocery aspect (e.g. a convenience store that is also a
        # Post Office), which must survive — only its duplicate grocery row goes.
        lf = lf.filter(
            ~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
        )
    frames = [
        lf,
        naptan,
--- a/server-rs/Cargo.lock
+++ b/server-rs/Cargo.lock
@ -3901,6 +3901,7 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2 0.11.0",
 "tikv-jemalloc-sys",
 "tikv-jemallocator",
 "tokio",
 "tower",
--- a/server-rs/Cargo.toml
+++ b/server-rs/Cargo.toml
@ -41,6 +41,10 @@ sentry = { version = "0.46.0", default-features = false, features = ["backtrace"
 # steady-state RSS). Decay is configured via `malloc_conf` in main.rs.
 [target.'cfg(not(target_env = "msvc"))'.dependencies]
 tikv-jemallocator = { version = "0.6", features = ["unprefixed_malloc_on_supported_platforms"] }
 # Direct mallctl access so we can force-purge dirty pages back to the OS after the
 # data load (jemalloc's idle decay/background_thread doesn't reliably return the
 # ~30GB of load-time transient buffers without subsequent allocation activity).
 tikv-jemalloc-sys = "0.6"
 [lints.clippy]
 min_ident_chars = "warn"
--- a/server-rs/src/main.rs
+++ b/server-rs/src/main.rs
@ -136,9 +136,74 @@ fn resident_memory_kib() -> Option<u64> {
    })
 }
 /// Force jemalloc to return all dirty/muzzy pages to the OS immediately.
 ///
 /// jemalloc keeps freed pages "dirty" for fast reuse and only hands them back to
 /// the OS via decay. Under the bursty allocate/free of request handling (and the
 /// huge startup-load transients) that decay lags badly, so RSS balloons far above
 /// the live working set. `arena.4096.purge` (4096 == `MALLCTL_ARENAS_ALL`) purges
 /// every arena synchronously, dropping RSS back down.
 #[cfg(target_os = "linux")]
 fn jemalloc_purge() {
    const PURGE: &[u8] = b"arena.4096.purge\0";
    // Safety: a write-only mallctl with no input/output buffers; the name is a
    // valid NUL-terminated jemalloc control string.
    unsafe {
        tikv_jemalloc_sys::mallctl(
            PURGE.as_ptr().cast(),
            std::ptr::null_mut(),
            std::ptr::null_mut(),
            std::ptr::null_mut(),
            0,
        );
    }
 }
 #[cfg(not(target_os = "linux"))]
 fn jemalloc_purge() {}
 /// Enable jemalloc's background purge thread at runtime. Setting it via
 /// `background_thread:true` in `malloc_conf` is unreliable (it can be silently
 /// dropped depending on init ordering), so we also flip it explicitly here.
 #[cfg(target_os = "linux")]
 fn enable_jemalloc_background_thread() {
    let enabled: bool = true;
    // Safety: write-only mallctl of a bool to a valid control name.
    unsafe {
        tikv_jemalloc_sys::mallctl(
            b"background_thread\0".as_ptr().cast(),
            std::ptr::null_mut(),
            std::ptr::null_mut(),
            std::ptr::addr_of!(enabled) as *mut core::ffi::c_void,
            std::mem::size_of::<bool>(),
        );
    }
 }
 #[cfg(not(target_os = "linux"))]
 fn enable_jemalloc_background_thread() {}
 /// Periodically purge jemalloc arenas so request-handling transients are returned
 /// to the OS instead of accumulating as dirty pages. A plain OS thread (not a
 /// tokio task) keeps the madvise sweep off the async runtime.
 #[cfg(target_os = "linux")]
 fn spawn_jemalloc_purger() {
    std::thread::Builder::new()
        .name("jemalloc-purge".to_string())
        .spawn(|| loop {
            std::thread::sleep(Duration::from_secs(10));
            jemalloc_purge();
        })
        .ok();
 }
 #[cfg(not(target_os = "linux"))]
 fn spawn_jemalloc_purger() {}
 #[cfg(target_os = "linux")]
 fn trim_allocator(label: &'static str) {
    let before = resident_memory_kib();
    jemalloc_purge();
    let trimmed = unsafe { libc::malloc_trim(0) };
    let after = resident_memory_kib();
    if let (Some(before), Some(after)) = (before, after) {
@ -401,6 +466,12 @@ async fn main() -> anyhow::Result<()> {
        )
        .init();
    // Keep jemalloc from hoarding freed memory: run its background purge thread
    // and a periodic explicit purge so load-time and request-time transients are
    // returned to the OS instead of inflating RSS.
    enable_jemalloc_background_thread();
    spawn_jemalloc_purger();
    // Initialize Prometheus metrics
    let metrics_handle = metrics::init_metrics();
    info!("Prometheus metrics initialized");
@ -1016,17 +1087,13 @@ async fn main() -> anyhow::Result<()> {
            .layer(sentry::integrations::tower::SentryHttpLayer::new()),
    );
-    // Lock all current and future memory pages to prevent swapping
+    // NOTE: we deliberately do NOT mlockall() here. Locking MCL_CURRENT|MCL_FUTURE
-    unsafe {
+    // pinned the allocator's entire mapped heap — including jemalloc's freed/dirty
-        if libc::mlockall(libc::MCL_CURRENT | libc::MCL_FUTURE) != 0 {
+    // pages — resident and non-reclaimable, inflating RSS from the ~10GB working
-            let err = std::io::Error::last_os_error();
+    // set to ~40GB and defeating the allocator's page-return entirely. The hot
-            tracing::warn!(
+    // working set stays resident naturally; freed pages are returned to the OS.
-                "mlockall failed (need CAP_IPC_LOCK or sufficient RLIMIT_MEMLOCK): {err}"
+
-            );
+    trim_allocator("startup complete");
        } else {
            info!("All memory pages locked (mlockall)");
        }
    }
    let addr = consts::SERVER_ADDRESS;
    let listener = tokio::net::TcpListener::bind(addr)
--- a/server-rs/src/routes/tiles.rs
+++ b/server-rs/src/routes/tiles.rs
@ -312,6 +312,7 @@ fn build_style(is_dark: bool, layers: &[serde_json::Value], tile_url: &str) -> s
 pub async fn init_tile_reader(path: &std::path::Path) -> anyhow::Result<TileReader> {
    let backend = FileBackend::open(path)?;
-    let reader = AsyncPmTilesReader::try_from_cached_source(backend, HashMapCache::default()).await?;
+    let reader =
        AsyncPmTilesReader::try_from_cached_source(backend, HashMapCache::default()).await?;
    Ok(reader)
 }