idgf

2026-06-02 20:14:32 +01:00 · 2026-06-02 20:14:32 +01:00 · aab85fe32e
commit aab85fe32e
parent fbfebc651c
33 changed files with 2016 additions and 283 deletions
--- a/pipeline/download/arcgis.py
+++ b/pipeline/download/arcgis.py
@ -4,27 +4,24 @@ import polars as pl
 from pathlib import Path

 from pipeline.local_temp import local_tmp_dir
-from pipeline.utils import download, extract_zip
+from pipeline.utils import code_col_overrides, download, extract_zip

 URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data"


 def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
-    # Classification code columns (ruc21ind, oac11ind, imd20ind) look numeric
-    # in early rows but contain string codes like "UN1" (Unclassified) later
-    # on. Force them to String to avoid mid-stream dtype inference failures.
-    # Note: NSPL renames these year suffixes as new releases roll in (e.g.
-    # Feb 2026 bumped oac from oac21ind → oac11ind, imd from imd19ind →
-    # imd20ind), so keep this dict in sync with the current CSV headers —
-    # polars silently ignores overrides for missing columns, masking drift.
+    # Classification code columns (e.g. ruc21ind, oac11ind, imd20ind) look
+    # numeric in early rows but contain string codes like "UN1" (Unclassified)
+    # later on. Force them to String to avoid mid-stream dtype inference
+    # failures. NSPL renames these year suffixes each release, and polars
+    # silently ignores overrides for missing columns, so match on the
+    # suffix-free stem (read from the header) rather than hard-coding suffixes.
+    csv_path = data_path / "Data/NSPL_FEB_2026_UK.csv"
+    names = pl.scan_csv(csv_path).collect_schema().names()
    df = pl.scan_csv(
-        data_path / "Data/NSPL_FEB_2026_UK.csv",
+        csv_path,
        try_parse_dates=True,
-        schema_overrides={
-            "ruc21ind": pl.String,
-            "oac11ind": pl.String,
-            "imd20ind": pl.String,
-        },
+        schema_overrides=code_col_overrides(names),
    )
    print(f"Columns: {df.collect_schema().names()}")
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
--- a/pipeline/download/median_age.py
+++ b/pipeline/download/median_age.py
@ -43,6 +43,35 @@ AGE_BANDS = [
    (85, 5),  # Aged 85 years and over
 ]

+# Canonical NOMIS TS007A (C2021_AGE_19_NAME) band labels, in the SAME order as
+# AGE_BANDS. Index i here corresponds to AGE_BANDS[i]; we validate the pivot
+# output against this set and use it (not positional string parsing) to order
+# the columns, so a stray/relabelled/missing band fails loudly instead of
+# silently mis-aligning counts against the wrong lower bound.
+EXPECTED_BAND_NAMES = [
+    "Aged 0 to 4 years",
+    "Aged 5 to 9 years",
+    "Aged 10 to 14 years",
+    "Aged 15 to 19 years",
+    "Aged 20 to 24 years",
+    "Aged 25 to 29 years",
+    "Aged 30 to 34 years",
+    "Aged 35 to 39 years",
+    "Aged 40 to 44 years",
+    "Aged 45 to 49 years",
+    "Aged 50 to 54 years",
+    "Aged 55 to 59 years",
+    "Aged 60 to 64 years",
+    "Aged 65 to 69 years",
+    "Aged 70 to 74 years",
+    "Aged 75 to 79 years",
+    "Aged 80 to 84 years",
+    "Aged 85 years and over",
+]
+assert len(EXPECTED_BAND_NAMES) == len(AGE_BANDS), (
+    "EXPECTED_BAND_NAMES and AGE_BANDS must stay aligned 1:1"
+)
+

 def compute_median_age(counts: list[int]) -> float:
    """Compute median age from five-year band counts using linear interpolation."""
@ -62,6 +91,53 @@ def compute_median_age(counts: list[int]) -> float:
    return float("nan")


+def _bands_to_median_table(pivoted: pl.DataFrame) -> pl.DataFrame:
+    """Validate the pivoted age-band columns, then compute median age per LSOA.
+
+    The pivot must contain exactly the canonical NOMIS TS007A bands; a
+    missing/extra/relabelled band would otherwise silently mis-align counts
+    against the wrong AGE_BANDS lower bound, so we fail loudly instead.
+    """
+    # Validate the pivoted age-band columns against the canonical NOMIS set
+    # BEFORE computing anything.
+    band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
+    found = set(band_cols)
+    expected = set(EXPECTED_BAND_NAMES)
+    if found != expected:
+        missing = sorted(expected - found)
+        unexpected = sorted(found - expected)
+        raise ValueError(
+            "Census age-band columns do not match the expected NOMIS TS007A bands.\n"
+            f"  expected {len(EXPECTED_BAND_NAMES)} bands, found {len(band_cols)}\n"
+            f"  missing:    {missing}\n"
+            f"  unexpected: {unexpected}\n"
+            "Refusing to compute medians against misaligned bands."
+        )
+
+    # Use the canonical order (guaranteed aligned with AGE_BANDS), not positional
+    # string parsing, and treat a null band (zero-population) as 0 rather than
+    # crashing on sum().
+    band_cols = list(EXPECTED_BAND_NAMES)
+    pivoted = pivoted.with_columns(pl.col(band_cols).fill_null(0))
+
+    print(f"Age bands found: {len(band_cols)}")
+    print(f"  First: {band_cols[0]}")
+    print(f"  Last:  {band_cols[-1]}")
+
+    rows = pivoted.select("GEOGRAPHY_CODE", *band_cols).to_dicts()
+    medians = []
+    for row in rows:
+        counts = [row[col] for col in band_cols]
+        median = compute_median_age(counts)
+        medians.append(
+            {"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)}
+        )
+
+    return pl.DataFrame(medians).with_columns(
+        pl.col("median_age").cast(pl.Float32),
+    )
+
+
 def download_and_convert(output_path: Path) -> None:
    print("Downloading Census 2021 age by five-year bands from NOMIS...")
    frames = []
@ -94,29 +170,7 @@ def download_and_convert(output_path: Path) -> None:
        values="OBS_VALUE",
    )

-    # Extract age band columns in order and compute median
-    # NOMIS returns band names like "Aged 0 to 4 years", "Aged 85 years and over"
-    band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
-    # Sort by the lower bound of each band
-    band_cols.sort(key=lambda c: int(c.split()[1]))
-
-    print(f"Age bands found: {len(band_cols)}")
-    print(f"  First: {band_cols[0]}")
-    print(f"  Last:  {band_cols[-1]}")
-
-    # Compute median age per LSOA
-    rows = pivoted.select("GEOGRAPHY_CODE", *band_cols).to_dicts()
-    medians = []
-    for row in rows:
-        counts = [row[col] for col in band_cols]
-        median = compute_median_age(counts)
-        medians.append(
-            {"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)}
-        )
-
-    result = pl.DataFrame(medians).with_columns(
-        pl.col("median_age").cast(pl.Float32),
-    )
+    result = _bands_to_median_table(pivoted)

    print(f"England LSOAs: {result.height}")
    print(
--- a/pipeline/download/noise.py
+++ b/pipeline/download/noise.py
@ -83,11 +83,32 @@ NATIVE_RESOLUTION = 10
 # Request pixel resolution in metres.
 RESOLUTION = NATIVE_RESOLUTION

+# Defra encodes TRUE "no data" with this sentinel (NOT 0.0). A 0.0 cell that is
+# otherwise inside the raster means "modelled below the lowest reporting band",
+# i.e. genuinely quiet — see noise_overlay_tiles.py:167.
+NOISE_NODATA_SENTINEL = np.float32(-96.0)
+
+# Lowest modelled Defra Lden reporting band (dB). Verified against the actual
+# rasters: the minimum positive in-coverage value is 40.0 dB with NO values in
+# (0, 40) — below the band, cells are encoded as 0.0 (genuinely quiet). We floor
+# in-coverage cells to 40.0 so a below-band 0.0 surfaces as "we know it's quiet"
+# (~40 dB) instead of collapsing to null ("we don't know"), WITHOUT inflating the
+# ~35% of genuine 40-44.99 dB readings that a 45.0 floor would wrongly bump to 45.
+# NB: 45.0 is the overlay's lowest *paint* stop (noise_overlay_tiles.
+# NOISE_COLOR_STOPS[0]) — a rendering threshold, not the data's reporting floor.
+NOISE_QUIET_FLOOR_DB = np.float32(40.0)
+
 # The pipeline has postcode representative points rather than complete unit
 # polygons here. Use a small local footprint and take the maximum 10m cell so
 # postcode-level noise is not understated by centroid rounding.
 POSTCODE_NOISE_RADIUS_M = 50

+# Adjacent download tiles must overlap by at least the sampling radius so every
+# cell in a postcode's 50m max-window shares a tile with that postcode (per-tile
+# maxima are merged afterwards). Without this, a loud pixel just across a seam is
+# invisible to a postcode on the far side, under-reporting noise near seams.
+TILE_OVERLAP_M = POSTCODE_NOISE_RADIUS_M
+
 # Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
 # intermittently return 504s; smaller fallback requests usually succeed.
 MAX_RETRIES = 3
@ -287,6 +308,31 @@ def _download_tile(
    return [], [(min_e, min_n, max_e, max_n)]


+def _generate_tiles(
+    min_e: int,
+    max_e: int,
+    min_n: int,
+    max_n: int,
+    tile_size: int,
+    overlap_m: int,
+    step: int,
+) -> list[Tile]:
+    """Generate download tile bboxes stepping by ``step`` but extending each
+    tile's far edge by ``overlap_m`` so neighbours overlap.
+
+    With ``overlap_m`` >= POSTCODE_NOISE_RADIUS_M, every cell in a postcode's
+    sampling window shares at least one tile with that postcode, so a loud pixel
+    near a seam is never lost (the sampler merges per-tile maxima across tiles).
+    """
+    tiles: list[Tile] = []
+    for tile_min_e in range(min_e, max_e, step):
+        for tile_min_n in range(min_n, max_n, step):
+            tile_max_e = min(tile_min_e + tile_size + overlap_m, BNG_MAX_E)
+            tile_max_n = min(tile_min_n + tile_size + overlap_m, BNG_MAX_N)
+            tiles.append((tile_min_e, tile_min_n, tile_max_e, tile_max_n))
+    return tiles
+
+
 def download_raster(
    tile_dir: Path,
    wcs_base: str,
@ -296,12 +342,9 @@ def download_raster(
    allow_missing_tiles: bool = False,
 ) -> list[Path]:
    """Download noise GeoTIFF raster covering England, returning paths to saved files."""
-    tiles = []
-    for min_e in range(BNG_MIN_E, BNG_MAX_E, TILE_SIZE):
-        for min_n in range(BNG_MIN_N, BNG_MAX_N, TILE_SIZE):
-            max_e = min(min_e + TILE_SIZE, BNG_MAX_E)
-            max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
-            tiles.append((min_e, min_n, max_e, max_n))
+    tiles = _generate_tiles(
+        BNG_MIN_E, BNG_MAX_E, BNG_MIN_N, BNG_MAX_N, TILE_SIZE, TILE_OVERLAP_M, TILE_SIZE
+    )

    print(
        f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
@ -385,14 +428,23 @@ def sample_noise_at_postcodes(
            if len(candidate_indices) == 0:
                continue

+            # Defra rasters encode TRUE nodata as the -96.0 sentinel (and
+            # occasionally non-finite / dataset.nodata); genuinely quiet ground
+            # below the model's lowest reporting band is encoded as 0.0. Only
+            # the former is "we don't know" — the latter is a real "we know it's
+            # quiet" reading and must not collapse to null. So treat ONLY true
+            # nodata as -inf (it never wins a max and never counts as coverage),
+            # and clamp every in-coverage cell up to NOISE_QUIET_FLOOR_DB so a
+            # below-threshold 0.0 surfaces as the documented quiet floor.
            grid = dataset.read(1).astype(np.float32, copy=False)
-            invalid = ~np.isfinite(grid) | (grid == 0)
+            nodata = ~np.isfinite(grid) | np.isclose(
+                grid, NOISE_NODATA_SENTINEL, rtol=1e-5, atol=1e-5
+            )
            if dataset.nodata is not None:
-                invalid |= np.isclose(
+                nodata |= np.isclose(
                    grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5
                )
-            grid = grid.copy()
-            grid[invalid] = -np.inf
+            grid = np.where(nodata, -np.inf, np.maximum(grid, NOISE_QUIET_FLOOR_DB))
            if filter_size > 1:
                grid = maximum_filter(
                    grid, size=filter_size, mode="constant", cval=-np.inf
@ -412,12 +464,15 @@ def sample_noise_at_postcodes(

            sampled_indices = candidate_indices[in_bounds]
            sampled = grid[rows[in_bounds], cols[in_bounds]]
-            valid = sampled != -np.inf
-            if not np.any(valid):
+            # A finite sample means at least one in-coverage cell sat in the
+            # window (quiet -> floor, or louder). -inf means the whole window was
+            # true nodata, so the postcode stays uncovered (null) for this tile.
+            covered = np.isfinite(sampled)
+            if not np.any(covered):
                continue

-            sampled_indices = sampled_indices[valid]
-            sampled = sampled[valid]
+            sampled_indices = sampled_indices[covered]
+            sampled = sampled[covered]
            existing = noise_db[sampled_indices]
            noise_db[sampled_indices] = np.where(
                np.isnan(existing), sampled, np.maximum(existing, sampled)
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@ -84,6 +84,38 @@ LONDON_COUNTY_CODES = {"E13000001", "E13000002"}
 DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000
 WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)

+# England British National Grid (EPSG:27700) bounding box, with margin. ONS NSPL stores
+# postcodes that have no grid reference at the out-of-range sentinel lat=99.999999,
+# long=0.000000, whose paired easting/northing collapse to the grid origin (0, 0) (or
+# inf). Requiring coordinates inside this box drops the sentinel from every index, so an
+# active postcode lacking a grid ref can never become a false nearest neighbour.
+ENGLAND_BNG_MIN_EAST = 50_000.0
+ENGLAND_BNG_MAX_EAST = 660_000.0
+ENGLAND_BNG_MIN_NORTH = 0.0
+ENGLAND_BNG_MAX_NORTH = 660_000.0
+
+
+def _valid_wgs84_expr() -> pl.Expr:
+    """Rows with a real lat/long inside England (drops the ONS lat=99.999999, long=0.0
+    no-grid-reference sentinel and any nulls), so they never enter a coordinate index."""
+    return (
+        pl.col("lat").is_not_null()
+        & pl.col("long").is_not_null()
+        & pl.col("lat").is_between(ENGLAND_BBOX_SOUTH, ENGLAND_BBOX_NORTH)
+        & pl.col("long").is_between(ENGLAND_BBOX_WEST, ENGLAND_BBOX_EAST)
+    )
+
+
+def _valid_bng_expr() -> pl.Expr:
+    """Rows with a real easting/northing inside England (drops the (0, 0) grid-origin /
+    inf paired with the ONS no-grid-reference sentinel and any nulls)."""
+    return (
+        pl.col("east1m").is_not_null()
+        & pl.col("north1m").is_not_null()
+        & pl.col("east1m").is_between(ENGLAND_BNG_MIN_EAST, ENGLAND_BNG_MAX_EAST)
+        & pl.col("north1m").is_between(ENGLAND_BNG_MIN_NORTH, ENGLAND_BNG_MAX_NORTH)
+    )
+
 # Suffixes to strip from raw station names before appending the typed suffix.
 _STATION_STRIP = (
    " tube station",
@ -303,7 +335,7 @@ def _outcode_tree(postcodes_path: Path) -> tuple[cKDTree, list[str]]:
            postcodes_path, columns=["pcds", "lat", "long", "ctry25cd", "doterm"]
        )
        .filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
-        .filter(pl.col("lat").is_not_null() & pl.col("long").is_not_null())
+        .filter(_valid_wgs84_expr())
    )
    coords = np.column_stack(
        [df["lat"].to_numpy().astype(np.float64), df["long"].to_numpy().astype(np.float64)]
@ -359,12 +391,22 @@ def _build_street_places(
    return sorted(places, key=lambda place: place["name"].lower())


+def _poi_dedup_key(name: str, place_type: str, lat: float, lon: float) -> tuple:
+    """Geographic de-dup key: round(.,2) is ~1.1km lat / ~0.7km UK lon.
+
+    Collapses the SAME physical POI mapped twice a few metres apart (unless the
+    pair straddles a 0.01-degree rounding boundary) yet keeps genuinely distinct
+    same-named POIs in different towns (e.g. "Victoria Park" in London vs Bristol).
+    """
+    return (name.lower(), place_type, round(lat, 2), round(lon, 2))
+
+
 def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
-    """Map high-value named POIs onto gazetteer place rows (M), de-duplicated by (name, type)."""
+    """Map high-value named POIs onto gazetteer place rows (M), de-duplicated by (name, type, coords)."""
    if pois.is_empty():
        return []

-    seen: set[tuple[str, str]] = set()
+    seen: set[tuple] = set()
    places: list[dict] = []
    for row in pois.iter_rows(named=True):
        place_type = HIGH_VALUE_POI_CATEGORIES.get(str(row.get("category", "")))
@ -373,7 +415,9 @@ def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
        name = str(row.get("name") or "").strip()
        if len(name) < 3:
            continue
-        key = (name.lower(), place_type)
+        lat = float(row["lat"])
+        lon = float(row["lng"])
+        key = _poi_dedup_key(name, place_type, lat, lon)
        if key in seen:
            continue
        seen.add(key)
@ -381,8 +425,8 @@ def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
            {
                "name": name,
                "place_type": place_type,
-                "lat": float(row["lat"]),
-                "lon": float(row["lng"]),
+                "lat": lat,
+                "lon": lon,
                "population": 0,
                "travel_destination": False,
                "display_city": None,
@ -395,11 +439,16 @@ def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
    pois = pl.read_parquet(pois_path, columns=["name", "category", "lat", "lng"])
    new_places = _pois_to_places(pois)
    existing = {
-        (str(place["name"]).lower(), place["place_type"]) for place in places
+        _poi_dedup_key(
+            str(place["name"]), place["place_type"], place["lat"], place["lon"]
+        )
+        for place in places
    }
    added = 0
    for place in new_places:
-        key = (place["name"].lower(), place["place_type"])
+        key = _poi_dedup_key(
+            place["name"], place["place_type"], place["lat"], place["lon"]
+        )
        if key in existing:
            continue
        places.append(place)
@ -409,10 +458,14 @@ def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:


 def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
-    df = pl.read_parquet(
-        postcodes_path,
-        columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
-    ).filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
+    df = (
+        pl.read_parquet(
+            postcodes_path,
+            columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
+        )
+        .filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
+        .filter(_valid_wgs84_expr())
+    )
    return {
        _normalize_postcode(postcode): (float(lat), float(lon))
        for postcode, lat, lon in df.select(["pcds", "lat", "long"]).iter_rows()
@ -470,7 +523,7 @@ def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
        .filter(
            (pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null()
        )
-        .filter(pl.col("east1m").is_not_null() & pl.col("north1m").is_not_null())
+        .filter(_valid_bng_expr())
        .with_columns(_is_london_admin_expr().alias("is_london"))
        .select("east1m", "north1m", "is_london")
    )
--- a/pipeline/download/test_median_age.py
+++ b/pipeline/download/test_median_age.py
@ -0,0 +1,75 @@
+import math
+
+import polars as pl
+import pytest
+
+from pipeline.download import median_age
+from pipeline.download.median_age import (
+    AGE_BANDS,
+    EXPECTED_BAND_NAMES,
+    compute_median_age,
+)
+
+
+def test_expected_band_names_align_with_age_bands():
+    assert len(EXPECTED_BAND_NAMES) == len(AGE_BANDS)
+
+
+def test_compute_median_age_interpolates_within_median_band():
+    # All weight in the 30-34 band -> median is the band midpoint via linear
+    # interpolation: 30 + ((50 - 0) / 100) * 5 = 32.5.
+    counts = [0] * len(AGE_BANDS)
+    counts[6] = 100  # "Aged 30 to 34 years"
+    assert compute_median_age(counts) == pytest.approx(32.5)
+
+    # 50 below the median band, 100 inside the 35-39 band holding the median.
+    # half = 75; cumulative before band 7 = 50; 35 + ((75 - 50) / 100) * 5 = 36.25.
+    counts = [0] * len(AGE_BANDS)
+    counts[0] = 50  # below the median band
+    counts[7] = 100  # "Aged 35 to 39 years" holds the median
+    assert compute_median_age(counts) == pytest.approx(36.25)
+
+
+def test_compute_median_age_empty_lsoa_is_nan():
+    assert math.isnan(compute_median_age([0] * len(AGE_BANDS)))
+
+
+def _pivoted(band_to_counts: dict[str, list]) -> pl.DataFrame:
+    """Build a pivot-shaped frame: GEOGRAPHY_CODE + one column per band."""
+    n = len(next(iter(band_to_counts.values())))
+    data = {"GEOGRAPHY_CODE": [f"E0100000{i}" for i in range(n)]}
+    data.update(band_to_counts)
+    return pl.DataFrame(data)
+
+
+def test_null_band_count_is_treated_as_zero_not_crash():
+    # One LSOA has a null in the 85+ band (NOMIS can return null for a band with
+    # zero people). It must be coerced to 0, not raise TypeError in sum(). With
+    # all 100 people in the 30-34 band the median is the band midpoint, 32.5.
+    counts_by_band = {name: [0] for name in EXPECTED_BAND_NAMES}
+    counts_by_band["Aged 30 to 34 years"] = [100]
+    counts_by_band["Aged 85 years and over"] = [None]
+    pivoted = _pivoted(counts_by_band)
+
+    table = median_age._bands_to_median_table(pivoted)
+
+    assert table.height == 1
+    assert table["median_age"][0] == pytest.approx(32.5)
+
+
+def test_missing_band_raises_clear_error():
+    counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES}
+    del counts_by_band["Aged 85 years and over"]
+    pivoted = _pivoted(counts_by_band)
+
+    with pytest.raises(ValueError, match=r"do not match the expected NOMIS"):
+        median_age._bands_to_median_table(pivoted)
+
+
+def test_relabelled_band_raises_clear_error():
+    counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES}
+    counts_by_band["Total"] = counts_by_band.pop("Aged 85 years and over")
+    pivoted = _pivoted(counts_by_band)
+
+    with pytest.raises(ValueError, match=r"unexpected:"):
+        median_age._bands_to_median_table(pivoted)
--- a/pipeline/download/test_noise.py
+++ b/pipeline/download/test_noise.py
@ -125,6 +125,205 @@ def test_download_raster_raises_on_missing_strict_tiles(monkeypatch, tmp_path):
        noise.download_raster(tmp_path, "base", "coverage", "Road")


+def test_generate_tiles_neighbours_overlap_by_radius():
+    tile_size = 20_000
+    overlap = noise.POSTCODE_NOISE_RADIUS_M
+    tiles = noise._generate_tiles(
+        0, 60_000, 0, 60_000, tile_size, overlap, tile_size
+    )
+
+    by_origin = {(min_e, min_n): (max_e, max_n) for min_e, min_n, max_e, max_n in tiles}
+
+    # Horizontally adjacent tiles must overlap by >= overlap.
+    for (min_e, min_n), (max_e, _max_n) in by_origin.items():
+        right_origin = (min_e + tile_size, min_n)
+        if right_origin in by_origin:
+            assert max_e - right_origin[0] >= overlap
+
+    # Vertically adjacent tiles must overlap by >= overlap.
+    for (min_e, min_n), (_max_e, max_n) in by_origin.items():
+        up_origin = (min_e, min_n + tile_size)
+        if up_origin in by_origin:
+            assert max_n - up_origin[1] >= overlap
+
+
+def test_generate_tiles_clamps_to_grid_extent():
+    tile_size = 20_000
+    overlap = noise.POSTCODE_NOISE_RADIUS_M
+    tiles = noise._generate_tiles(
+        noise.BNG_MAX_E - tile_size,
+        noise.BNG_MAX_E,
+        noise.BNG_MAX_N - tile_size,
+        noise.BNG_MAX_N,
+        tile_size,
+        overlap,
+        tile_size,
+    )
+    # The final (top-right) tile cannot extend past the England extent even
+    # though the overlap would otherwise push it beyond.
+    for _min_e, _min_n, max_e, max_n in tiles:
+        assert max_e <= noise.BNG_MAX_E
+        assert max_n <= noise.BNG_MAX_N
+
+
+def _write_geotiff(path, data, left, top, resolution, nodata):
+    with rasterio.open(
+        path,
+        "w",
+        driver="GTiff",
+        height=data.shape[0],
+        width=data.shape[1],
+        count=1,
+        dtype=data.dtype,
+        crs="EPSG:27700",
+        transform=from_origin(left, top, resolution, resolution),
+        nodata=nodata,
+    ) as dataset:
+        dataset.write(data, 1)
+
+
+def test_sample_noise_recovers_value_across_overlapping_seam(monkeypatch, tmp_path):
+    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 50)
+    monkeypatch.setattr(noise, "RESOLUTION", 10)
+
+    # Two download tiles share a vertical seam at easting=100. _generate_tiles
+    # decides their real footprints: with the overlap fix the LEFT tile extends
+    # past the seam by POSTCODE_NOISE_RADIUS_M and thus covers a loud cell that
+    # physically sits just across the seam.
+    tile_size = 100
+    overlap = noise.POSTCODE_NOISE_RADIUS_M
+    tiles = noise._generate_tiles(0, 200, 0, 100, tile_size, overlap, tile_size)
+    by_origin = {
+        (min_e, min_n): (max_e, max_n) for min_e, min_n, max_e, max_n in tiles
+    }
+    left_min_e, left_min_n = 0, 0
+    left_max_e, left_max_n = by_origin[(left_min_e, left_min_n)]
+    # Overlap fix is what makes the left tile reach across the seam.
+    assert left_max_e > 100
+
+    # The loud 70 dB cell centre is at easting 105 (just across the seam) and
+    # the postcode point is at easting 75 in the left tile, within 50m of it.
+    res = noise.RESOLUTION
+    width = int((left_max_e - left_min_e) // res)
+    height = int((left_max_n - left_min_n) // res)
+    left_data = np.zeros((height, width), dtype=np.float32)
+    loud_row = height - 1 - int((25 - left_min_n) // res)  # northing ~25
+    loud_col = int((105 - left_min_e) // res)  # easting ~105
+    left_data[loud_row, loud_col] = 70.0
+    _write_geotiff(
+        tmp_path / "left.tif", left_data, left_min_e, left_max_n, res, nodata=0
+    )
+
+    # The right tile holds the same loud cell but the postcode point is NOT
+    # inside it, so without overlap the value would be lost for that point.
+    right_min_e, right_min_n = 100, 0
+    right_max_e, right_max_n = by_origin[(right_min_e, right_min_n)]
+    rwidth = int((right_max_e - right_min_e) // res)
+    rheight = int((right_max_n - right_min_n) // res)
+    right_data = np.zeros((rheight, rwidth), dtype=np.float32)
+    right_data[rheight - 1 - int((25 - right_min_n) // res), 0] = 70.0
+    _write_geotiff(
+        tmp_path / "right.tif", right_data, right_min_e, right_max_n, res, nodata=0
+    )
+
+    result = noise.sample_noise_at_postcodes(
+        [tmp_path / "left.tif", tmp_path / "right.tif"],
+        easting=np.array([75.0]),
+        northing=np.array([25.0]),
+        label="Road",
+        col_name="road_noise_lden_db",
+    )
+
+    assert result.to_list() == [70.0]
+
+
+def test_sample_noise_distinguishes_nodata_from_in_coverage_quiet(
+    monkeypatch, tmp_path
+):
+    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 0)
+    monkeypatch.setattr(noise, "RESOLUTION", 10)
+
+    # Defra encodes TRUE nodata as the -96.0 sentinel; genuinely quiet ground
+    # below the lowest reporting band is 0.0. With a 0m radius each postcode
+    # reads exactly one cell, so we can pin behaviour per cell:
+    #   -96.0 sentinel  -> null  ("we don't know")
+    #   0.0 in-coverage -> NOISE_QUIET_FLOOR_DB ("we know it's quiet")
+    #   65.0            -> 65.0  (a real modelled reading)
+    data = np.array(
+        [
+            [-96.0, 0.0, 65.0],
+        ],
+        dtype=np.float32,
+    )
+    _write_geotiff(tmp_path / "noise.tif", data, 0, 10, 10, nodata=-96.0)
+
+    result = noise.sample_noise_at_postcodes(
+        [tmp_path / "noise.tif"],
+        # Cell centres at easting 5 (nodata), 15 (quiet 0.0), 25 (loud 65).
+        easting=np.array([5.0, 15.0, 25.0]),
+        northing=np.array([5.0, 5.0, 5.0]),
+        label="Road",
+        col_name="road_noise_lden_db",
+    )
+
+    assert result.to_list() == [None, float(noise.NOISE_QUIET_FLOOR_DB), 65.0]
+
+
+def test_sample_noise_preserves_genuine_reading_above_quiet_floor(monkeypatch, tmp_path):
+    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 0)
+    monkeypatch.setattr(noise, "RESOLUTION", 10)
+
+    # The lowest Defra reporting band is 40.0 dB; genuine readings populate
+    # [40, ~80]. A genuine in-coverage reading at or just above the floor must be
+    # PRESERVED, not clamped UP to the floor — only true-quiet 0.0 is floored. A
+    # quiet floor set too high (e.g. 45) would inflate the ~35% of real 40-44.99
+    # dB readings; this pins that they survive unchanged.
+    floor = float(noise.NOISE_QUIET_FLOOR_DB)
+    data = np.array(
+        [
+            [42.0, floor, 0.0],
+        ],
+        dtype=np.float32,
+    )
+    _write_geotiff(tmp_path / "noise.tif", data, 0, 10, 10, nodata=-96.0)
+
+    result = noise.sample_noise_at_postcodes(
+        [tmp_path / "noise.tif"],
+        # Cell centres at easting 5 (42 dB), 15 (floor), 25 (quiet 0.0).
+        easting=np.array([5.0, 15.0, 25.0]),
+        northing=np.array([5.0, 5.0, 5.0]),
+        label="Road",
+        col_name="road_noise_lden_db",
+    )
+
+    # 42 preserved (NOT raised to the floor), floor preserved, 0.0 -> floor.
+    assert result.to_list() == [42.0, floor, floor]
+    # The floor must sit at/below the lowest genuine reading so nothing inflates.
+    assert floor <= 42.0
+
+
+def test_sample_noise_nodata_window_stays_null(monkeypatch, tmp_path):
+    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
+    monkeypatch.setattr(noise, "RESOLUTION", 10)
+
+    # A postcode whose entire 3x3 max-window is the -96.0 sentinel must remain
+    # null: no in-coverage cell was read, so "quiet" must NOT be inferred.
+    data = np.full((5, 5), -96.0, dtype=np.float32)
+    data[4, 4] = 70.0  # one loud cell, far from the nodata corner
+    _write_geotiff(tmp_path / "noise.tif", data, 0, 50, 10, nodata=-96.0)
+
+    result = noise.sample_noise_at_postcodes(
+        [tmp_path / "noise.tif"],
+        # Top-left point: its 3x3 window is cells (rows 0-1, cols 0-1) = all -96.
+        easting=np.array([5.0]),
+        northing=np.array([45.0]),
+        label="Road",
+        col_name="road_noise_lden_db",
+    )
+
+    assert result.to_list() == [None]
+
+
 def test_sample_noise_at_postcodes_uses_local_maximum(monkeypatch, tmp_path):
    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
    monkeypatch.setattr(noise, "RESOLUTION", 10)
--- a/pipeline/download/test_places.py
+++ b/pipeline/download/test_places.py
@ -9,10 +9,12 @@ from pipeline.download.places import (
    _display_city_from_tags,
    _is_dlr_station,
    _is_tram_station,
+    _london_postcode_tree,
    _naptan_dlr_stations,
    _normalize_street_name,
    _ofs_universities,
    _outcode_of_postcode,
+    _outcode_tree,
    _pois_to_places,
    _select_university_name,
    _station_display_name,
@ -242,6 +244,42 @@ def test_pois_to_places_keeps_high_value_named_pois_only():
    assert all(place["travel_destination"] is False for place in places)


+def test_pois_to_places_keeps_distinct_same_named_pois():
+    # Two genuinely distinct POIs sharing a name, far apart (London vs Bristol).
+    pois = pl.DataFrame(
+        {
+            "name": ["Victoria Park", "Victoria Park"],
+            "category": ["leisure/park", "leisure/park"],
+            "lat": [51.54, 51.46],
+            "lng": [-0.04, -2.60],
+        }
+    )
+
+    places = _pois_to_places(pois)
+
+    assert len(places) == 2
+    assert {(place["lat"], place["lon"]) for place in places} == {
+        (51.54, -0.04),
+        (51.46, -2.60),
+    }
+
+
+def test_pois_to_places_still_dedupes_colocated():
+    # The same physical POI mapped twice a few metres apart collapses to one.
+    pois = pl.DataFrame(
+        {
+            "name": ["Victoria Park", "Victoria Park"],
+            "category": ["leisure/park", "leisure/park"],
+            "lat": [51.5400, 51.5401],
+            "lng": [-0.0400, -0.0399],
+        }
+    )
+
+    places = _pois_to_places(pois)
+
+    assert len(places) == 1
+
+
 def test_display_city_from_tags_uses_explicit_london_context():
    assert _display_city_from_tags({"is_in": "Croydon, London, UK"}) == "London"
    assert _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"}) is None
@ -290,3 +328,52 @@ def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path)

    assert assigned == 2
    assert [place["display_city"] for place in places] == ["London", "London", None]
+
+
+def test_no_grid_reference_sentinel_is_excluded_from_coordinate_trees(tmp_path):
+    # ONS NSPL gives postcodes with no grid reference a sentinel position
+    # (lat=99.999999, long=0.0) paired with BNG easting/northing (0, 0).
+    # Such an active postcode must never enter the nearest-neighbour indexes.
+    sentinel = {
+        "pcds": "ZZ99 9ZZ",
+        "lat": 99.999999,
+        "long": 0.0,
+        "doterm": None,
+        "ctry25cd": "E92000001",
+        "east1m": 0,
+        "north1m": 0,
+        "rgn25cd": "E12000007",
+        "lad25cd": "E09000008",
+        "cty25cd": "E13000002",
+    }
+    croydon_easting, croydon_northing = WGS84_TO_BNG.transform(-0.101793, 51.371273)  # args are (long, lat)
+    real = {
+        "pcds": "CR0 1SZ",
+        "lat": 51.371273,
+        "long": -0.101793,
+        "doterm": None,
+        "ctry25cd": "E92000001",
+        "east1m": int(round(croydon_easting)),
+        "north1m": int(round(croydon_northing)),
+        "rgn25cd": "E12000007",
+        "lad25cd": "E09000008",
+        "cty25cd": "E13000002",
+    }
+    postcodes = tmp_path / "postcodes.parquet"
+    pl.DataFrame([sentinel, real]).write_parquet(postcodes)
+
+    # lat/long outcode tree: only the real postcode is indexed, so a query at the
+    # Croydon coordinates can only resolve to the real row's outcode, "CR0".
+    tree, outcodes = _outcode_tree(postcodes)
+    assert tree.n == 1
+    assert outcodes == ["CR0"]
+    _, idx = tree.query([[51.371273, -0.101793]])  # query point given as (lat, long)
+    assert outcodes[idx[0]] == "CR0"
+
+    # BNG London tree: only the real postcode is indexed, so the (0, 0) origin can
+    # never be the nearest neighbour of a real place.
+    bng_tree, london_flags = _london_postcode_tree(postcodes)
+    assert bng_tree.n == 1
+    assert london_flags.tolist() == [True]
+    _, bng_idx = bng_tree.query([[croydon_easting, croydon_northing]])
+    assert bng_idx[0] == 0  # the only indexed point; exclusion itself is proven by .n == 1
--- a/pipeline/download/uprn_lookup.py
+++ b/pipeline/download/uprn_lookup.py
@ -14,7 +14,7 @@ from pathlib import Path
 import polars as pl

 from pipeline.local_temp import local_tmp_dir
-from pipeline.utils import download, extract_zip
+from pipeline.utils import code_col_overrides, download, extract_zip

 URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"

@ -34,16 +34,16 @@ def find_csvs(extract_path: Path) -> list[Path]:

 def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
    # Some regional files infer different types for the same column (e.g.
-    # ruc21ind is String in most but Int64 in YH). Read all code columns as
-    # String to avoid schema mismatches.
-    CODE_COLS = {
-        "ruc21ind": pl.String,
-        "oac21ind": pl.String,
-        "imd19ind": pl.String,
-    }
+    # ruc21ind is String in most but Int64 in YH), and string codes like "UN1"
+    # appear deep in the data. Read all classification-index code columns as
+    # String to avoid schema mismatches. NSUL renames the year suffixes each
+    # release and polars silently ignores overrides for missing columns, so
+    # match on the suffix-free stem (from the header) rather than hard-coding.
+    names = pl.scan_csv(csv_paths[0]).collect_schema().names()
+    code_cols = code_col_overrides(names)
    df = pl.concat(
        [
-            pl.scan_csv(p, try_parse_dates=True, schema_overrides=CODE_COLS)
+            pl.scan_csv(p, try_parse_dates=True, schema_overrides=code_cols)
            for p in csv_paths
        ]
    )