LGTM

2026-05-14 08:09:19 +01:00 · 2026-05-14 08:09:19 +01:00 · a4103b0896
commit a4103b0896
parent a8165249a4
64 changed files with 5376 additions and 3832 deletions
--- a/pipeline/download/election_results.py
+++ b/pipeline/download/election_results.py
@ -17,16 +17,17 @@ PARTY_MAP = {
    "Reform UK": "Reform UK",
    "Green Party": "Green",
 }
+PARTY_GROUPS = [
+    "Labour",
+    "Conservative",
+    "Liberal Democrat",
+    "Reform UK",
+    "Green",
+    "Other parties",
+]


-def download_and_convert(output_path: Path) -> None:
-    print("Downloading 2024 General Election results...")
-    response = httpx.get(URL, follow_redirects=True, timeout=60)
-    response.raise_for_status()
-
-    df = pl.read_csv(response.content)
-    print(f"Raw shape: {df.shape}")
-
+def _convert_results(df: pl.DataFrame) -> pl.DataFrame:
    # Filter to England only (constituency codes starting with E14)
    df = df.filter(pl.col("Constituency geographic code").str.starts_with("E14"))

@ -70,9 +71,27 @@ def download_and_convert(output_path: Path) -> None:
    # Rename columns to "% Party" format
    rename_map = {col: f"% {col}" for col in party_pct.columns if col != "pcon"}
    party_pct = party_pct.rename(rename_map)
+    for party in PARTY_GROUPS:
+        col = f"% {party}"
+        if col not in party_pct.columns:
+            party_pct = party_pct.with_columns(pl.lit(0.0).alias(col))
+    party_pct = party_pct.with_columns(
+        [pl.col(f"% {party}").fill_null(0.0) for party in PARTY_GROUPS]
+    )

    # Join turnout with party vote shares
-    result = turnout.join(party_pct, on="pcon", how="left")
+    return turnout.join(party_pct, on="pcon", how="left")
+
+
+def download_and_convert(output_path: Path) -> None:
+    print("Downloading 2024 General Election results...")
+    response = httpx.get(URL, follow_redirects=True, timeout=60)
+    response.raise_for_status()
+
+    df = pl.read_csv(response.content)
+    print(f"Raw shape: {df.shape}")
+
+    result = _convert_results(df)

    print(f"Constituencies: {result.height}")
    print(f"Columns: {result.columns}")
--- a/pipeline/download/ethnicity.py
+++ b/pipeline/download/ethnicity.py
@ -9,15 +9,32 @@ pl.Config.set_tbl_cols(-1)

 URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"

+GEOGRAPHY_CODE_REPLACEMENTS = {
+    # 2023 Cumberland unitary authority
+    "E07000026": "E06000063",  # Allerdale
+    "E07000028": "E06000063",  # Carlisle
+    "E07000029": "E06000063",  # Copeland
+    # 2023 Westmorland and Furness unitary authority
+    "E07000027": "E06000064",  # Barrow-in-Furness
+    "E07000030": "E06000064",  # Eden
+    "E07000031": "E06000064",  # South Lakeland
+    # 2023 North Yorkshire unitary authority
+    "E07000163": "E06000065",  # Craven
+    "E07000164": "E06000065",  # Hambleton
+    "E07000165": "E06000065",  # Harrogate
+    "E07000166": "E06000065",  # Richmondshire
+    "E07000167": "E06000065",  # Ryedale
+    "E07000168": "E06000065",  # Scarborough
+    "E07000169": "E06000065",  # Selby
+    # 2023 Somerset unitary authority
+    "E07000187": "E06000066",  # Mendip
+    "E07000188": "E06000066",  # Sedgemoor
+    "E07000189": "E06000066",  # South Somerset
+    "E07000246": "E06000066",  # Somerset West and Taunton
+}

-def download_and_convert(output_path: Path) -> None:
-    print("Downloading ethnicity data...")
-    response = httpx.get(URL, follow_redirects=True, timeout=60)
-    response.raise_for_status()
-
-    df = pl.read_csv(response.content)
-    print(f"Raw shape: {df.head(100)}")

+def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
    # Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity,
    # then aggregate back to the broad groups plus South Asian / East Asian split.
    detailed = df.filter(
@ -55,11 +72,20 @@ def download_and_convert(output_path: Path) -> None:

    detailed = detailed.with_columns(
        pl.col("Ethnicity").replace_strict(group_map).alias("group"),
+        pl.col("Geography_code")
+        .replace(GEOGRAPHY_CODE_REPLACEMENTS)
+        .alias("output_geography_code"),
+        pl.col("Ethnic Population").cast(pl.Float64, strict=False).alias("_population"),
    )

-    # Sum percentages within each group per local authority (keep full precision)
-    grouped = detailed.group_by("Geography_code", "group").agg(pl.col("Value1").sum())
-    wide = grouped.pivot(on="group", index="Geography_code", values="Value1")
+    # Sum counts, not rounded percentages, so old districts can be safely
+    # recombined into their current unitary authorities.
+    grouped = detailed.group_by("output_geography_code", "group").agg(
+        pl.col("_population").sum()
+    )
+    wide = grouped.pivot(
+        on="group", index="output_geography_code", values="_population"
+    ).rename({"output_geography_code": "Geography_code"})

    # Normalize so each row sums to exactly 100%, then round using largest-remainder
    # method to preserve the sum. Independent rounding of 6 values can drift ±0.3.
@ -89,6 +115,18 @@ def download_and_convert(output_path: Path) -> None:
    # Rename columns to be descriptive
    rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
    wide = wide.rename(rename_map)
+    return wide
+
+
+def download_and_convert(output_path: Path) -> None:
+    print("Downloading ethnicity data...")
+    response = httpx.get(URL, follow_redirects=True, timeout=60)
+    response.raise_for_status()
+
+    df = pl.read_csv(response.content)
+    print(f"Raw shape: {df.head(100)}")
+
+    wide = _ethnicity_percentages(df)

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")
--- a/pipeline/download/noise.py
+++ b/pipeline/download/noise.py
@ -1,11 +1,11 @@
 """Download Defra Round 4 (2022) strategic noise data for England.

 Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via
-WCS, then samples noise values at postcode centroids. Outputs a parquet file
-with postcode-level noise in dB for each source.
+WCS, then samples the local maximum around each postcode representative point.
+Outputs a parquet file with postcode-level noise in dB for each source.

-Uses 100km tiles (~42 per source) to balance request size vs count. The server
-times out on tiles larger than ~150km at 100m resolution.
+Uses smaller 20km tiles at native 10m resolution so values are not understated
+by a single coarse centroid sample.

 Data source: Defra Strategic Noise Mapping Round 4 (2022)
  - Lden = day-evening-night 24h weighted average (the EU standard metric)
@ -17,6 +17,7 @@ endpoint is broken for that coverage).
 """

 import argparse
+import math
 import tempfile
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@ -27,8 +28,8 @@ import numpy as np
 import polars as pl
 import rasterio
 from pyproj import Transformer
-from rasterio.merge import merge
 from rasterio.transform import rowcol
+from scipy.ndimage import maximum_filter

 # Noise sources:
 # (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
@ -67,8 +68,9 @@ BNG_MAX_E = 660_000
 BNG_MIN_N = 0
 BNG_MAX_N = 660_000

-# Tile size in metres (100km balances request size vs count; 300km causes 504s)
-TILE_SIZE = 100_000
+# Tile size in metres. At 10m resolution, 20km tiles are ~4M pixels each,
+# small enough to process one at a time without mosaicking all England.
+TILE_SIZE = 20_000

 # Max concurrent tile downloads
 MAX_WORKERS = 4
@ -76,19 +78,27 @@ MAX_WORKERS = 4
 # Native raster resolution (10m grid)
 NATIVE_RESOLUTION = 10

-# Request pixel resolution in metres (100m is sufficient for postcode-level data
-# and keeps download size ~100x smaller than native 10m)
-RESOLUTION = 100
+# Request pixel resolution in metres.
+RESOLUTION = NATIVE_RESOLUTION
+
+# The pipeline has postcode representative points rather than complete unit
+# polygons here. Use a small local footprint and take the maximum 10m cell so
+# postcode-level noise is not understated by centroid rounding.
+POSTCODE_NOISE_RADIUS_M = 50

 # Retry/split behaviour for slow Defra WCS requests. Some eastern tiles return
 # intermittent 504s (first seen with 100km tiles); smaller fallbacks usually succeed.
 MAX_RETRIES = 3
 RETRY_BACKOFF_SECONDS = 5
-MIN_TILE_SIZE = 25_000
+MIN_TILE_SIZE = 5_000

 type Tile = tuple[int, int, int, int]


+class NoGeoTiffError(RuntimeError):
+    """Raised when WCS returns an XML/HTML exception or another non-raster body."""
+
+
 def _wcs_get_coverage_url(
    wcs_base: str,
    coverage_id: str,
@ -144,8 +154,8 @@ def _fetch_tile_bytes(
    max_e: int,
    max_n: int,
    wcs_version: str = "1.0.0",
-) -> bytes | None:
-    """Fetch one WCS tile. Returns None when the server reports no GeoTIFF."""
+) -> bytes:
+    """Fetch one WCS tile."""
    url = _wcs_get_coverage_url(
        wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
    )
@ -154,7 +164,11 @@ def _fetch_tile_bytes(
        resp.raise_for_status()

    if not _looks_like_tiff(resp):
-        return None
+        content_type = resp.headers.get("content-type", "<missing>")
+        body_preview = resp.text[:200].replace("\n", " ")
+        raise NoGeoTiffError(
+            f"WCS returned non-GeoTIFF response ({content_type}): {body_preview}"
+        )
    return resp.content


@ -200,11 +214,14 @@ def _download_tile(
            content = _fetch_tile_bytes(
                wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
            )
-            if content is None:
-                return [], []
            tile_path.write_bytes(content)
            return [tile_path], []
-        except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
+        except (
+            NoGeoTiffError,
+            httpx.HTTPStatusError,
+            httpx.TimeoutException,
+            httpx.ConnectError,
+        ) as e:
            last_error = e
            if attempt < MAX_RETRIES:
                sleep_for = RETRY_BACKOFF_SECONDS * attempt
@ -323,35 +340,62 @@ def sample_noise_at_postcodes(
    label: str,
    col_name: str,
 ) -> pl.Series:
-    """Sample noise values from merged tiles at given BNG coordinates."""
-    print(f"[{label}] Merging {len(tile_paths)} tiles...")
-    datasets = [rasterio.open(p) for p in tile_paths]
-    raster_nodata = datasets[0].nodata
-    mosaic, mosaic_transform = merge(datasets)
-    for ds in datasets:
-        ds.close()
-
-    noise_grid = mosaic[0]
-
-    print(f"[{label}] Sampling noise values at postcode centroids...")
-    rows, cols = rowcol(mosaic_transform, easting, northing)
-    rows = np.asarray(rows)
-    cols = np.asarray(cols)
-
-    h, w = noise_grid.shape
-    in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
-
+    """Sample max noise values from 10m tiles around postcode representative points."""
+    print(f"[{label}] Sampling max noise values from {len(tile_paths)} tiles...")
    noise_db = np.full(len(easting), np.nan, dtype=np.float32)
-    valid_rows = rows[in_bounds]
-    valid_cols = cols[in_bounds]
-    sampled = noise_grid[valid_rows, valid_cols].astype(np.float32)
+    radius_cells = max(0, math.ceil(POSTCODE_NOISE_RADIUS_M / RESOLUTION))
+    filter_size = radius_cells * 2 + 1

-    # Mark nodata and zero (unmapped areas) as NaN.
-    # Road/rail use nodata=-96, airport uses nodata=3.4e38.
-    if raster_nodata is not None:
-        sampled[np.isclose(sampled, np.float32(raster_nodata), rtol=1e-5)] = np.nan
-    sampled[sampled == 0] = np.nan
-    noise_db[in_bounds] = sampled
+    for path in tile_paths:
+        with rasterio.open(path) as dataset:
+            bounds = dataset.bounds
+            candidate_mask = (
+                (easting >= bounds.left - POSTCODE_NOISE_RADIUS_M)
+                & (easting <= bounds.right + POSTCODE_NOISE_RADIUS_M)
+                & (northing >= bounds.bottom - POSTCODE_NOISE_RADIUS_M)
+                & (northing <= bounds.top + POSTCODE_NOISE_RADIUS_M)
+            )
+            candidate_indices = np.flatnonzero(candidate_mask)
+            if len(candidate_indices) == 0:
+                continue
+
+            grid = dataset.read(1).astype(np.float32, copy=False)
+            invalid = ~np.isfinite(grid) | (grid == 0)
+            if dataset.nodata is not None:
+                invalid |= np.isclose(
+                    grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5
+                )
+            grid = grid.copy()
+            grid[invalid] = -np.inf
+            if filter_size > 1:
+                grid = maximum_filter(
+                    grid, size=filter_size, mode="constant", cval=-np.inf
+                )
+
+            rows, cols = rowcol(
+                dataset.transform,
+                easting[candidate_indices],
+                northing[candidate_indices],
+            )
+            rows = np.asarray(rows)
+            cols = np.asarray(cols)
+            h, w = grid.shape
+            in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
+            if not np.any(in_bounds):
+                continue
+
+            sampled_indices = candidate_indices[in_bounds]
+            sampled = grid[rows[in_bounds], cols[in_bounds]]
+            valid = sampled != -np.inf
+            if not np.any(valid):
+                continue
+
+            sampled_indices = sampled_indices[valid]
+            sampled = sampled[valid]
+            existing = noise_db[sampled_indices]
+            noise_db[sampled_indices] = np.where(
+                np.isnan(existing), sampled, np.maximum(existing, sampled)
+            )

    valid_count = int(np.sum(~np.isnan(noise_db)))
    print(
--- a/pipeline/download/pois.py
+++ b/pipeline/download/pois.py
@ -4,7 +4,9 @@ from tempfile import mkdtemp

 import osmium
 import polars as pl
+from shapely import make_valid
 from shapely.geometry import Point
+from shapely.wkb import loads as load_wkb
 from tqdm import tqdm

 from pipeline.utils.england_geometry import (
@ -31,6 +33,21 @@ POI_TAG_KEYS: list[str] = [
    "tourism",
    "public_transport",
 ]
+AREA_BUILDING_CATEGORIES = {"building/church", "building/university"}
+
+
+def _representative_lat_lon(geom, england_polygon) -> tuple[float, float] | None:
+    if geom.is_empty:
+        return None
+    if not geom.is_valid:
+        geom = make_valid(geom)
+    if geom.is_empty:
+        return None
+    point = geom.representative_point()
+    lat, lon = point.y, point.x
+    if not england_polygon.contains(Point(lon, lat)):
+        return None
+    return lat, lon


 class POIHandler(osmium.SimpleHandler):
@ -42,6 +59,7 @@ class POIHandler(osmium.SimpleHandler):
        self.poi_count = 0
        self._progress = progress
        self._england = england_polygon
+        self._wkb_factory = osmium.geom.WKBFactory()

    def _in_england(self, lat: float, lon: float) -> bool:
        # Fast bbox pre-filter, then precise polygon check
@ -52,8 +70,18 @@ class POIHandler(osmium.SimpleHandler):
            return False
        return self._england.contains(Point(lon, lat))

-    def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
-        return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
+    def _match_tags(
+        self, tags: osmium.osm.TagList, *, polygonal: bool = False
+    ) -> list[str]:
+        categories = [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
+        if not polygonal:
+            return categories
+        return [
+            category
+            for category in categories
+            if not category.startswith("building/")
+            or category in AREA_BUILDING_CATEGORIES
+        ]

    def _get_name(self, tags: osmium.osm.TagList) -> str:
        return tags.get("name:en", tags.get("name", ""))
@ -89,6 +117,13 @@ class POIHandler(osmium.SimpleHandler):
        if len(self._batch) >= BATCH_SIZE:
            self._flush_batch()

+    def _point_from_area(self, area: osmium.osm.Area) -> tuple[float, float] | None:
+        try:
+            geom = load_wkb(self._wkb_factory.create_multipolygon(area), hex=True)
+        except Exception:
+            return None
+        return _representative_lat_lon(geom, self._england)
+
    def _tick(self) -> None:
        self._progress.update(1)

@ -103,6 +138,18 @@ class POIHandler(osmium.SimpleHandler):
        for category in categories:
            self._add_poi(f"n{n.id}", n.tags, category, lat, lon)

+    def area(self, a: osmium.osm.Area) -> None:
+        self._tick()
+        categories = self._match_tags(a.tags, polygonal=True)
+        if not categories:
+            return
+        point = self._point_from_area(a)
+        if point is None:
+            return
+        lat, lon = point
+        for category in categories:
+            self._add_poi(f"a{a.id}", a.tags, category, lat, lon)
+

 def main() -> None:
    parser = argparse.ArgumentParser(
--- a/pipeline/download/rental_prices.py
+++ b/pipeline/download/rental_prices.py
@ -20,21 +20,27 @@ URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/dataset
 # Local authority district codes in England
 LA_PREFIXES = ("E06", "E07", "E08", "E09")

+# ONS PIPR uses newer ONS codes for the 2026 South Yorkshire boundary/code
+# update while IoD 2025 still carries the predecessor codes. Duplicate rows
+# under the IoD codes so downstream joins are complete without inventing rents.
+AREA_CODE_ALIASES = {
+    "E08000038": "E08000016",  # Barnsley
+    "E08000039": "E08000019",  # Sheffield
+}

-def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
-    print("Reading PIPR Excel file (Table 1)...")

+def _latest_rents_long(df: pl.DataFrame) -> pl.DataFrame:
    # Table 1 layout: row 0 = title, row 1 = column headers, row 2+ = data.
    # 40 columns in repeating blocks of 4 (index, monthly change, annual change,
    # rental price) for each category. Rental price columns (0-indexed):
    #   7 = All categories, 11 = One bed, 15 = Two bed, 19 = Three bed,
    #   23 = Four or more bed
-    df = pl.read_excel(xlsx_path, sheet_name="Table 1", has_header=False)
    df = df.slice(2)  # Skip title and header rows

    df = df.select(
        pl.col("column_1").alias("time_period"),
        pl.col("column_2").alias("area_code"),
+        pl.col("column_3").alias("area_name"),
        pl.col("column_12").cast(pl.Float32, strict=False).alias("rent_1bed"),
        pl.col("column_16").cast(pl.Float32, strict=False).alias("rent_2bed"),
        pl.col("column_20").cast(pl.Float32, strict=False).alias("rent_3bed"),
@ -65,12 +71,30 @@ def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
        frames.append(
            df.select(
                pl.col("area_code"),
+                pl.col("area_name"),
                pl.col(col).alias("mean_monthly_rent"),
                pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
            )
        )

    combined = pl.concat(frames)
+    alias_rows = []
+    for source_code, alias_code in AREA_CODE_ALIASES.items():
+        alias_rows.append(
+            combined.filter(pl.col("area_code") == source_code).with_columns(
+                pl.lit(alias_code).alias("area_code")
+            )
+        )
+    if alias_rows:
+        combined = pl.concat([combined, *alias_rows])
+
+    return combined.unique(["area_code", "bedrooms"], keep="first")
+
+
+def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
+    print("Reading PIPR Excel file (Table 1)...")
+    raw = pl.read_excel(xlsx_path, sheet_name="Table 1", has_header=False)
+    combined = _latest_rents_long(raw)

    print(f"Combined: {combined.shape}")
    print(f"Non-null rents: {combined['mean_monthly_rent'].drop_nulls().len()}")
--- a/pipeline/download/test_election_results.py
+++ b/pipeline/download/test_election_results.py
@ -0,0 +1,22 @@
+import polars as pl
+
+from pipeline.download.election_results import _convert_results
+
+
+def test_convert_results_fills_parties_that_did_not_stand_with_zero():
+    raw = pl.DataFrame(
+        {
+            "Constituency geographic code": ["E14000001", "E14000001"],
+            "Main party name": ["Labour", "Conservative"],
+            "Candidate result position": [1, 2],
+            "Election valid vote count": [1000, 1000],
+            "Electorate": [2000, 2000],
+            "Candidate vote count": [600, 400],
+        }
+    )
+
+    result = _convert_results(raw)
+
+    assert result.select("% Labour", "% Conservative", "% Reform UK").to_dicts() == [
+        {"% Labour": 60.0, "% Conservative": 40.0, "% Reform UK": 0.0}
+    ]
--- a/pipeline/download/test_ethnicity.py
+++ b/pipeline/download/test_ethnicity.py
@ -0,0 +1,37 @@
+import polars as pl
+
+from pipeline.download.ethnicity import _ethnicity_percentages
+
+
+def test_ethnicity_percentages_recombines_predecessor_lads_by_population():
+    rows = []
+    for code, white, indian in [
+        ("E07000026", 80, 20),
+        ("E07000028", 10, 90),
+    ]:
+        total = white + indian
+        rows.extend(
+            [
+                {
+                    "Geography_code": code,
+                    "Ethnicity_type": "ONS 2021 19+1",
+                    "Ethnicity": "White British",
+                    "Ethnic Population": white,
+                    "Value1": white / total * 100,
+                },
+                {
+                    "Geography_code": code,
+                    "Ethnicity_type": "ONS 2021 19+1",
+                    "Ethnicity": "Indian",
+                    "Ethnic Population": indian,
+                    "Value1": indian / total * 100,
+                },
+            ]
+        )
+
+    result = _ethnicity_percentages(pl.DataFrame(rows))
+
+    cumberland = result.filter(pl.col("Geography_code") == "E06000063")
+    assert cumberland.select("% White", "% South Asian").to_dicts() == [
+        {"% White": 45.0, "% South Asian": 55.0}
+    ]
--- a/pipeline/download/test_noise.py
+++ b/pipeline/download/test_noise.py
@ -1,5 +1,8 @@
 import httpx
+import numpy as np
 import pytest
+import rasterio
+from rasterio.transform import from_origin

 from pipeline.download import noise

@ -50,6 +53,21 @@ def test_download_tile_reports_unsplittable_failure(monkeypatch, tmp_path):
    assert failures == [(0, 0, 100, 100)]


+def test_download_tile_treats_non_tiff_response_as_failure(monkeypatch, tmp_path):
+    monkeypatch.setattr(noise, "MAX_RETRIES", 1)
+    monkeypatch.setattr(noise, "MIN_TILE_SIZE", 100)
+
+    def fake_fetch_tile_bytes(*args, **kwargs):
+        raise noise.NoGeoTiffError("xml exception")
+
+    monkeypatch.setattr(noise, "_fetch_tile_bytes", fake_fetch_tile_bytes)
+
+    paths, failures = noise._download_tile("base", "coverage", 0, 0, 100, 100, tmp_path)
+
+    assert paths == []
+    assert failures == [(0, 0, 100, 100)]
+
+
 def test_download_raster_tolerates_missing_tiles_when_allowed(monkeypatch, tmp_path):
    monkeypatch.setattr(noise, "BNG_MIN_E", 0)
    monkeypatch.setattr(noise, "BNG_MAX_E", 100)
@ -87,3 +105,42 @@ def test_download_raster_raises_on_missing_strict_tiles(monkeypatch, tmp_path):

    with pytest.raises(RuntimeError, match=r"\[Road\] Failed to download"):
        noise.download_raster(tmp_path, "base", "coverage", "Road")
+
+
+def test_sample_noise_at_postcodes_uses_local_maximum(monkeypatch, tmp_path):
+    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
+    monkeypatch.setattr(noise, "RESOLUTION", 10)
+    tile_path = tmp_path / "noise.tif"
+    data = np.array(
+        [
+            [0, 0, 0, 0, 0],
+            [0, 70, 0, 0, 0],
+            [0, 0, 55, 0, 0],
+            [0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0],
+        ],
+        dtype=np.float32,
+    )
+    with rasterio.open(
+        tile_path,
+        "w",
+        driver="GTiff",
+        height=data.shape[0],
+        width=data.shape[1],
+        count=1,
+        dtype=data.dtype,
+        crs="EPSG:27700",
+        transform=from_origin(0, 50, 10, 10),
+        nodata=0,
+    ) as dataset:
+        dataset.write(data, 1)
+
+    result = noise.sample_noise_at_postcodes(
+        [tile_path],
+        easting=np.array([25.0]),
+        northing=np.array([25.0]),
+        label="Road",
+        col_name="road_noise_lden_db",
+    )
+
+    assert result.to_list() == [70.0]
--- a/pipeline/download/test_pois.py
+++ b/pipeline/download/test_pois.py
@ -0,0 +1,15 @@
+from shapely.geometry import Polygon, box
+
+from pipeline.download.pois import _representative_lat_lon
+
+
+def test_representative_lat_lon_uses_point_inside_polygon():
+    england = box(-1, 50, 1, 52)
+    poi_area = Polygon([(-0.1, 51.5), (0.1, 51.5), (0.1, 51.6), (-0.1, 51.6)])
+
+    lat_lon = _representative_lat_lon(poi_area, england)
+
+    assert lat_lon is not None
+    lat, lon = lat_lon
+    assert 51.5 <= lat <= 51.6
+    assert -0.1 <= lon <= 0.1
--- a/pipeline/download/test_rental_prices.py
+++ b/pipeline/download/test_rental_prices.py
@ -0,0 +1,24 @@
+import polars as pl
+
+from pipeline.download.rental_prices import _latest_rents_long
+
+
+def test_latest_rents_long_adds_iod_alias_codes_for_south_yorkshire():
+    raw = pl.DataFrame(
+        {
+            "column_1": ["title", "header", "2026-02-01 00:00:00"],
+            "column_2": ["", "", "E08000038"],
+            "column_3": ["", "", "Barnsley"],
+            "column_12": ["", "", "486"],
+            "column_16": ["", "", "595"],
+            "column_20": ["", "", "705"],
+            "column_24": ["", "", "900"],
+        }
+    )
+
+    result = _latest_rents_long(raw).filter(pl.col("bedrooms") == 1).sort("area_code")
+
+    assert result.select("area_code", "mean_monthly_rent").to_dicts() == [
+        {"area_code": "E08000016", "mean_monthly_rent": 486.0},
+        {"area_code": "E08000038", "mean_monthly_rent": 486.0},
+    ]