This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -4,27 +4,24 @@ import polars as pl
from pathlib import Path
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download, extract_zip
from pipeline.utils import code_col_overrides, download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data"
def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
# Classification code columns (ruc21ind, oac11ind, imd20ind) look numeric
# in early rows but contain string codes like "UN1" (Unclassified) later
# on. Force them to String to avoid mid-stream dtype inference failures.
# Note: NSPL renames these year suffixes as new releases roll in (e.g.
# Feb 2026 bumped oac from oac21ind → oac11ind, imd from imd19ind →
# imd20ind), so keep this dict in sync with the current CSV headers —
# polars silently ignores overrides for missing columns, masking drift.
# Classification code columns (e.g. ruc21ind, oac11ind, imd20ind) look
# numeric in early rows but contain string codes like "UN1" (Unclassified)
# later on. Force them to String to avoid mid-stream dtype inference
# failures. NSPL renames these year suffixes each release, and polars
# silently ignores overrides for missing columns, so match on the
# suffix-free stem (read from the header) rather than hard-coding suffixes.
csv_path = data_path / "Data/NSPL_FEB_2026_UK.csv"
names = pl.scan_csv(csv_path).collect_schema().names()
df = pl.scan_csv(
data_path / "Data/NSPL_FEB_2026_UK.csv",
csv_path,
try_parse_dates=True,
schema_overrides={
"ruc21ind": pl.String,
"oac11ind": pl.String,
"imd20ind": pl.String,
},
schema_overrides=code_col_overrides(names),
)
print(f"Columns: {df.collect_schema().names()}")
parquet_path.parent.mkdir(parents=True, exist_ok=True)

View file

@ -43,6 +43,35 @@ AGE_BANDS = [
(85, 5), # Aged 85 years and over
]
# Canonical NOMIS TS007A (C2021_AGE_19_NAME) band labels, in the SAME order as
# AGE_BANDS. Index i here corresponds to AGE_BANDS[i]; we validate the pivot
# output against this set and use it (not positional string parsing) to order
# the columns, so a stray/relabelled/missing band fails loudly instead of
# silently mis-aligning counts against the wrong lower bound.
EXPECTED_BAND_NAMES = [
"Aged 0 to 4 years",
"Aged 5 to 9 years",
"Aged 10 to 14 years",
"Aged 15 to 19 years",
"Aged 20 to 24 years",
"Aged 25 to 29 years",
"Aged 30 to 34 years",
"Aged 35 to 39 years",
"Aged 40 to 44 years",
"Aged 45 to 49 years",
"Aged 50 to 54 years",
"Aged 55 to 59 years",
"Aged 60 to 64 years",
"Aged 65 to 69 years",
"Aged 70 to 74 years",
"Aged 75 to 79 years",
"Aged 80 to 84 years",
"Aged 85 years and over",
]
assert len(EXPECTED_BAND_NAMES) == len(AGE_BANDS), (
"EXPECTED_BAND_NAMES and AGE_BANDS must stay aligned 1:1"
)
def compute_median_age(counts: list[int]) -> float:
"""Compute median age from five-year band counts using linear interpolation."""
@ -62,6 +91,53 @@ def compute_median_age(counts: list[int]) -> float:
return float("nan")
def _bands_to_median_table(pivoted: pl.DataFrame) -> pl.DataFrame:
    """Validate the pivoted age-band columns, then compute median age per LSOA.

    The pivot must contain exactly the canonical NOMIS TS007A bands; a
    missing/extra/relabelled band would otherwise silently mis-align counts
    against the wrong AGE_BANDS lower bound, so we fail loudly instead.

    Args:
        pivoted: One row per geography, with a ``GEOGRAPHY_CODE`` column plus
            one count column per age band.

    Returns:
        A frame with ``lsoa21`` and ``median_age`` (Float32, rounded to one
        decimal place) columns, one row per input row. A zero-row input yields
        an empty frame with those two columns instead of raising.

    Raises:
        ValueError: If the non-``GEOGRAPHY_CODE`` columns are not exactly the
            names in ``EXPECTED_BAND_NAMES``.
    """
    # Validate the pivoted age-band columns against the canonical NOMIS set
    # BEFORE computing anything.
    band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
    found = set(band_cols)
    expected = set(EXPECTED_BAND_NAMES)
    if found != expected:
        missing = sorted(expected - found)
        unexpected = sorted(found - expected)
        raise ValueError(
            "Census age-band columns do not match the expected NOMIS TS007A bands.\n"
            f" expected {len(EXPECTED_BAND_NAMES)} bands, found {len(band_cols)}\n"
            f" missing: {missing}\n"
            f" unexpected: {unexpected}\n"
            "Refusing to compute medians against misaligned bands."
        )

    # Use the canonical order (the same order as AGE_BANDS), not positional
    # string parsing, and treat a null band count as 0 so summing the counts
    # cannot fail on None.
    band_cols = list(EXPECTED_BAND_NAMES)
    pivoted = pivoted.with_columns(pl.col(band_cols).fill_null(0))

    print(f"Age bands found: {len(band_cols)}")
    print(f" First: {band_cols[0]}")
    print(f" Last: {band_cols[-1]}")

    # Stream the rows instead of materialising every row dict up front.
    medians = [
        {
            "lsoa21": row["GEOGRAPHY_CODE"],
            "median_age": round(
                compute_median_age([row[col] for col in band_cols]), 1
            ),
        }
        for row in pivoted.select("GEOGRAPHY_CODE", *band_cols).iter_rows(named=True)
    ]

    if not medians:
        # pl.DataFrame([]) has no columns at all, so the cast below would raise
        # on a missing "median_age" column. Return an explicitly typed empty
        # table instead.
        return pl.DataFrame(
            schema={
                "lsoa21": pivoted.schema["GEOGRAPHY_CODE"],
                "median_age": pl.Float32,
            }
        )

    return pl.DataFrame(medians).with_columns(
        pl.col("median_age").cast(pl.Float32),
    )
def download_and_convert(output_path: Path) -> None:
print("Downloading Census 2021 age by five-year bands from NOMIS...")
frames = []
@ -94,29 +170,7 @@ def download_and_convert(output_path: Path) -> None:
values="OBS_VALUE",
)
# Extract age band columns in order and compute median
# NOMIS returns band names like "Aged 0 to 4 years", "Aged 85 years and over"
band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
# Sort by the lower bound of each band
band_cols.sort(key=lambda c: int(c.split()[1]))
print(f"Age bands found: {len(band_cols)}")
print(f" First: {band_cols[0]}")
print(f" Last: {band_cols[-1]}")
# Compute median age per LSOA
rows = pivoted.select("GEOGRAPHY_CODE", *band_cols).to_dicts()
medians = []
for row in rows:
counts = [row[col] for col in band_cols]
median = compute_median_age(counts)
medians.append(
{"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)}
)
result = pl.DataFrame(medians).with_columns(
pl.col("median_age").cast(pl.Float32),
)
result = _bands_to_median_table(pivoted)
print(f"England LSOAs: {result.height}")
print(

View file

@ -83,11 +83,32 @@ NATIVE_RESOLUTION = 10
# Request pixel resolution in metres.
RESOLUTION = NATIVE_RESOLUTION
# Defra encodes TRUE "no data" with this sentinel (NOT 0.0). A 0.0 cell that is
# otherwise inside the raster means "modelled below the lowest reporting band",
# i.e. genuinely quiet — see noise_overlay_tiles.py:167.
NOISE_NODATA_SENTINEL = np.float32(-96.0)
# Lowest modelled Defra Lden reporting band (dB). Verified against the actual
# rasters: the minimum positive in-coverage value is 40.0 dB with NO values in
# (0, 40) — below the band, cells are encoded as 0.0 (genuinely quiet). We floor
# in-coverage cells to 40.0 so a below-band 0.0 surfaces as "we know it's quiet"
# (~40 dB) instead of collapsing to null ("we don't know"), WITHOUT inflating the
# ~35% of genuine 40-44.99 dB readings that a 45.0 floor would wrongly bump to 45.
# NB: 45.0 is the overlay's lowest *paint* stop (noise_overlay_tiles.
# NOISE_COLOR_STOPS[0]) — a rendering threshold, not the data's reporting floor.
NOISE_QUIET_FLOOR_DB = np.float32(40.0)
# The pipeline has postcode representative points rather than complete unit
# polygons here. Use a small local footprint and take the maximum 10m cell so
# postcode-level noise is not understated by centroid rounding.
POSTCODE_NOISE_RADIUS_M = 50
# Adjacent download tiles must overlap by at least the sampling radius so every
# postcode's 50m max-window is fully contained in at least one tile. Without
# this, a loud pixel just across a tile seam is invisible to a postcode on the
# far side, under-reporting noise near seams.
TILE_OVERLAP_M = POSTCODE_NOISE_RADIUS_M
# Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
# intermittently return 504s; smaller fallback requests usually succeed.
MAX_RETRIES = 3
@ -287,6 +308,31 @@ def _download_tile(
return [], [(min_e, min_n, max_e, max_n)]
def _generate_tiles(
    min_e: int,
    max_e: int,
    min_n: int,
    max_n: int,
    tile_size: int,
    overlap_m: int,
    step: int,
) -> list[Tile]:
    """Build the download tile bounding boxes for a grid of tile origins.

    Origins advance by ``step`` from ``min_e``/``min_n`` up to (but excluding)
    ``max_e``/``max_n``. Each tile spans ``tile_size + overlap_m`` metres from
    its origin, so with ``step == tile_size`` a tile reaches ``overlap_m`` into
    its eastern and northern neighbours. The far edges are clamped to
    ``BNG_MAX_E`` / ``BNG_MAX_N``.

    Returns:
        ``(min_e, min_n, max_e, max_n)`` tuples, ordered by easting and then by
        northing within each easting column.
    """
    reach = tile_size + overlap_m
    return [
        (
            origin_e,
            origin_n,
            min(origin_e + reach, BNG_MAX_E),
            min(origin_n + reach, BNG_MAX_N),
        )
        for origin_e in range(min_e, max_e, step)
        for origin_n in range(min_n, max_n, step)
    ]
def download_raster(
tile_dir: Path,
wcs_base: str,
@ -296,12 +342,9 @@ def download_raster(
allow_missing_tiles: bool = False,
) -> list[Path]:
"""Download noise GeoTIFF raster covering England, returning paths to saved files."""
tiles = []
for min_e in range(BNG_MIN_E, BNG_MAX_E, TILE_SIZE):
for min_n in range(BNG_MIN_N, BNG_MAX_N, TILE_SIZE):
max_e = min(min_e + TILE_SIZE, BNG_MAX_E)
max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
tiles.append((min_e, min_n, max_e, max_n))
tiles = _generate_tiles(
BNG_MIN_E, BNG_MAX_E, BNG_MIN_N, BNG_MAX_N, TILE_SIZE, TILE_OVERLAP_M, TILE_SIZE
)
print(
f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
@ -385,14 +428,23 @@ def sample_noise_at_postcodes(
if len(candidate_indices) == 0:
continue
# Defra rasters encode TRUE nodata as the -96.0 sentinel (and
# occasionally non-finite / dataset.nodata); genuinely quiet ground
# below the model's lowest reporting band is encoded as 0.0. Only
# the former is "we don't know" — the latter is a real "we know it's
# quiet" reading and must not collapse to null. So treat ONLY true
# nodata as -inf (it never wins a max and never counts as coverage),
# and clamp every in-coverage cell up to NOISE_QUIET_FLOOR_DB so a
# below-threshold 0.0 surfaces as the documented quiet floor.
grid = dataset.read(1).astype(np.float32, copy=False)
invalid = ~np.isfinite(grid) | (grid == 0)
nodata = ~np.isfinite(grid) | np.isclose(
grid, NOISE_NODATA_SENTINEL, rtol=1e-5, atol=1e-5
)
if dataset.nodata is not None:
invalid |= np.isclose(
nodata |= np.isclose(
grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5
)
grid = grid.copy()
grid[invalid] = -np.inf
grid = np.where(nodata, -np.inf, np.maximum(grid, NOISE_QUIET_FLOOR_DB))
if filter_size > 1:
grid = maximum_filter(
grid, size=filter_size, mode="constant", cval=-np.inf
@ -412,12 +464,15 @@ def sample_noise_at_postcodes(
sampled_indices = candidate_indices[in_bounds]
sampled = grid[rows[in_bounds], cols[in_bounds]]
valid = sampled != -np.inf
if not np.any(valid):
# A finite sample means at least one in-coverage cell sat in the
# window (quiet -> floor, or louder). -inf means the whole window was
# true nodata, so the postcode stays uncovered (null) for this tile.
covered = np.isfinite(sampled)
if not np.any(covered):
continue
sampled_indices = sampled_indices[valid]
sampled = sampled[valid]
sampled_indices = sampled_indices[covered]
sampled = sampled[covered]
existing = noise_db[sampled_indices]
noise_db[sampled_indices] = np.where(
np.isnan(existing), sampled, np.maximum(existing, sampled)

View file

@ -84,6 +84,38 @@ LONDON_COUNTY_CODES = {"E13000001", "E13000002"}
DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
# England British National Grid (EPSG:27700) bounding box, with margin. ONS NSPL stores
# postcodes that have no grid reference at the Null-Island sentinel lat=99.999999,
# long=0.000000, whose paired easting/northing collapse to the grid origin (0, 0) (or
# inf). Requiring coordinates inside this box drops the sentinel from every index, so an
# active postcode lacking a grid ref can never become a false nearest neighbour.
ENGLAND_BNG_MIN_EAST = 50_000.0
ENGLAND_BNG_MAX_EAST = 660_000.0
ENGLAND_BNG_MIN_NORTH = 0.0
ENGLAND_BNG_MAX_NORTH = 660_000.0
def _valid_wgs84_expr() -> pl.Expr:
    """Build a filter expression for rows with a usable WGS84 position.

    A row passes when both ``lat`` and ``long`` are non-null and fall within
    the ``ENGLAND_BBOX_*`` bounds. That excludes the ONS lat=99.999999,
    long=0.0 no-grid-reference sentinel, so such rows never enter a
    coordinate index.
    """
    latitude = pl.col("lat")
    longitude = pl.col("long")
    has_position = latitude.is_not_null() & longitude.is_not_null()
    lat_in_box = latitude.is_between(ENGLAND_BBOX_SOUTH, ENGLAND_BBOX_NORTH)
    lon_in_box = longitude.is_between(ENGLAND_BBOX_WEST, ENGLAND_BBOX_EAST)
    return has_position & lat_in_box & lon_in_box
def _valid_bng_expr() -> pl.Expr:
    """Build a filter expression for rows with a usable BNG position.

    A row passes when both ``east1m`` and ``north1m`` are non-null and fall
    within the ``ENGLAND_BNG_*`` bounds. That excludes the (0, 0) grid origin
    (or inf) paired with the ONS no-grid-reference sentinel.
    """
    easting = pl.col("east1m")
    northing = pl.col("north1m")
    has_position = easting.is_not_null() & northing.is_not_null()
    east_in_box = easting.is_between(ENGLAND_BNG_MIN_EAST, ENGLAND_BNG_MAX_EAST)
    north_in_box = northing.is_between(ENGLAND_BNG_MIN_NORTH, ENGLAND_BNG_MAX_NORTH)
    return has_position & east_in_box & north_in_box
# Suffixes to strip from raw station names before appending the typed suffix.
_STATION_STRIP = (
" tube station",
@ -303,7 +335,7 @@ def _outcode_tree(postcodes_path: Path) -> tuple[cKDTree, list[str]]:
postcodes_path, columns=["pcds", "lat", "long", "ctry25cd", "doterm"]
)
.filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
.filter(pl.col("lat").is_not_null() & pl.col("long").is_not_null())
.filter(_valid_wgs84_expr())
)
coords = np.column_stack(
[df["lat"].to_numpy().astype(np.float64), df["long"].to_numpy().astype(np.float64)]
@ -359,12 +391,22 @@ def _build_street_places(
return sorted(places, key=lambda place: place["name"].lower())
def _poi_dedup_key(name: str, place_type: str, lat: float, lon: float) -> tuple:
"""Geographic de-dup key: round(.,2) is ~1.1km lat / ~0.7km UK lon.
Coarse enough to collapse the SAME physical POI mapped twice a few metres
apart, fine enough to keep genuinely distinct same-named POIs in different
towns (e.g. "Victoria Park" in London vs Bristol).
"""
return (name.lower(), place_type, round(lat, 2), round(lon, 2))
def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
"""Map high-value named POIs onto gazetteer place rows (M), de-duplicated by (name, type)."""
"""Map high-value named POIs onto gazetteer place rows (M), de-duplicated by (name, type, coords)."""
if pois.is_empty():
return []
seen: set[tuple[str, str]] = set()
seen: set[tuple] = set()
places: list[dict] = []
for row in pois.iter_rows(named=True):
place_type = HIGH_VALUE_POI_CATEGORIES.get(str(row.get("category", "")))
@ -373,7 +415,9 @@ def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
name = str(row.get("name") or "").strip()
if len(name) < 3:
continue
key = (name.lower(), place_type)
lat = float(row["lat"])
lon = float(row["lng"])
key = _poi_dedup_key(name, place_type, lat, lon)
if key in seen:
continue
seen.add(key)
@ -381,8 +425,8 @@ def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
{
"name": name,
"place_type": place_type,
"lat": float(row["lat"]),
"lon": float(row["lng"]),
"lat": lat,
"lon": lon,
"population": 0,
"travel_destination": False,
"display_city": None,
@ -395,11 +439,16 @@ def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
pois = pl.read_parquet(pois_path, columns=["name", "category", "lat", "lng"])
new_places = _pois_to_places(pois)
existing = {
(str(place["name"]).lower(), place["place_type"]) for place in places
_poi_dedup_key(
str(place["name"]), place["place_type"], place["lat"], place["lon"]
)
for place in places
}
added = 0
for place in new_places:
key = (place["name"].lower(), place["place_type"])
key = _poi_dedup_key(
place["name"], place["place_type"], place["lat"], place["lon"]
)
if key in existing:
continue
places.append(place)
@ -409,10 +458,14 @@ def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
df = pl.read_parquet(
postcodes_path,
columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
).filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
df = (
pl.read_parquet(
postcodes_path,
columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
)
.filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
.filter(_valid_wgs84_expr())
)
return {
_normalize_postcode(postcode): (float(lat), float(lon))
for postcode, lat, lon in df.select(["pcds", "lat", "long"]).iter_rows()
@ -470,7 +523,7 @@ def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
.filter(
(pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null()
)
.filter(pl.col("east1m").is_not_null() & pl.col("north1m").is_not_null())
.filter(_valid_bng_expr())
.with_columns(_is_london_admin_expr().alias("is_london"))
.select("east1m", "north1m", "is_london")
)

View file

@ -0,0 +1,75 @@
import math
import polars as pl
import pytest
from pipeline.download import median_age
from pipeline.download.median_age import (
AGE_BANDS,
EXPECTED_BAND_NAMES,
compute_median_age,
)
def test_expected_band_names_align_with_age_bands():
assert len(EXPECTED_BAND_NAMES) == len(AGE_BANDS)
def test_compute_median_age_interpolates_within_median_band():
# All weight in the 30-34 band -> median is the band midpoint via linear
# interpolation: 30 + ((50 - 0) / 100) * 5 = 32.5.
counts = [0] * len(AGE_BANDS)
counts[6] = 100 # "Aged 30 to 34 years"
assert compute_median_age(counts) == pytest.approx(32.5)
# 50 below the median band, 100 inside the 35-39 band holding the median.
# half = 75; cumulative before band 7 = 50; 35 + ((75 - 50) / 100) * 5 = 36.25.
counts = [0] * len(AGE_BANDS)
counts[0] = 50 # below the median band
counts[7] = 100 # "Aged 35 to 39 years" holds the median
assert compute_median_age(counts) == pytest.approx(36.25)
def test_compute_median_age_empty_lsoa_is_nan():
assert math.isnan(compute_median_age([0] * len(AGE_BANDS)))
def _pivoted(band_to_counts: dict[str, list]) -> pl.DataFrame:
    """Assemble a pivot-shaped frame: GEOGRAPHY_CODE plus one column per band.

    The row count is taken from the first band's list of counts, and each row
    gets a synthetic code of the form ``E0100000<i>``.
    """
    first_counts = next(iter(band_to_counts.values()))
    codes = [f"E0100000{i}" for i in range(len(first_counts))]
    return pl.DataFrame({"GEOGRAPHY_CODE": codes, **band_to_counts})
def test_null_band_count_is_treated_as_zero_not_crash():
# One LSOA has a null in the 85+ band (NOMIS can return null for a band with
# zero people). It must be coerced to 0, not raise TypeError in sum(). With
# all 100 people in the 30-34 band the median is the band midpoint, 32.5.
counts_by_band = {name: [0] for name in EXPECTED_BAND_NAMES}
counts_by_band["Aged 30 to 34 years"] = [100]
counts_by_band["Aged 85 years and over"] = [None]
pivoted = _pivoted(counts_by_band)
table = median_age._bands_to_median_table(pivoted)
assert table.height == 1
assert table["median_age"][0] == pytest.approx(32.5)
def test_missing_band_raises_clear_error():
    """Dropping a canonical band must raise and name the band that is missing."""
    counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES}
    del counts_by_band["Aged 85 years and over"]
    pivoted = _pivoted(counts_by_band)

    with pytest.raises(
        ValueError, match=r"do not match the expected NOMIS"
    ) as excinfo:
        median_age._bands_to_median_table(pivoted)

    # The headline alone does not show the error is actionable; the message
    # must also say which band is absent.
    assert "'Aged 85 years and over'" in str(excinfo.value)
def test_relabelled_band_raises_clear_error():
    """A band renamed to a non-canonical label must be reported by that label."""
    counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES}
    counts_by_band["Total"] = counts_by_band.pop("Aged 85 years and over")
    pivoted = _pivoted(counts_by_band)

    # The bare "unexpected:" label appears in every mismatch message (with an
    # empty list when nothing is unexpected), so matching it alone proves
    # nothing. Require the offending column name as well.
    with pytest.raises(ValueError, match=r"unexpected:\s*\['Total'\]"):
        median_age._bands_to_median_table(pivoted)

View file

@ -125,6 +125,205 @@ def test_download_raster_raises_on_missing_strict_tiles(monkeypatch, tmp_path):
noise.download_raster(tmp_path, "base", "coverage", "Road")
def test_generate_tiles_neighbours_overlap_by_radius():
tile_size = 20_000
overlap = noise.POSTCODE_NOISE_RADIUS_M
tiles = noise._generate_tiles(
0, 60_000, 0, 60_000, tile_size, overlap, tile_size
)
by_origin = {(min_e, min_n): (max_e, max_n) for min_e, min_n, max_e, max_n in tiles}
# Horizontally adjacent tiles must overlap by >= overlap.
for (min_e, min_n), (max_e, _max_n) in by_origin.items():
right_origin = (min_e + tile_size, min_n)
if right_origin in by_origin:
assert max_e - right_origin[0] >= overlap
# Vertically adjacent tiles must overlap by >= overlap.
for (min_e, min_n), (_max_e, max_n) in by_origin.items():
up_origin = (min_e, min_n + tile_size)
if up_origin in by_origin:
assert max_n - up_origin[1] >= overlap
def test_generate_tiles_clamps_to_grid_extent():
    """A tile whose overlap would cross the grid extent is clamped to it."""
    tile_size = 20_000
    overlap = noise.POSTCODE_NOISE_RADIUS_M
    tiles = noise._generate_tiles(
        noise.BNG_MAX_E - tile_size,
        noise.BNG_MAX_E,
        noise.BNG_MAX_N - tile_size,
        noise.BNG_MAX_N,
        tile_size,
        overlap,
        tile_size,
    )

    # Guard against a vacuous pass: the loop below checks nothing if no tiles
    # were generated.
    assert tiles

    # Every tile starts exactly tile_size short of the extent, so its far edge
    # (origin + tile_size + overlap) would pass the extent unless clamped.
    # Asserting equality, not just <=, pins that the edge lands on the extent.
    for _min_e, _min_n, max_e, max_n in tiles:
        assert max_e == noise.BNG_MAX_E
        assert max_n == noise.BNG_MAX_N
def _write_geotiff(path, data, left, top, resolution, nodata):
    """Write ``data`` to ``path`` as a single-band EPSG:27700 GeoTIFF.

    The raster's top-left corner is at (``left``, ``top``) with square pixels
    of ``resolution`` units, and ``nodata`` is recorded as the dataset's
    nodata value.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]
    profile = {
        "driver": "GTiff",
        "height": n_rows,
        "width": n_cols,
        "count": 1,
        "dtype": data.dtype,
        "crs": "EPSG:27700",
        "transform": from_origin(left, top, resolution, resolution),
        "nodata": nodata,
    }
    with rasterio.open(path, "w", **profile) as sink:
        sink.write(data, 1)
def test_sample_noise_recovers_value_across_overlapping_seam(monkeypatch, tmp_path):
monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 50)
monkeypatch.setattr(noise, "RESOLUTION", 10)
# Two download tiles share a vertical seam at easting=100. _generate_tiles
# decides their real footprints: with the overlap fix the LEFT tile extends
# past the seam by POSTCODE_NOISE_RADIUS_M and thus covers a loud cell that
# physically sits just across the seam.
tile_size = 100
overlap = noise.POSTCODE_NOISE_RADIUS_M
tiles = noise._generate_tiles(0, 200, 0, 100, tile_size, overlap, tile_size)
by_origin = {
(min_e, min_n): (max_e, max_n) for min_e, min_n, max_e, max_n in tiles
}
left_min_e, left_min_n = 0, 0
left_max_e, left_max_n = by_origin[(left_min_e, left_min_n)]
# Overlap fix is what makes the left tile reach across the seam.
assert left_max_e > 100
# The loud 70 dB cell centre is at easting 105 (just across the seam) and
# the postcode point is at easting 75 in the left tile, within 50m of it.
res = noise.RESOLUTION
width = int((left_max_e - left_min_e) // res)
height = int((left_max_n - left_min_n) // res)
left_data = np.zeros((height, width), dtype=np.float32)
loud_row = height - 1 - int((25 - left_min_n) // res) # northing ~25
loud_col = int((105 - left_min_e) // res) # easting ~105
left_data[loud_row, loud_col] = 70.0
_write_geotiff(
tmp_path / "left.tif", left_data, left_min_e, left_max_n, res, nodata=0
)
# The right tile holds the same loud cell but the postcode point is NOT
# inside it, so without overlap the value would be lost for that point.
right_min_e, right_min_n = 100, 0
right_max_e, right_max_n = by_origin[(right_min_e, right_min_n)]
rwidth = int((right_max_e - right_min_e) // res)
rheight = int((right_max_n - right_min_n) // res)
right_data = np.zeros((rheight, rwidth), dtype=np.float32)
right_data[rheight - 1 - int((25 - right_min_n) // res), 0] = 70.0
_write_geotiff(
tmp_path / "right.tif", right_data, right_min_e, right_max_n, res, nodata=0
)
result = noise.sample_noise_at_postcodes(
[tmp_path / "left.tif", tmp_path / "right.tif"],
easting=np.array([75.0]),
northing=np.array([25.0]),
label="Road",
col_name="road_noise_lden_db",
)
assert result.to_list() == [70.0]
def test_sample_noise_distinguishes_nodata_from_in_coverage_quiet(
monkeypatch, tmp_path
):
monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 0)
monkeypatch.setattr(noise, "RESOLUTION", 10)
# Defra encodes TRUE nodata as the -96.0 sentinel; genuinely quiet ground
# below the lowest reporting band is 0.0. With a 0m radius each postcode
# reads exactly one cell, so we can pin behaviour per cell:
# -96.0 sentinel -> null ("we don't know")
# 0.0 in-coverage -> NOISE_QUIET_FLOOR_DB ("we know it's quiet")
# 65.0 -> 65.0 (a real modelled reading)
data = np.array(
[
[-96.0, 0.0, 65.0],
],
dtype=np.float32,
)
_write_geotiff(tmp_path / "noise.tif", data, 0, 10, 10, nodata=-96.0)
result = noise.sample_noise_at_postcodes(
[tmp_path / "noise.tif"],
# Cell centres at easting 5 (nodata), 15 (quiet 0.0), 25 (loud 65).
easting=np.array([5.0, 15.0, 25.0]),
northing=np.array([5.0, 5.0, 5.0]),
label="Road",
col_name="road_noise_lden_db",
)
assert result.to_list() == [None, float(noise.NOISE_QUIET_FLOOR_DB), 65.0]
def test_sample_noise_preserves_genuine_reading_above_quiet_floor(monkeypatch, tmp_path):
monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 0)
monkeypatch.setattr(noise, "RESOLUTION", 10)
# The lowest Defra reporting band is 40.0 dB; genuine readings populate
# [40, ~80]. A genuine in-coverage reading at or just above the floor must be
# PRESERVED, not clamped UP to the floor — only true-quiet 0.0 is floored. A
# quiet floor set too high (e.g. 45) would inflate the ~35% of real 40-44.99
# dB readings; this pins that they survive unchanged.
floor = float(noise.NOISE_QUIET_FLOOR_DB)
data = np.array(
[
[42.0, floor, 0.0],
],
dtype=np.float32,
)
_write_geotiff(tmp_path / "noise.tif", data, 0, 10, 10, nodata=-96.0)
result = noise.sample_noise_at_postcodes(
[tmp_path / "noise.tif"],
# Cell centres at easting 5 (42 dB), 15 (floor), 25 (quiet 0.0).
easting=np.array([5.0, 15.0, 25.0]),
northing=np.array([5.0, 5.0, 5.0]),
label="Road",
col_name="road_noise_lden_db",
)
# 42 preserved (NOT raised to the floor), floor preserved, 0.0 -> floor.
assert result.to_list() == [42.0, floor, floor]
# The floor must sit at/below the lowest genuine reading so nothing inflates.
assert floor <= 42.0
def test_sample_noise_nodata_window_stays_null(monkeypatch, tmp_path):
monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
monkeypatch.setattr(noise, "RESOLUTION", 10)
# A postcode whose entire 3x3 max-window is the -96.0 sentinel must remain
# null: no in-coverage cell was read, so "quiet" must NOT be inferred.
data = np.full((5, 5), -96.0, dtype=np.float32)
data[4, 4] = 70.0 # one loud cell, far from the nodata corner
_write_geotiff(tmp_path / "noise.tif", data, 0, 50, 10, nodata=-96.0)
result = noise.sample_noise_at_postcodes(
[tmp_path / "noise.tif"],
# Top-left point: its 3x3 window is cells (rows 0-1, cols 0-1) = all -96.
easting=np.array([5.0]),
northing=np.array([45.0]),
label="Road",
col_name="road_noise_lden_db",
)
assert result.to_list() == [None]
def test_sample_noise_at_postcodes_uses_local_maximum(monkeypatch, tmp_path):
monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
monkeypatch.setattr(noise, "RESOLUTION", 10)

View file

@ -9,10 +9,12 @@ from pipeline.download.places import (
_display_city_from_tags,
_is_dlr_station,
_is_tram_station,
_london_postcode_tree,
_naptan_dlr_stations,
_normalize_street_name,
_ofs_universities,
_outcode_of_postcode,
_outcode_tree,
_pois_to_places,
_select_university_name,
_station_display_name,
@ -242,6 +244,42 @@ def test_pois_to_places_keeps_high_value_named_pois_only():
assert all(place["travel_destination"] is False for place in places)
def test_pois_to_places_keeps_distinct_same_named_pois():
# Two genuinely distinct POIs sharing a name, far apart (London vs Bristol).
pois = pl.DataFrame(
{
"name": ["Victoria Park", "Victoria Park"],
"category": ["leisure/park", "leisure/park"],
"lat": [51.54, 51.46],
"lng": [-0.04, -2.60],
}
)
places = _pois_to_places(pois)
assert len(places) == 2
assert {(place["lat"], place["lon"]) for place in places} == {
(51.54, -0.04),
(51.46, -2.60),
}
def test_pois_to_places_still_dedupes_colocated():
# The same physical POI mapped twice a few metres apart collapses to one.
pois = pl.DataFrame(
{
"name": ["Victoria Park", "Victoria Park"],
"category": ["leisure/park", "leisure/park"],
"lat": [51.5400, 51.5401],
"lng": [-0.0400, -0.0399],
}
)
places = _pois_to_places(pois)
assert len(places) == 1
def test_display_city_from_tags_uses_explicit_london_context():
assert _display_city_from_tags({"is_in": "Croydon, London, UK"}) == "London"
assert _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"}) is None
@ -290,3 +328,52 @@ def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path)
assert assigned == 2
assert [place["display_city"] for place in places] == ["London", "London", None]
def test_no_grid_reference_sentinel_is_excluded_from_coordinate_trees(tmp_path):
# ONS NSPL stores postcodes with no grid reference at the Null-Island sentinel
# lat=99.999999, long=0.0, whose paired BNG coords collapse to the (0, 0) origin.
# Such an active postcode must never enter the nearest-neighbour indexes.
sentinel = {
"pcds": "ZZ99 9ZZ",
"lat": 99.999999,
"long": 0.0,
"doterm": None,
"ctry25cd": "E92000001",
"east1m": 0,
"north1m": 0,
"rgn25cd": "E12000007",
"lad25cd": "E09000008",
"cty25cd": "E13000002",
}
croydon_easting, croydon_northing = WGS84_TO_BNG.transform(-0.101793, 51.371273)
real = {
"pcds": "CR0 1SZ",
"lat": 51.371273,
"long": -0.101793,
"doterm": None,
"ctry25cd": "E92000001",
"east1m": int(round(croydon_easting)),
"north1m": int(round(croydon_northing)),
"rgn25cd": "E12000007",
"lad25cd": "E09000008",
"cty25cd": "E13000002",
}
postcodes = tmp_path / "postcodes.parquet"
pl.DataFrame([sentinel, real]).write_parquet(postcodes)
# lat/long outcode tree: only the real postcode survives, so a London-area query
# cannot be tagged with the sentinel's (empty) outcode.
tree, outcodes = _outcode_tree(postcodes)
assert tree.n == 1
assert outcodes == ["CR0"]
_, idx = tree.query([[51.371273, -0.101793]])
assert outcodes[idx[0]] == "CR0"
# BNG London tree: only the real postcode survives, so the (0, 0) origin can never
# be the nearest neighbour of a real place.
bng_tree, london_flags = _london_postcode_tree(postcodes)
assert bng_tree.n == 1
assert london_flags.tolist() == [True]
_, bng_idx = bng_tree.query([[croydon_easting, croydon_northing]])
assert bng_idx[0] == 0

View file

@ -14,7 +14,7 @@ from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download, extract_zip
from pipeline.utils import code_col_overrides, download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
@ -34,16 +34,16 @@ def find_csvs(extract_path: Path) -> list[Path]:
def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
# Some regional files infer different types for the same column (e.g.
# ruc21ind is String in most but Int64 in YH). Read all code columns as
# String to avoid schema mismatches.
CODE_COLS = {
"ruc21ind": pl.String,
"oac21ind": pl.String,
"imd19ind": pl.String,
}
# ruc21ind is String in most but Int64 in YH), and string codes like "UN1"
# appear deep in the data. Read all classification-index code columns as
# String to avoid schema mismatches. NSUL renames the year suffixes each
# release and polars silently ignores overrides for missing columns, so
# match on the suffix-free stem (from the header) rather than hard-coding.
names = pl.scan_csv(csv_paths[0]).collect_schema().names()
code_cols = code_col_overrides(names)
df = pl.concat(
[
pl.scan_csv(p, try_parse_dates=True, schema_overrides=CODE_COLS)
pl.scan_csv(p, try_parse_dates=True, schema_overrides=code_cols)
for p in csv_paths
]
)