idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -4,27 +4,24 @@ import polars as pl
|
|||
from pathlib import Path
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.utils import download, extract_zip
|
||||
from pipeline.utils import code_col_overrides, download, extract_zip
|
||||
|
||||
URL = "https://www.arcgis.com/sharing/rest/content/items/36b718ad00de49afb9ad364f8b815b9e/data"
|
||||
|
||||
|
||||
def convert_to_parquet(data_path: Path, parquet_path: Path) -> None:
|
||||
# Classification code columns (ruc21ind, oac11ind, imd20ind) look numeric
|
||||
# in early rows but contain string codes like "UN1" (Unclassified) later
|
||||
# on. Force them to String to avoid mid-stream dtype inference failures.
|
||||
# Note: NSPL renames these year suffixes as new releases roll in (e.g.
|
||||
# Feb 2026 bumped oac from oac21ind → oac11ind, imd from imd19ind →
|
||||
# imd20ind), so keep this dict in sync with the current CSV headers —
|
||||
# polars silently ignores overrides for missing columns, masking drift.
|
||||
# Classification code columns (e.g. ruc21ind, oac11ind, imd20ind) look
|
||||
# numeric in early rows but contain string codes like "UN1" (Unclassified)
|
||||
# later on. Force them to String to avoid mid-stream dtype inference
|
||||
# failures. NSPL renames these year suffixes each release, and polars
|
||||
# silently ignores overrides for missing columns, so match on the
|
||||
# suffix-free stem (read from the header) rather than hard-coding suffixes.
|
||||
csv_path = data_path / "Data/NSPL_FEB_2026_UK.csv"
|
||||
names = pl.scan_csv(csv_path).collect_schema().names()
|
||||
df = pl.scan_csv(
|
||||
data_path / "Data/NSPL_FEB_2026_UK.csv",
|
||||
csv_path,
|
||||
try_parse_dates=True,
|
||||
schema_overrides={
|
||||
"ruc21ind": pl.String,
|
||||
"oac11ind": pl.String,
|
||||
"imd20ind": pl.String,
|
||||
},
|
||||
schema_overrides=code_col_overrides(names),
|
||||
)
|
||||
print(f"Columns: {df.collect_schema().names()}")
|
||||
parquet_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
|
|
|||
|
|
@ -43,6 +43,35 @@ AGE_BANDS = [
|
|||
(85, 5), # Aged 85 years and over
|
||||
]
|
||||
|
||||
# Canonical NOMIS TS007A (C2021_AGE_19_NAME) band labels, in the SAME order as
|
||||
# AGE_BANDS. Index i here corresponds to AGE_BANDS[i]; we validate the pivot
|
||||
# output against this set and use it (not positional string parsing) to order
|
||||
# the columns, so a stray/relabelled/missing band fails loudly instead of
|
||||
# silently mis-aligning counts against the wrong lower bound.
|
||||
EXPECTED_BAND_NAMES = [
    "Aged 0 to 4 years",
    "Aged 5 to 9 years",
    "Aged 10 to 14 years",
    "Aged 15 to 19 years",
    "Aged 20 to 24 years",
    "Aged 25 to 29 years",
    "Aged 30 to 34 years",
    "Aged 35 to 39 years",
    "Aged 40 to 44 years",
    "Aged 45 to 49 years",
    "Aged 50 to 54 years",
    "Aged 55 to 59 years",
    "Aged 60 to 64 years",
    "Aged 65 to 69 years",
    "Aged 70 to 74 years",
    "Aged 75 to 79 years",
    "Aged 80 to 84 years",
    "Aged 85 years and over",
]

# Import-time guard: the two tables are consumed positionally, so a length
# mismatch must stop the module from loading. Raised explicitly (not via a bare
# `assert`) so the check still runs under `python -O`, which strips asserts.
if len(EXPECTED_BAND_NAMES) != len(AGE_BANDS):
    raise AssertionError(
        "EXPECTED_BAND_NAMES and AGE_BANDS must stay aligned 1:1"
    )
|
||||
|
||||
|
||||
def compute_median_age(counts: list[int]) -> float:
|
||||
"""Compute median age from five-year band counts using linear interpolation."""
|
||||
|
|
@ -62,6 +91,53 @@ def compute_median_age(counts: list[int]) -> float:
|
|||
return float("nan")
|
||||
|
||||
|
||||
def _bands_to_median_table(pivoted: pl.DataFrame) -> pl.DataFrame:
    """Validate the pivoted age-band columns, then compute median age per LSOA.

    The pivot must contain exactly the bands listed in EXPECTED_BAND_NAMES; a
    missing/extra/relabelled band would otherwise silently mis-align counts
    against the wrong AGE_BANDS entry, so we fail loudly instead.

    Args:
        pivoted: One row per geography with a ``GEOGRAPHY_CODE`` column plus
            one count column per age band.

    Returns:
        A frame with ``lsoa21`` (the ``GEOGRAPHY_CODE`` values, dtype
        unchanged) and ``median_age`` (Float32, rounded to one decimal place).
        A zero-row input yields a zero-row frame with the same two columns.

    Raises:
        ValueError: If the non-``GEOGRAPHY_CODE`` columns are not exactly the
            bands in EXPECTED_BAND_NAMES.
    """
    # Validate the band columns against the canonical set BEFORE computing
    # anything.
    band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
    found = set(band_cols)
    expected = set(EXPECTED_BAND_NAMES)
    if found != expected:
        missing = sorted(expected - found)
        unexpected = sorted(found - expected)
        raise ValueError(
            "Census age-band columns do not match the expected NOMIS TS007A bands.\n"
            f" expected {len(EXPECTED_BAND_NAMES)} bands, found {len(band_cols)}\n"
            f" missing: {missing}\n"
            f" unexpected: {unexpected}\n"
            "Refusing to compute medians against misaligned bands."
        )

    # Use the canonical order rather than whatever order the pivot produced,
    # and treat a null band count as 0 so compute_median_age never sees None.
    band_cols = list(EXPECTED_BAND_NAMES)
    pivoted = pivoted.with_columns(pl.col(band_cols).fill_null(0))

    print(f"Age bands found: {len(band_cols)}")
    print(f" First: {band_cols[0]}")
    print(f" Last: {band_cols[-1]}")

    # One median per row, in row order, so it lines up with GEOGRAPHY_CODE.
    medians = [
        round(compute_median_age(list(counts)), 1)
        for counts in pivoted.select(band_cols).iter_rows()
    ]

    # Build the result column-wise. Constructing it from a (possibly empty)
    # list of row dicts gives a column-less frame for zero rows, which then
    # fails on the `median_age` cast.
    return pl.DataFrame(
        {
            "lsoa21": pivoted.get_column("GEOGRAPHY_CODE"),
            "median_age": pl.Series("median_age", medians, dtype=pl.Float64),
        }
    ).with_columns(
        pl.col("median_age").cast(pl.Float32),
    )
|
||||
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
|
||||
print("Downloading Census 2021 age by five-year bands from NOMIS...")
|
||||
frames = []
|
||||
|
|
@ -94,29 +170,7 @@ def download_and_convert(output_path: Path) -> None:
|
|||
values="OBS_VALUE",
|
||||
)
|
||||
|
||||
# Extract age band columns in order and compute median
|
||||
# NOMIS returns band names like "Aged 0 to 4 years", "Aged 85 years and over"
|
||||
band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
|
||||
# Sort by the lower bound of each band
|
||||
band_cols.sort(key=lambda c: int(c.split()[1]))
|
||||
|
||||
print(f"Age bands found: {len(band_cols)}")
|
||||
print(f" First: {band_cols[0]}")
|
||||
print(f" Last: {band_cols[-1]}")
|
||||
|
||||
# Compute median age per LSOA
|
||||
rows = pivoted.select("GEOGRAPHY_CODE", *band_cols).to_dicts()
|
||||
medians = []
|
||||
for row in rows:
|
||||
counts = [row[col] for col in band_cols]
|
||||
median = compute_median_age(counts)
|
||||
medians.append(
|
||||
{"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)}
|
||||
)
|
||||
|
||||
result = pl.DataFrame(medians).with_columns(
|
||||
pl.col("median_age").cast(pl.Float32),
|
||||
)
|
||||
result = _bands_to_median_table(pivoted)
|
||||
|
||||
print(f"England LSOAs: {result.height}")
|
||||
print(
|
||||
|
|
|
|||
|
|
@ -83,11 +83,32 @@ NATIVE_RESOLUTION = 10
|
|||
# Request pixel resolution in metres.
|
||||
RESOLUTION = NATIVE_RESOLUTION
|
||||
|
||||
# Defra encodes TRUE "no data" with this sentinel (NOT 0.0). A 0.0 cell that is
|
||||
# otherwise inside the raster means "modelled below the lowest reporting band",
|
||||
# i.e. genuinely quiet — see noise_overlay_tiles.py:167.
|
||||
NOISE_NODATA_SENTINEL = np.float32(-96.0)
|
||||
|
||||
# Lowest modelled Defra Lden reporting band (dB). Verified against the actual
|
||||
# rasters: the minimum positive in-coverage value is 40.0 dB with NO values in
|
||||
# (0, 40) — below the band, cells are encoded as 0.0 (genuinely quiet). We floor
|
||||
# in-coverage cells to 40.0 so a below-band 0.0 surfaces as "we know it's quiet"
|
||||
# (~40 dB) instead of collapsing to null ("we don't know"), WITHOUT inflating the
|
||||
# ~35% of genuine 40-44.99 dB readings that a 45.0 floor would wrongly bump to 45.
|
||||
# NB: 45.0 is the overlay's lowest *paint* stop (noise_overlay_tiles.
|
||||
# NOISE_COLOR_STOPS[0]) — a rendering threshold, not the data's reporting floor.
|
||||
NOISE_QUIET_FLOOR_DB = np.float32(40.0)
|
||||
|
||||
# The pipeline has postcode representative points rather than complete unit
|
||||
# polygons here. Use a small local footprint and take the maximum 10m cell so
|
||||
# postcode-level noise is not understated by centroid rounding.
|
||||
POSTCODE_NOISE_RADIUS_M = 50
|
||||
|
||||
# Adjacent download tiles must overlap by at least the sampling radius so every
|
||||
# postcode's 50m max-window is fully contained in at least one tile. Without
|
||||
# this, a loud pixel just across a tile seam is invisible to a postcode on the
|
||||
# far side, under-reporting noise near seams.
|
||||
TILE_OVERLAP_M = POSTCODE_NOISE_RADIUS_M
|
||||
|
||||
# Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
|
||||
# intermittently return 504s; smaller fallback requests usually succeed.
|
||||
MAX_RETRIES = 3
|
||||
|
|
@ -287,6 +308,31 @@ def _download_tile(
|
|||
return [], [(min_e, min_n, max_e, max_n)]
|
||||
|
||||
|
||||
def _generate_tiles(
    min_e: int,
    max_e: int,
    min_n: int,
    max_n: int,
    tile_size: int,
    overlap_m: int,
    step: int,
) -> list[Tile]:
    """Generate download tile bboxes stepping by ``step`` but extending each
    tile's far edge by ``overlap_m`` so neighbours overlap.

    Tile origins run over ``range(min_e, max_e, step)`` x ``range(min_n, max_n,
    step)``. Each tile spans ``tile_size + overlap_m`` from its origin, clamped
    to the module-level BNG_MAX_E / BNG_MAX_N extent (NOT to ``max_e`` /
    ``max_n``, so the last tile in a row/column may reach past those arguments).

    With ``step <= tile_size`` neighbours share a strip at least ``overlap_m``
    wide. A point up to ``overlap_m`` short of a seam therefore has its whole
    ``overlap_m``-radius window inside its own tile. A point just past a seam
    lies inside both neighbours, which cover its window between them.
    NOTE(review): the second case relies on the sampler keeping the maximum
    across every tile that contains the postcode; confirm against
    sample_noise_at_postcodes.

    Args:
        min_e: First tile origin easting (inclusive).
        max_e: Exclusive upper bound for tile origin eastings.
        min_n: First tile origin northing (inclusive).
        max_n: Exclusive upper bound for tile origin northings.
        tile_size: Nominal tile edge length before the overlap is added.
        overlap_m: Extra extent added to each tile's far (max) edges.
        step: Distance between consecutive tile origins.

    Returns:
        ``(min_e, min_n, max_e, max_n)`` tuples, easting-major (all northings
        for the first easting, then the next easting, ...).

    Raises:
        ValueError: If ``step`` or ``tile_size`` is not positive, ``overlap_m``
            is negative, or ``step`` exceeds ``tile_size`` (which would shrink
            the overlap below ``overlap_m`` or leave gaps between tiles).
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")
    if tile_size <= 0:
        raise ValueError(f"tile_size must be positive, got {tile_size}")
    if overlap_m < 0:
        raise ValueError(f"overlap_m must be non-negative, got {overlap_m}")
    if step > tile_size:
        raise ValueError(
            f"step ({step}) must not exceed tile_size ({tile_size}); "
            "neighbouring tiles would overlap by less than overlap_m"
        )

    return [
        (
            tile_min_e,
            tile_min_n,
            min(tile_min_e + tile_size + overlap_m, BNG_MAX_E),
            min(tile_min_n + tile_size + overlap_m, BNG_MAX_N),
        )
        for tile_min_e in range(min_e, max_e, step)
        for tile_min_n in range(min_n, max_n, step)
    ]
|
||||
|
||||
|
||||
def download_raster(
|
||||
tile_dir: Path,
|
||||
wcs_base: str,
|
||||
|
|
@ -296,12 +342,9 @@ def download_raster(
|
|||
allow_missing_tiles: bool = False,
|
||||
) -> list[Path]:
|
||||
"""Download noise GeoTIFF raster covering England, returning paths to saved files."""
|
||||
tiles = []
|
||||
for min_e in range(BNG_MIN_E, BNG_MAX_E, TILE_SIZE):
|
||||
for min_n in range(BNG_MIN_N, BNG_MAX_N, TILE_SIZE):
|
||||
max_e = min(min_e + TILE_SIZE, BNG_MAX_E)
|
||||
max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
|
||||
tiles.append((min_e, min_n, max_e, max_n))
|
||||
tiles = _generate_tiles(
|
||||
BNG_MIN_E, BNG_MAX_E, BNG_MIN_N, BNG_MAX_N, TILE_SIZE, TILE_OVERLAP_M, TILE_SIZE
|
||||
)
|
||||
|
||||
print(
|
||||
f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
|
||||
|
|
@ -385,14 +428,23 @@ def sample_noise_at_postcodes(
|
|||
if len(candidate_indices) == 0:
|
||||
continue
|
||||
|
||||
# Defra rasters encode TRUE nodata as the -96.0 sentinel (and
|
||||
# occasionally non-finite / dataset.nodata); genuinely quiet ground
|
||||
# below the model's lowest reporting band is encoded as 0.0. Only
|
||||
# the former is "we don't know" — the latter is a real "we know it's
|
||||
# quiet" reading and must not collapse to null. So treat ONLY true
|
||||
# nodata as -inf (it never wins a max and never counts as coverage),
|
||||
# and clamp every in-coverage cell up to NOISE_QUIET_FLOOR_DB so a
|
||||
# below-threshold 0.0 surfaces as the documented quiet floor.
|
||||
grid = dataset.read(1).astype(np.float32, copy=False)
|
||||
invalid = ~np.isfinite(grid) | (grid == 0)
|
||||
nodata = ~np.isfinite(grid) | np.isclose(
|
||||
grid, NOISE_NODATA_SENTINEL, rtol=1e-5, atol=1e-5
|
||||
)
|
||||
if dataset.nodata is not None:
|
||||
invalid |= np.isclose(
|
||||
nodata |= np.isclose(
|
||||
grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5
|
||||
)
|
||||
grid = grid.copy()
|
||||
grid[invalid] = -np.inf
|
||||
grid = np.where(nodata, -np.inf, np.maximum(grid, NOISE_QUIET_FLOOR_DB))
|
||||
if filter_size > 1:
|
||||
grid = maximum_filter(
|
||||
grid, size=filter_size, mode="constant", cval=-np.inf
|
||||
|
|
@ -412,12 +464,15 @@ def sample_noise_at_postcodes(
|
|||
|
||||
sampled_indices = candidate_indices[in_bounds]
|
||||
sampled = grid[rows[in_bounds], cols[in_bounds]]
|
||||
valid = sampled != -np.inf
|
||||
if not np.any(valid):
|
||||
# A finite sample means at least one in-coverage cell sat in the
|
||||
# window (quiet -> floor, or louder). -inf means the whole window was
|
||||
# true nodata, so the postcode stays uncovered (null) for this tile.
|
||||
covered = np.isfinite(sampled)
|
||||
if not np.any(covered):
|
||||
continue
|
||||
|
||||
sampled_indices = sampled_indices[valid]
|
||||
sampled = sampled[valid]
|
||||
sampled_indices = sampled_indices[covered]
|
||||
sampled = sampled[covered]
|
||||
existing = noise_db[sampled_indices]
|
||||
noise_db[sampled_indices] = np.where(
|
||||
np.isnan(existing), sampled, np.maximum(existing, sampled)
|
||||
|
|
|
|||
|
|
@ -84,6 +84,38 @@ LONDON_COUNTY_CODES = {"E13000001", "E13000002"}
|
|||
DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000
|
||||
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
||||
|
||||
# England British National Grid (EPSG:27700) bounding box, with margin. ONS NSPL stores
|
||||
# postcodes that have no grid reference at the Null-Island sentinel lat=99.999999,
|
||||
# long=0.000000, whose paired easting/northing collapse to the grid origin (0, 0) (or
|
||||
# inf). Requiring coordinates inside this box drops the sentinel from every index, so an
|
||||
# active postcode lacking a grid ref can never become a false nearest neighbour.
|
||||
ENGLAND_BNG_MIN_EAST = 50_000.0
|
||||
ENGLAND_BNG_MAX_EAST = 660_000.0
|
||||
ENGLAND_BNG_MIN_NORTH = 0.0
|
||||
ENGLAND_BNG_MAX_NORTH = 660_000.0
|
||||
|
||||
|
||||
def _valid_wgs84_expr() -> pl.Expr:
    """Build a row filter keeping only usable WGS84 coordinates.

    A row passes when ``lat`` and ``long`` are both non-null and fall within
    the ENGLAND_BBOX_* bounds. That rejects the ONS lat=99.999999, long=0.0
    no-grid-reference sentinel, so such rows never enter a coordinate index.
    """
    lat = pl.col("lat")
    lon = pl.col("long")
    has_coords = lat.is_not_null() & lon.is_not_null()
    lat_in_box = lat.is_between(ENGLAND_BBOX_SOUTH, ENGLAND_BBOX_NORTH)
    lon_in_box = lon.is_between(ENGLAND_BBOX_WEST, ENGLAND_BBOX_EAST)
    return has_coords & lat_in_box & lon_in_box
|
||||
|
||||
|
||||
def _valid_bng_expr() -> pl.Expr:
    """Build a row filter keeping only usable British National Grid coordinates.

    A row passes when ``east1m`` and ``north1m`` are both non-null and fall
    within the ENGLAND_BNG_* bounds. That rejects the (0, 0) grid origin (or
    inf) paired with the ONS no-grid-reference sentinel.
    """
    east = pl.col("east1m")
    north = pl.col("north1m")
    has_coords = east.is_not_null() & north.is_not_null()
    east_in_box = east.is_between(ENGLAND_BNG_MIN_EAST, ENGLAND_BNG_MAX_EAST)
    north_in_box = north.is_between(ENGLAND_BNG_MIN_NORTH, ENGLAND_BNG_MAX_NORTH)
    return has_coords & east_in_box & north_in_box
|
||||
|
||||
# Suffixes to strip from raw station names before appending the typed suffix.
|
||||
_STATION_STRIP = (
|
||||
" tube station",
|
||||
|
|
@ -303,7 +335,7 @@ def _outcode_tree(postcodes_path: Path) -> tuple[cKDTree, list[str]]:
|
|||
postcodes_path, columns=["pcds", "lat", "long", "ctry25cd", "doterm"]
|
||||
)
|
||||
.filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
|
||||
.filter(pl.col("lat").is_not_null() & pl.col("long").is_not_null())
|
||||
.filter(_valid_wgs84_expr())
|
||||
)
|
||||
coords = np.column_stack(
|
||||
[df["lat"].to_numpy().astype(np.float64), df["long"].to_numpy().astype(np.float64)]
|
||||
|
|
@ -359,12 +391,22 @@ def _build_street_places(
|
|||
return sorted(places, key=lambda place: place["name"].lower())
|
||||
|
||||
|
||||
def _poi_dedup_key(name: str, place_type: str, lat: float, lon: float) -> tuple:
|
||||
"""Geographic de-dup key: round(.,2) is ~1.1km lat / ~0.7km UK lon.
|
||||
|
||||
Coarse enough to collapse the SAME physical POI mapped twice a few metres
|
||||
apart, fine enough to keep genuinely distinct same-named POIs in different
|
||||
towns (e.g. "Victoria Park" in London vs Bristol).
|
||||
"""
|
||||
return (name.lower(), place_type, round(lat, 2), round(lon, 2))
|
||||
|
||||
|
||||
def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
|
||||
"""Map high-value named POIs onto gazetteer place rows (M), de-duplicated by (name, type)."""
|
||||
"""Map high-value named POIs onto gazetteer place rows (M), de-duplicated by (name, type, coords)."""
|
||||
if pois.is_empty():
|
||||
return []
|
||||
|
||||
seen: set[tuple[str, str]] = set()
|
||||
seen: set[tuple] = set()
|
||||
places: list[dict] = []
|
||||
for row in pois.iter_rows(named=True):
|
||||
place_type = HIGH_VALUE_POI_CATEGORIES.get(str(row.get("category", "")))
|
||||
|
|
@ -373,7 +415,9 @@ def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
|
|||
name = str(row.get("name") or "").strip()
|
||||
if len(name) < 3:
|
||||
continue
|
||||
key = (name.lower(), place_type)
|
||||
lat = float(row["lat"])
|
||||
lon = float(row["lng"])
|
||||
key = _poi_dedup_key(name, place_type, lat, lon)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
|
|
@ -381,8 +425,8 @@ def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
|
|||
{
|
||||
"name": name,
|
||||
"place_type": place_type,
|
||||
"lat": float(row["lat"]),
|
||||
"lon": float(row["lng"]),
|
||||
"lat": lat,
|
||||
"lon": lon,
|
||||
"population": 0,
|
||||
"travel_destination": False,
|
||||
"display_city": None,
|
||||
|
|
@ -395,11 +439,16 @@ def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
|
|||
pois = pl.read_parquet(pois_path, columns=["name", "category", "lat", "lng"])
|
||||
new_places = _pois_to_places(pois)
|
||||
existing = {
|
||||
(str(place["name"]).lower(), place["place_type"]) for place in places
|
||||
_poi_dedup_key(
|
||||
str(place["name"]), place["place_type"], place["lat"], place["lon"]
|
||||
)
|
||||
for place in places
|
||||
}
|
||||
added = 0
|
||||
for place in new_places:
|
||||
key = (place["name"].lower(), place["place_type"])
|
||||
key = _poi_dedup_key(
|
||||
place["name"], place["place_type"], place["lat"], place["lon"]
|
||||
)
|
||||
if key in existing:
|
||||
continue
|
||||
places.append(place)
|
||||
|
|
@ -409,10 +458,14 @@ def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
|
|||
|
||||
|
||||
def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
|
||||
df = pl.read_parquet(
|
||||
postcodes_path,
|
||||
columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
|
||||
).filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
|
||||
df = (
|
||||
pl.read_parquet(
|
||||
postcodes_path,
|
||||
columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
|
||||
)
|
||||
.filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
|
||||
.filter(_valid_wgs84_expr())
|
||||
)
|
||||
return {
|
||||
_normalize_postcode(postcode): (float(lat), float(lon))
|
||||
for postcode, lat, lon in df.select(["pcds", "lat", "long"]).iter_rows()
|
||||
|
|
@ -470,7 +523,7 @@ def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
|
|||
.filter(
|
||||
(pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null()
|
||||
)
|
||||
.filter(pl.col("east1m").is_not_null() & pl.col("north1m").is_not_null())
|
||||
.filter(_valid_bng_expr())
|
||||
.with_columns(_is_london_admin_expr().alias("is_london"))
|
||||
.select("east1m", "north1m", "is_london")
|
||||
)
|
||||
|
|
|
|||
75
pipeline/download/test_median_age.py
Normal file
75
pipeline/download/test_median_age.py
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
import math
|
||||
|
||||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from pipeline.download import median_age
|
||||
from pipeline.download.median_age import (
|
||||
AGE_BANDS,
|
||||
EXPECTED_BAND_NAMES,
|
||||
compute_median_age,
|
||||
)
|
||||
|
||||
|
||||
def test_expected_band_names_align_with_age_bands():
    """The label table and AGE_BANDS must have one entry per band each."""
    n_labels = len(EXPECTED_BAND_NAMES)
    n_bands = len(AGE_BANDS)
    assert n_labels == n_bands
|
||||
|
||||
|
||||
def test_compute_median_age_interpolates_within_median_band():
    """The median is linearly interpolated inside the band that holds it."""
    n_bands = len(AGE_BANDS)

    # Entire population in "Aged 30 to 34 years": the median lands on the band
    # midpoint, 30 + ((50 - 0) / 100) * 5 = 32.5.
    only_thirties = [0] * n_bands
    only_thirties[6] = 100
    assert compute_median_age(only_thirties) == pytest.approx(32.5)

    # 50 people in the youngest band and 100 in "Aged 35 to 39 years": half the
    # population is 75 with 50 already counted, so
    # 35 + ((75 - 50) / 100) * 5 = 36.25.
    young_plus_late_thirties = [0] * n_bands
    young_plus_late_thirties[0] = 50
    young_plus_late_thirties[7] = 100
    assert compute_median_age(young_plus_late_thirties) == pytest.approx(36.25)
|
||||
|
||||
|
||||
def test_compute_median_age_empty_lsoa_is_nan():
    """An area with no residents in any band has no median: expect NaN."""
    no_residents = [0] * len(AGE_BANDS)
    median = compute_median_age(no_residents)
    assert math.isnan(median)
|
||||
|
||||
|
||||
def _pivoted(band_to_counts: dict[str, list]) -> pl.DataFrame:
    """Build a pivot-shaped frame: GEOGRAPHY_CODE + one column per band.

    The row count is taken from the first band's list; geography codes are
    synthesised as ``E0100000<row index>``.
    """
    first_band_counts = next(iter(band_to_counts.values()))
    codes = [f"E0100000{i}" for i in range(len(first_band_counts))]
    return pl.DataFrame({"GEOGRAPHY_CODE": codes, **band_to_counts})
|
||||
|
||||
|
||||
def test_null_band_count_is_treated_as_zero_not_crash():
    """A null band count is read as 0 rather than breaking the computation."""
    # Single LSOA: 100 people in the 30-34 band, a null in the 85+ band and 0
    # everywhere else. With the null coerced to 0 the median is the 30-34
    # midpoint, 32.5.
    counts_by_band = {
        **{name: [0] for name in EXPECTED_BAND_NAMES},
        "Aged 30 to 34 years": [100],
        "Aged 85 years and over": [None],
    }

    table = median_age._bands_to_median_table(_pivoted(counts_by_band))

    assert table.height == 1
    assert table["median_age"][0] == pytest.approx(32.5)
|
||||
|
||||
|
||||
def test_missing_band_raises_clear_error():
    """Dropping one canonical band must be rejected, not silently accepted."""
    counts_by_band = {
        name: [10]
        for name in EXPECTED_BAND_NAMES
        if name != "Aged 85 years and over"
    }
    incomplete = _pivoted(counts_by_band)

    with pytest.raises(ValueError, match=r"do not match the expected NOMIS"):
        median_age._bands_to_median_table(incomplete)
|
||||
|
||||
|
||||
def test_relabelled_band_raises_clear_error():
    """A band arriving under an unknown label is reported as unexpected."""
    # Same counts as a valid pivot, but the 85+ column is called "Total".
    counts_by_band = {
        ("Total" if name == "Aged 85 years and over" else name): [10]
        for name in EXPECTED_BAND_NAMES
    }
    relabelled = _pivoted(counts_by_band)

    with pytest.raises(ValueError, match=r"unexpected:"):
        median_age._bands_to_median_table(relabelled)
|
||||
|
|
@ -125,6 +125,205 @@ def test_download_raster_raises_on_missing_strict_tiles(monkeypatch, tmp_path):
|
|||
noise.download_raster(tmp_path, "base", "coverage", "Road")
|
||||
|
||||
|
||||
def test_generate_tiles_neighbours_overlap_by_radius():
    """Every pair of adjacent tiles shares a strip at least one radius wide."""
    tile_size = 20_000
    overlap = noise.POSTCODE_NOISE_RADIUS_M

    # Map each tile's origin corner to its far corner.
    far_corner = {
        (west, south): (east, north)
        for west, south, east, north in noise._generate_tiles(
            0, 60_000, 0, 60_000, tile_size, overlap, tile_size
        )
    }

    # Eastern neighbour: this tile's east edge must pass the neighbour's west edge.
    for (west, south), (east, _north) in far_corner.items():
        neighbour_west = west + tile_size
        if (neighbour_west, south) in far_corner:
            assert east - neighbour_west >= overlap

    # Northern neighbour: same check along the northing axis.
    for (west, south), (_east, north) in far_corner.items():
        neighbour_south = south + tile_size
        if (west, neighbour_south) in far_corner:
            assert north - neighbour_south >= overlap
|
||||
|
||||
|
||||
def test_generate_tiles_clamps_to_grid_extent():
    """Adding the overlap never pushes a tile past BNG_MAX_E / BNG_MAX_N."""
    tile_size = 20_000
    # Start exactly one tile short of the extent on both axes, so the overlap
    # would overshoot the extent if it were not clamped.
    tiles = noise._generate_tiles(
        min_e=noise.BNG_MAX_E - tile_size,
        max_e=noise.BNG_MAX_E,
        min_n=noise.BNG_MAX_N - tile_size,
        max_n=noise.BNG_MAX_N,
        tile_size=tile_size,
        overlap_m=noise.POSTCODE_NOISE_RADIUS_M,
        step=tile_size,
    )

    for tile in tiles:
        east_edge, north_edge = tile[2], tile[3]
        assert east_edge <= noise.BNG_MAX_E
        assert north_edge <= noise.BNG_MAX_N
|
||||
|
||||
|
||||
def _write_geotiff(path, data, left, top, resolution, nodata):
    """Write ``data`` as a single-band EPSG:27700 GeoTIFF at ``path``.

    ``left`` / ``top`` give the raster's top-left corner, ``resolution`` is the
    square pixel size, and ``nodata`` is stored as the dataset's nodata value.
    """
    profile = {
        "driver": "GTiff",
        "height": data.shape[0],
        "width": data.shape[1],
        "count": 1,
        "dtype": data.dtype,
        "crs": "EPSG:27700",
        "transform": from_origin(left, top, resolution, resolution),
        "nodata": nodata,
    }
    with rasterio.open(path, "w", **profile) as dataset:
        dataset.write(data, 1)
|
||||
|
||||
|
||||
def test_sample_noise_recovers_value_across_overlapping_seam(monkeypatch, tmp_path):
    """A loud cell just past a tile seam is still seen by a nearby postcode."""
    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 50)
    monkeypatch.setattr(noise, "RESOLUTION", 10)

    # Two tiles meet at easting=100. Their real footprints come from
    # _generate_tiles, so the left tile only reaches the loud cell (which sits
    # just past the seam) because of the overlap.
    tile_size = 100
    overlap = noise.POSTCODE_NOISE_RADIUS_M
    far_corner = {
        (west, south): (east, north)
        for west, south, east, north in noise._generate_tiles(
            0, 200, 0, 100, tile_size, overlap, tile_size
        )
    }
    res = noise.RESOLUTION

    def write_tile(filename, west, south, loud_easting):
        # All-zero raster for the tile at (west, south) with a single 70 dB
        # cell at (loud_easting, northing ~25); 0 is the file's nodata value.
        east, north = far_corner[(west, south)]
        n_cols = int((east - west) // res)
        n_rows = int((north - south) // res)
        cells = np.zeros((n_rows, n_cols), dtype=np.float32)
        loud_row = n_rows - 1 - int((25 - south) // res)
        loud_col = int((loud_easting - west) // res)
        cells[loud_row, loud_col] = 70.0
        _write_geotiff(tmp_path / filename, cells, west, north, res, nodata=0)

    # The overlap is what makes the left tile reach across the seam.
    assert far_corner[(0, 0)][0] > 100

    # Left tile: loud cell around easting 105, i.e. just across the seam.
    write_tile("left.tif", 0, 0, 105)
    # Right tile: the same loud cell falls in its first column.
    write_tile("right.tif", 100, 0, 105)

    # The postcode sits at easting 75 in the left tile, within 50m of the
    # loud cell.
    result = noise.sample_noise_at_postcodes(
        [tmp_path / "left.tif", tmp_path / "right.tif"],
        easting=np.array([75.0]),
        northing=np.array([25.0]),
        label="Road",
        col_name="road_noise_lden_db",
    )

    assert result.to_list() == [70.0]
|
||||
|
||||
|
||||
def test_sample_noise_distinguishes_nodata_from_in_coverage_quiet(
    monkeypatch, tmp_path
):
    """The -96.0 sentinel stays null while an in-coverage 0.0 becomes the floor."""
    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 0)
    monkeypatch.setattr(noise, "RESOLUTION", 10)

    # A 0m radius means each postcode reads a single cell, so every cell's
    # handling can be pinned individually:
    #   -96.0 (nodata sentinel) -> null
    #    0.0  (in coverage)     -> NOISE_QUIET_FLOOR_DB
    #   65.0  (modelled value)  -> 65.0
    raster_path = tmp_path / "noise.tif"
    one_row = np.array([[-96.0, 0.0, 65.0]], dtype=np.float32)
    _write_geotiff(raster_path, one_row, 0, 10, 10, nodata=-96.0)

    # Sample the three cell centres: easting 5, 15 and 25.
    result = noise.sample_noise_at_postcodes(
        [raster_path],
        easting=np.array([5.0, 15.0, 25.0]),
        northing=np.array([5.0, 5.0, 5.0]),
        label="Road",
        col_name="road_noise_lden_db",
    )

    quiet_floor = float(noise.NOISE_QUIET_FLOOR_DB)
    assert result.to_list() == [None, quiet_floor, 65.0]
|
||||
|
||||
|
||||
def test_sample_noise_preserves_genuine_reading_above_quiet_floor(monkeypatch, tmp_path):
    """Readings at or above the quiet floor pass through; only 0.0 is raised."""
    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 0)
    monkeypatch.setattr(noise, "RESOLUTION", 10)

    # A floor set too high would inflate real low readings. Pin that a 42 dB
    # cell and a cell exactly at the floor are returned unchanged, while a 0.0
    # cell is lifted to the floor.
    floor = float(noise.NOISE_QUIET_FLOOR_DB)
    raster_path = tmp_path / "noise.tif"
    one_row = np.array([[42.0, floor, 0.0]], dtype=np.float32)
    _write_geotiff(raster_path, one_row, 0, 10, 10, nodata=-96.0)

    # Cell centres: easting 5 (42 dB), 15 (floor), 25 (0.0).
    result = noise.sample_noise_at_postcodes(
        [raster_path],
        easting=np.array([5.0, 15.0, 25.0]),
        northing=np.array([5.0, 5.0, 5.0]),
        label="Road",
        col_name="road_noise_lden_db",
    )

    assert result.to_list() == [42.0, floor, floor]
    # The 42 dB expectation only means "unchanged" if the floor is not above it.
    assert floor <= 42.0
|
||||
|
||||
|
||||
def test_sample_noise_nodata_window_stays_null(monkeypatch, tmp_path):
    """A window holding only the nodata sentinel yields null, not 'quiet'."""
    monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
    monkeypatch.setattr(noise, "RESOLUTION", 10)

    # 5x5 raster of -96.0 with one loud cell in the opposite corner from the
    # sampled point, so only sentinel cells lie near that point.
    raster_path = tmp_path / "noise.tif"
    cells = np.full((5, 5), -96.0, dtype=np.float32)
    cells[4, 4] = 70.0
    _write_geotiff(raster_path, cells, 0, 50, 10, nodata=-96.0)

    # Sample the top-left cell centre.
    result = noise.sample_noise_at_postcodes(
        [raster_path],
        easting=np.array([5.0]),
        northing=np.array([45.0]),
        label="Road",
        col_name="road_noise_lden_db",
    )

    assert result.to_list() == [None]
|
||||
|
||||
|
||||
def test_sample_noise_at_postcodes_uses_local_maximum(monkeypatch, tmp_path):
|
||||
monkeypatch.setattr(noise, "POSTCODE_NOISE_RADIUS_M", 15)
|
||||
monkeypatch.setattr(noise, "RESOLUTION", 10)
|
||||
|
|
|
|||
|
|
@ -9,10 +9,12 @@ from pipeline.download.places import (
|
|||
_display_city_from_tags,
|
||||
_is_dlr_station,
|
||||
_is_tram_station,
|
||||
_london_postcode_tree,
|
||||
_naptan_dlr_stations,
|
||||
_normalize_street_name,
|
||||
_ofs_universities,
|
||||
_outcode_of_postcode,
|
||||
_outcode_tree,
|
||||
_pois_to_places,
|
||||
_select_university_name,
|
||||
_station_display_name,
|
||||
|
|
@ -242,6 +244,42 @@ def test_pois_to_places_keeps_high_value_named_pois_only():
|
|||
assert all(place["travel_destination"] is False for place in places)
|
||||
|
||||
|
||||
def test_pois_to_places_keeps_distinct_same_named_pois():
    """Same-named POIs that are far apart must both survive conversion."""
    # Two genuinely distinct parks sharing a name: one in London, one in Bristol.
    london = (51.54, -0.04)
    bristol = (51.46, -2.60)
    pois = pl.DataFrame(
        {
            "name": ["Victoria Park"] * 2,
            "category": ["leisure/park"] * 2,
            "lat": [london[0], bristol[0]],
            "lng": [london[1], bristol[1]],
        }
    )

    places = _pois_to_places(pois)

    assert len(places) == 2
    kept_coords = {(entry["lat"], entry["lon"]) for entry in places}
    assert kept_coords == {london, bristol}
|
||||
|
||||
|
||||
def test_pois_to_places_still_dedupes_colocated():
    """Same-named POIs a few metres apart must collapse to a single place."""
    # The same physical park mapped twice, differing only in the 4th decimal.
    latitudes = [51.5400, 51.5401]
    longitudes = [-0.0400, -0.0399]
    pois = pl.DataFrame(
        {
            "name": ["Victoria Park"] * 2,
            "category": ["leisure/park"] * 2,
            "lat": latitudes,
            "lng": longitudes,
        }
    )

    deduped = _pois_to_places(pois)

    assert len(deduped) == 1
|
||||
|
||||
|
||||
def test_display_city_from_tags_uses_explicit_london_context():
    """An ``is_in`` tag yields "London" only when it explicitly names London."""
    london_city = _display_city_from_tags({"is_in": "Croydon, London, UK"})
    assert london_city == "London"

    # Same place name, but the tag puts it in Cambridgeshire: no display city.
    other_city = _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"})
    assert other_city is None
|
||||
|
|
@ -290,3 +328,52 @@ def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path)
|
|||
|
||||
assert assigned == 2
|
||||
assert [place["display_city"] for place in places] == ["London", "London", None]
|
||||
|
||||
|
||||
def test_no_grid_reference_sentinel_is_excluded_from_coordinate_trees(tmp_path):
    """The no-grid-reference sentinel postcode must not enter either tree.

    ONS NSPL stores postcodes with no grid reference at the sentinel
    lat=99.999999, long=0.0; in this fixture its BNG coordinates are the
    (0, 0) origin. Such an active postcode must never enter the
    nearest-neighbour indexes.
    """
    croydon_lat, croydon_long = 51.371273, -0.101793
    croydon_easting, croydon_northing = WGS84_TO_BNG.transform(
        croydon_long, croydon_lat
    )

    def _postcode_row(pcds, lat, long, east1m, north1m):
        # Both rows are active (no termination date) and share admin codes;
        # key order matches the column order written to the parquet file.
        return {
            "pcds": pcds,
            "lat": lat,
            "long": long,
            "doterm": None,
            "ctry25cd": "E92000001",
            "east1m": east1m,
            "north1m": north1m,
            "rgn25cd": "E12000007",
            "lad25cd": "E09000008",
            "cty25cd": "E13000002",
        }

    sentinel_row = _postcode_row("ZZ99 9ZZ", 99.999999, 0.0, 0, 0)
    real_row = _postcode_row(
        "CR0 1SZ",
        croydon_lat,
        croydon_long,
        int(round(croydon_easting)),
        int(round(croydon_northing)),
    )

    parquet_path = tmp_path / "postcodes.parquet"
    pl.DataFrame([sentinel_row, real_row]).write_parquet(parquet_path)

    # lat/long outcode tree: only the real postcode survives, so a London-area
    # query cannot be tagged with the sentinel's outcode.
    latlong_tree, outcode_labels = _outcode_tree(parquet_path)
    assert latlong_tree.n == 1
    assert outcode_labels == ["CR0"]
    _, nearest = latlong_tree.query([[croydon_lat, croydon_long]])
    assert outcode_labels[nearest[0]] == "CR0"

    # BNG London tree: only the real postcode survives, so the (0, 0) origin
    # can never be the nearest neighbour of a real place.
    easting_northing_tree, is_london = _london_postcode_tree(parquet_path)
    assert easting_northing_tree.n == 1
    assert is_london.tolist() == [True]
    _, nearest_bng = easting_northing_tree.query(
        [[croydon_easting, croydon_northing]]
    )
    assert nearest_bng[0] == 0
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ from pathlib import Path
|
|||
import polars as pl
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.utils import download, extract_zip
|
||||
from pipeline.utils import code_col_overrides, download, extract_zip
|
||||
|
||||
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
|
||||
|
||||
|
|
@ -34,16 +34,16 @@ def find_csvs(extract_path: Path) -> list[Path]:
|
|||
|
||||
def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
|
||||
# Some regional files infer different types for the same column (e.g.
|
||||
# ruc21ind is String in most but Int64 in YH). Read all code columns as
|
||||
# String to avoid schema mismatches.
|
||||
CODE_COLS = {
|
||||
"ruc21ind": pl.String,
|
||||
"oac21ind": pl.String,
|
||||
"imd19ind": pl.String,
|
||||
}
|
||||
# ruc21ind is String in most but Int64 in YH), and string codes like "UN1"
|
||||
# appear deep in the data. Read all classification-index code columns as
|
||||
# String to avoid schema mismatches. NSUL renames the year suffixes each
|
||||
# release and polars silently ignores overrides for missing columns, so
|
||||
# match on the suffix-free stem (from the header) rather than hard-coding.
|
||||
names = pl.scan_csv(csv_paths[0]).collect_schema().names()
|
||||
code_cols = code_col_overrides(names)
|
||||
df = pl.concat(
|
||||
[
|
||||
pl.scan_csv(p, try_parse_dates=True, schema_overrides=CODE_COLS)
|
||||
pl.scan_csv(p, try_parse_dates=True, schema_overrides=code_cols)
|
||||
for p in csv_paths
|
||||
]
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue