don't crash

This commit is contained in:
Andras Schmelczer 2026-06-04 20:40:42 +01:00
parent aab85fe32e
commit d6d20ccd37
13 changed files with 2630 additions and 3924 deletions

View file

@ -72,6 +72,18 @@ assert len(EXPECTED_BAND_NAMES) == len(AGE_BANDS), (
"EXPECTED_BAND_NAMES and AGE_BANDS must stay aligned 1:1"
)
# NOMIS occasionally publishes a band under a differently worded label that
# still covers the SAME age range (e.g. "Aged 4 years and under" for ages 0-4).
# Known-equivalent labels are translated back to their canonical name BEFORE
# validation: a genuine band change still fails loudly, while a purely cosmetic
# relabel of an identical range does not block the build.
# NOTE(review): an "Aged 90 years and over" wording for the top band was also
# mentioned here previously, but no alias is registered for it — only add one
# after confirming it denotes exactly the same range as the canonical band.
BAND_NAME_ALIASES = {"Aged 4 years and under": "Aged 0 to 4 years"}
# Every alias target must itself be a canonical band name; otherwise the
# normalised column would fail validation anyway.
assert set(EXPECTED_BAND_NAMES).issuperset(BAND_NAME_ALIASES.values()), (
    "BAND_NAME_ALIASES must map to canonical EXPECTED_BAND_NAMES"
)
def compute_median_age(counts: list[int]) -> float:
"""Compute median age from five-year band counts using linear interpolation."""
@ -98,6 +110,15 @@ def _bands_to_median_table(pivoted: pl.DataFrame) -> pl.DataFrame:
missing/extra/relabelled band would otherwise silently mis-align counts
against the wrong AGE_BANDS lower bound, so we fail loudly instead.
"""
# Normalise known-equivalent NOMIS label variants to their canonical name
# before validating (renaming onto an already-present canonical column would
# collide, so polars raises loudly in that genuinely ambiguous case).
rename_map = {
c: BAND_NAME_ALIASES[c] for c in pivoted.columns if c in BAND_NAME_ALIASES
}
if rename_map:
pivoted = pivoted.rename(rename_map)
# Validate the pivoted age-band columns against the canonical NOMIS set
# BEFORE computing anything.
band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]

View file

@ -266,8 +266,12 @@ def _download_tile(
except (
NoGeoTiffError,
httpx.HTTPStatusError,
httpx.TimeoutException,
httpx.ConnectError,
# TransportError is the superset of TimeoutException, ConnectError,
# ReadError and ProtocolError — including RemoteProtocolError, raised
# when the WCS server closes the connection mid-stream ("incomplete
# chunked read"). All are transient; retry/split rather than letting
# one flaky tile crash the whole raster download.
httpx.TransportError,
) as e:
last_error = e
if attempt < MAX_RETRIES:

View file

@ -57,6 +57,21 @@ def test_null_band_count_is_treated_as_zero_not_crash():
assert table["median_age"][0] == pytest.approx(32.5)
def test_equivalent_band_label_alias_is_accepted():
    """An aliased first-band label is normalised and counted, not rejected."""
    # NOMIS relabelled the first band "Aged 4 years and under" (same as ages
    # 0-4). It must be normalised to the canonical name and used as band 0, not
    # rejected. All 100 people in that band -> median in the 0-4 range: 2.5.
    counts_by_band = {name: [0] for name in EXPECTED_BAND_NAMES}
    # Swap the canonical key for its alias. The old value is discarded because
    # the alias band gets its own count; the alias key ends up last in the dict.
    counts_by_band.pop("Aged 0 to 4 years")
    counts_by_band["Aged 4 years and under"] = [100]
    pivoted = _pivoted(counts_by_band)

    table = median_age._bands_to_median_table(pivoted)

    assert table.height == 1
    assert table["median_age"][0] == pytest.approx(2.5)
def test_missing_band_raises_clear_error():
counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES}
del counts_by_band["Aged 85 years and over"]