# perfect-postcode/pipeline/download/test_median_age.py
# 2026-06-04 20:40:42 +01:00
#
# 90 lines
# 3.4 KiB
# Python

import math
import polars as pl
import pytest
from pipeline.download import median_age
from pipeline.download.median_age import (
AGE_BANDS,
EXPECTED_BAND_NAMES,
compute_median_age,
)
def test_expected_band_names_align_with_age_bands():
    """EXPECTED_BAND_NAMES and AGE_BANDS must contain the same number of entries."""
    assert len(EXPECTED_BAND_NAMES) == len(AGE_BANDS)
def test_compute_median_age_interpolates_within_median_band():
    """The median is linearly interpolated inside the band that holds it."""
    # Scenario 1: all weight in the 30-34 band -> median is the band midpoint
    # via linear interpolation: 30 + ((50 - 0) / 100) * 5 = 32.5.
    single_band = [0] * len(AGE_BANDS)
    single_band[6] = 100  # "Aged 30 to 34 years"
    assert compute_median_age(single_band) == pytest.approx(32.5)

    # Scenario 2: 50 below the median band, 100 inside the 35-39 band holding
    # the median. half = 75; cumulative before band 7 = 50;
    # 35 + ((75 - 50) / 100) * 5 = 36.25.
    two_bands = [0] * len(AGE_BANDS)
    two_bands[0] = 50  # below the median band
    two_bands[7] = 100  # "Aged 35 to 39 years" holds the median
    assert compute_median_age(two_bands) == pytest.approx(36.25)
def test_compute_median_age_empty_lsoa_is_nan():
    """An all-zero count vector must yield NaN rather than a numeric median."""
    empty_counts = [0] * len(AGE_BANDS)
    assert math.isnan(compute_median_age(empty_counts))
def _pivoted(band_to_counts: dict[str, list]) -> pl.DataFrame:
    """Build a pivot-shaped frame: GEOGRAPHY_CODE + one column per band.

    Args:
        band_to_counts: Mapping of band column name to its per-row counts.
            The row count is taken from the first value in the mapping.

    Returns:
        A DataFrame whose first column is ``GEOGRAPHY_CODE`` (synthetic codes
        ``E0100000<i>``, one per row) followed by the band columns in the
        mapping's iteration order.
    """
    row_count = len(next(iter(band_to_counts.values())))
    columns = {
        "GEOGRAPHY_CODE": [f"E0100000{i}" for i in range(row_count)],
        **band_to_counts,
    }
    return pl.DataFrame(columns)
def test_null_band_count_is_treated_as_zero_not_crash():
    """A null band count is coerced to zero instead of raising."""
    # One LSOA has a null in the 85+ band (NOMIS can return null for a band with
    # zero people). It must be coerced to 0, not raise TypeError in sum(). With
    # all 100 people in the 30-34 band the median is the band midpoint, 32.5.
    counts_by_band = {name: [0] for name in EXPECTED_BAND_NAMES}
    counts_by_band["Aged 30 to 34 years"] = [100]
    counts_by_band["Aged 85 years and over"] = [None]

    table = median_age._bands_to_median_table(_pivoted(counts_by_band))

    assert table.height == 1
    assert table["median_age"][0] == pytest.approx(32.5)
def test_equivalent_band_label_alias_is_accepted():
    """The "Aged 4 years and under" alias is accepted as the first band."""
    # NOMIS relabelled the first band "Aged 4 years and under" (same as ages
    # 0-4). It must be normalised to the canonical name and used as band 0, not
    # rejected. All 100 people in that band -> median in the 0-4 range: 2.5.
    counts_by_band = {name: [0] for name in EXPECTED_BAND_NAMES}
    # Swap the canonical column for its alias. The alias key is inserted after
    # the deletion, so it ends up as the last column of the pivoted frame.
    del counts_by_band["Aged 0 to 4 years"]
    counts_by_band["Aged 4 years and under"] = [100]

    table = median_age._bands_to_median_table(_pivoted(counts_by_band))

    assert table.height == 1
    assert table["median_age"][0] == pytest.approx(2.5)
def test_missing_band_raises_clear_error():
    """Dropping an expected band column raises ValueError naming the mismatch."""
    counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES}
    del counts_by_band["Aged 85 years and over"]
    pivoted = _pivoted(counts_by_band)

    with pytest.raises(ValueError, match=r"do not match the expected NOMIS"):
        median_age._bands_to_median_table(pivoted)
def test_relabelled_band_raises_clear_error():
    """An unrecognised band label raises ValueError reporting the unexpected column."""
    counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES}
    # Rename the last band to a label that is not a known band or alias.
    counts_by_band["Total"] = counts_by_band.pop("Aged 85 years and over")
    pivoted = _pivoted(counts_by_band)

    with pytest.raises(ValueError, match=r"unexpected:"):
        median_age._bands_to_median_table(pivoted)