90 lines
3.4 KiB
Python
90 lines
3.4 KiB
Python
import math
|
|
|
|
import polars as pl
|
|
import pytest
|
|
|
|
from pipeline.download import median_age
|
|
from pipeline.download.median_age import (
|
|
AGE_BANDS,
|
|
EXPECTED_BAND_NAMES,
|
|
compute_median_age,
|
|
)
|
|
|
|
|
|
def test_expected_band_names_align_with_age_bands():
    """EXPECTED_BAND_NAMES and AGE_BANDS must contain the same number of entries."""
    name_total = len(EXPECTED_BAND_NAMES)
    band_total = len(AGE_BANDS)
    assert name_total == band_total
|
|
|
|
|
|
def test_compute_median_age_interpolates_within_median_band():
    """Check compute_median_age against two hand-computed interpolated medians."""

    def counts_with(populated: dict[int, int]) -> list[int]:
        # Start from an all-zero vector (one slot per age band) and fill in
        # only the bands named by index.
        band_counts = [0] * len(AGE_BANDS)
        for band_index, people in populated.items():
            band_counts[band_index] = people
        return band_counts

    # Scenario 1: all weight in band 6 ("Aged 30 to 34 years"), so the median
    # is the band midpoint via linear interpolation:
    # 30 + ((50 - 0) / 100) * 5 = 32.5.
    only_30_to_34 = counts_with({6: 100})
    assert compute_median_age(only_30_to_34) == pytest.approx(32.5)

    # Scenario 2: 50 people below the median band and 100 inside band 7
    # ("Aged 35 to 39 years"), which holds the median.
    # half = 75; cumulative before band 7 = 50;
    # 35 + ((75 - 50) / 100) * 5 = 36.25.
    below_and_35_to_39 = counts_with({0: 50, 7: 100})
    assert compute_median_age(below_and_35_to_39) == pytest.approx(36.25)
|
|
|
|
|
|
def test_compute_median_age_empty_lsoa_is_nan():
    """An all-zero count vector (an empty LSOA) must produce a NaN median."""
    nobody = [0] * len(AGE_BANDS)
    median = compute_median_age(nobody)
    assert math.isnan(median)
|
|
|
|
|
|
def _pivoted(band_to_counts: dict[str, list]) -> pl.DataFrame:
    """Return a pivot-shaped frame: GEOGRAPHY_CODE plus one column per band.

    The row count is taken from the length of the first list in
    ``band_to_counts``; each row gets a synthetic code ``E0100000<row index>``.
    """
    first_column = next(iter(band_to_counts.values()))
    codes = [f"E0100000{row}" for row in range(len(first_column))]
    return pl.DataFrame({"GEOGRAPHY_CODE": codes, **band_to_counts})
|
|
|
|
|
|
def test_null_band_count_is_treated_as_zero_not_crash():
    """A null band count must be handled as 0 rather than breaking the median.

    NOMIS can return null for a band with zero people. Here one LSOA has a
    null in the 85+ band; it must be coerced to 0, not raise TypeError in
    sum(). With all 100 people in the 30-34 band the median is the band
    midpoint, 32.5.
    """
    columns: dict[str, list] = {}
    for band_name in EXPECTED_BAND_NAMES:
        columns[band_name] = [0]
    columns.update(
        {
            "Aged 30 to 34 years": [100],
            "Aged 85 years and over": [None],
        }
    )

    result = median_age._bands_to_median_table(_pivoted(columns))

    assert result.height == 1
    assert result["median_age"][0] == pytest.approx(32.5)
|
|
|
|
|
|
def test_equivalent_band_label_alias_is_accepted():
    """The alias label for the first band must be accepted, not rejected.

    NOMIS relabelled the first band "Aged 4 years and under" (same as ages
    0-4). It must be normalised to the canonical name and used as band 0.
    With all 100 people in that band the median falls in the 0-4 range: 2.5.
    """
    columns = {band_name: [0] for band_name in EXPECTED_BAND_NAMES}
    # Swap the canonical first-band label for its alias; the alias column
    # carries the whole population.
    del columns["Aged 0 to 4 years"]
    columns["Aged 4 years and under"] = [100]

    result = median_age._bands_to_median_table(_pivoted(columns))

    assert result.height == 1
    assert result["median_age"][0] == pytest.approx(2.5)
|
|
|
|
|
|
def test_missing_band_raises_clear_error():
    """Dropping the 85+ band column must raise a ValueError naming the mismatch."""
    columns = {band_name: [10] for band_name in EXPECTED_BAND_NAMES}
    columns.pop("Aged 85 years and over")
    incomplete = _pivoted(columns)

    expected_message = r"do not match the expected NOMIS"
    with pytest.raises(ValueError, match=expected_message):
        median_age._bands_to_median_table(incomplete)
|
|
|
|
|
|
def test_relabelled_band_raises_clear_error():
    """Renaming the 85+ band column to "Total" must raise a ValueError.

    The error message is expected to match ``unexpected:``.
    """
    columns = {band_name: [10] for band_name in EXPECTED_BAND_NAMES}
    oldest_band_counts = columns.pop("Aged 85 years and over")
    columns["Total"] = oldest_band_counts
    relabelled = _pivoted(columns)

    expected_message = r"unexpected:"
    with pytest.raises(ValueError, match=expected_message):
        median_age._bands_to_median_table(relabelled)
|