perfect-postcode/pipeline/download/test_median_age.py

import math

import polars as pl
import pytest

from pipeline.download import median_age
from pipeline.download.median_age import (
    AGE_BANDS,
    EXPECTED_BAND_NAMES,
    compute_median_age,
)


def test_expected_band_names_align_with_age_bands():
    """The band-name list and the band table must have the same length."""
    name_count = len(EXPECTED_BAND_NAMES)
    band_count = len(AGE_BANDS)
    assert name_count == band_count


def test_compute_median_age_interpolates_within_median_band():
    """Check linear interpolation inside the band that holds the median."""
    band_total = len(AGE_BANDS)

    # Scenario 1: every person is in the 30-34 band, so the median lands on the
    # band midpoint: 30 + ((50 - 0) / 100) * 5 = 32.5.
    single_band = [0] * band_total
    single_band[6] = 100  # "Aged 30 to 34 years"
    median_single = compute_median_age(single_band)
    assert median_single == pytest.approx(32.5)

    # Scenario 2: 50 people sit below the median band and 100 are inside the
    # 35-39 band. half = 75, cumulative before band 7 = 50, so the result is
    # 35 + ((75 - 50) / 100) * 5 = 36.25.
    two_bands = [0] * band_total
    two_bands[0] = 50  # below the median band
    two_bands[7] = 100  # "Aged 35 to 39 years" holds the median
    median_split = compute_median_age(two_bands)
    assert median_split == pytest.approx(36.25)


def test_compute_median_age_empty_lsoa_is_nan():
    """An all-zero set of band counts must yield NaN rather than a number."""
    all_zero_counts = [0] * len(AGE_BANDS)
    result = compute_median_age(all_zero_counts)
    assert math.isnan(result)


def _pivoted(band_to_counts: dict[str, list]) -> pl.DataFrame:
    """Return a pivot-shaped frame: GEOGRAPHY_CODE plus one column per band.

    The row count is taken from the first band's list, and each row gets a
    synthetic geography code built from its index.
    """
    first_column = next(iter(band_to_counts.values()))
    geography_codes = [f"E0100000{row}" for row in range(len(first_column))]
    return pl.DataFrame({"GEOGRAPHY_CODE": geography_codes, **band_to_counts})


def test_null_band_count_is_treated_as_zero_not_crash():
    """A null band count must be read as zero instead of breaking the sum."""
    # NOMIS can return null for a band with zero people; here the 85+ band of
    # the single LSOA is null. It must be coerced to 0, not raise TypeError in
    # sum(). With all 100 people in the 30-34 band, the median is that band's
    # midpoint, 32.5.
    baseline = {name: [0] for name in EXPECTED_BAND_NAMES}
    overrides = {
        "Aged 30 to 34 years": [100],
        "Aged 85 years and over": [None],
    }
    frame = _pivoted(baseline | overrides)

    result = median_age._bands_to_median_table(frame)

    assert result.height == 1
    first_median = result["median_age"][0]
    assert first_median == pytest.approx(32.5)


def test_equivalent_band_label_alias_is_accepted():
    """The alias label for the first band must be accepted, not rejected."""
    # NOMIS relabelled the first band "Aged 4 years and under" (same as ages
    # 0-4). It must be normalised to the canonical name and used as band 0.
    # With all 100 people in that band the median falls in the 0-4 range: 2.5.
    counts_by_band = {name: [0] for name in EXPECTED_BAND_NAMES}
    # Swap the canonical label for its alias, carrying the whole population.
    del counts_by_band["Aged 0 to 4 years"]
    counts_by_band["Aged 4 years and under"] = [100]
    frame = _pivoted(counts_by_band)

    result = median_age._bands_to_median_table(frame)

    assert result.height == 1
    first_median = result["median_age"][0]
    assert first_median == pytest.approx(2.5)


def test_missing_band_raises_clear_error():
    """Dropping one expected band must raise a ValueError with a clear message."""
    incomplete = {name: [10] for name in EXPECTED_BAND_NAMES}
    incomplete.pop("Aged 85 years and over")
    frame = _pivoted(incomplete)

    expected_message = r"do not match the expected NOMIS"
    with pytest.raises(ValueError, match=expected_message):
        median_age._bands_to_median_table(frame)


def test_relabelled_band_raises_clear_error():
    """A band renamed to an unknown label must be reported as unexpected."""
    relabelled = {name: [10] for name in EXPECTED_BAND_NAMES}
    # Move the 85+ counts under a label the loader does not recognise.
    oldest_band_counts = relabelled.pop("Aged 85 years and over")
    relabelled["Total"] = oldest_band_counts
    frame = _pivoted(relabelled)

    expected_message = r"unexpected:"
    with pytest.raises(ValueError, match=expected_message):
        median_age._bands_to_median_table(frame)