# perfect-postcode/pipeline/download/test_ethnicity.py

import polars as pl
import pytest

from pipeline.download.ethnicity import GROUP_MAP, OUTPUT_GROUPS, _ethnicity_percentages


def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
    """Build NOMIS-shaped long rows for one LSOA from {leaf_label: count}.

    Every one of the 19 leaf categories must be present in the download (NOMIS
    emits a 0-count row when an LSOA has none), so categories not given default
    to 0 to mirror that.

    Args:
        geo: Value written to ``GEOGRAPHY_CODE`` on every row.
        counts: Mapping of leaf label to the ``OBS_VALUE`` for that label.
            Every key must be a label in ``GROUP_MAP``.

    Returns:
        One row dict per label in ``GROUP_MAP``, in ``GROUP_MAP`` iteration
        order, with keys ``GEOGRAPHY_CODE``, ``C2021_ETH_20_NAME`` and
        ``OBS_VALUE``.

    Raises:
        ValueError: If ``counts`` names a label that is not in ``GROUP_MAP``.
    """
    # Rows are generated from GROUP_MAP alone, so a misspelt label in `counts`
    # would never be looked up and its count would silently become 0. Fail
    # loudly instead so a typo cannot produce a misleading fixture.
    unknown = [label for label in counts if label not in GROUP_MAP]
    if unknown:
        raise ValueError(f"labels not in GROUP_MAP: {unknown}")

    return [
        {
            "GEOGRAPHY_CODE": geo,
            "C2021_ETH_20_NAME": label,
            "OBS_VALUE": counts.get(label, 0),
        }
        for label in GROUP_MAP
    ]


def test_ethnicity_percentages_keyed_by_lsoa_with_seven_buckets():
    """Leaf counts for one LSOA collapse into percentage columns keyed by ``lsoa21``."""
    lsoa = "E01000001"
    leaf_counts = {
        "White: English, Welsh, Scottish, Northern Irish or British": 60,
        "White: Other White": 10,
        "Asian, Asian British or Asian Welsh: Indian": 20,
        "Black, Black British, Black Welsh, Caribbean or African: African": 10,
    }
    df = pl.DataFrame(_long_rows(lsoa, leaf_counts))

    result = _ethnicity_percentages(df)

    # The LSOA key comes first; the remaining columns are one "% <group>" per
    # output group and nothing else.
    pct_columns = [f"% {group}" for group in OUTPUT_GROUPS]
    assert result.columns[0] == "lsoa21"
    assert set(result.columns) == {"lsoa21", *pct_columns}

    matching = result.filter(pl.col("lsoa21") == lsoa).to_dicts()
    row = matching[0]
    # Both White leaves (60 + 10) share a bucket; total population is 100.
    assert row["% White"] == 70.0
    assert row["% South Asian"] == 20.0
    assert row["% Black"] == 10.0

    # Percentages always sum to exactly 100 (largest-remainder rounding).
    total = sum(row[column] for column in pct_columns)
    assert round(total, 1) == 100.0


def test_ethnicity_routes_chinese_to_east_and_other_asian_to_se():
    """'Chinese' counts towards '% East Asian' and 'Other Asian' towards
    '% SE Asian'; neither leaks into '% South Asian', and no combined
    '% East/SE Asian' column is produced."""
    lsoa = "E01000002"
    leaf_counts = {
        "Asian, Asian British or Asian Welsh: Chinese": 30,
        "Asian, Asian British or Asian Welsh: Other Asian": 20,
        "Asian, Asian British or Asian Welsh: Indian": 50,
    }

    result = _ethnicity_percentages(pl.DataFrame(_long_rows(lsoa, leaf_counts)))
    area = result.filter(pl.col("lsoa21") == lsoa)

    # The two Asian sub-buckets exist separately rather than as one merged column.
    columns = result.columns
    assert "% East Asian" in columns
    assert "% SE Asian" in columns
    assert "% East/SE Asian" not in columns

    expected = {"% East Asian": 30.0, "% SE Asian": 20.0, "% South Asian": 50.0}
    observed = area.select("% East Asian", "% SE Asian", "% South Asian").to_dicts()
    assert observed == [expected]


def test_ethnicity_percentages_independent_per_lsoa():
    """Each LSOA's profile is computed from its own rows only."""
    # One LSOA is entirely "Other White", the other entirely "Pakistani".
    single_leaf_areas = (
        ("E01000010", "White: Other White"),
        ("E01000011", "Asian, Asian British or Asian Welsh: Pakistani"),
    )
    frames = [
        pl.DataFrame(_long_rows(geo, {label: 100}))
        for geo, label in single_leaf_areas
    ]

    combined = pl.concat(frames)
    # Sort so the per-column lists below line up with the LSOA codes in order.
    result = _ethnicity_percentages(combined).sort("lsoa21")

    white = result["% White"].to_list()
    south_asian = result["% South Asian"].to_list()
    assert white == [100.0, 0.0]
    assert south_asian == [0.0, 100.0]


def test_ethnicity_percentages_rejects_unexpected_category():
    """A category label outside the expected set makes the computation raise."""
    geo = "E01000003"
    # An extra row whose label is not one of the generated leaf categories.
    stray_row = {
        "GEOGRAPHY_CODE": geo,
        "C2021_ETH_20_NAME": "White: A Brand New Census Category",
        "OBS_VALUE": 5,
    }
    rows = [*_long_rows(geo, {"White: Other White": 10}), stray_row]

    with pytest.raises(ValueError, match="do not match the expected"):
        _ethnicity_percentages(pl.DataFrame(rows))


def test_ethnicity_percentages_rejects_missing_category():
    """A download lacking one leaf category entirely makes the computation raise."""
    # Drop one leaf entirely: its people would vanish from the denominator.
    dropped_label = "Other ethnic group: Arab"
    rows = []
    for row in _long_rows("E01000004", {"White: Other White": 10}):
        if row["C2021_ETH_20_NAME"] == dropped_label:
            continue
        rows.append(row)

    with pytest.raises(ValueError, match="missing"):
        _ethnicity_percentages(pl.DataFrame(rows))