perfect-postcode/pipeline/download/test_ethnicity.py

119 lines
4 KiB
Python

import polars as pl
import pytest
from pipeline.download.ethnicity import GROUP_MAP, OUTPUT_GROUPS, _ethnicity_percentages
def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
    """Return NOMIS-style long-format rows describing a single LSOA.

    One row is produced for every leaf label in ``GROUP_MAP``, in its
    iteration order. A real NOMIS download carries all 19 leaf categories for
    each LSOA (groups with nobody in them arrive as 0-count rows), so any
    label absent from ``counts`` is filled in with 0 to match.

    Args:
        geo: Value written to ``GEOGRAPHY_CODE`` on every row.
        counts: Mapping of leaf label to head count. Labels that are not in
            ``GROUP_MAP`` are ignored.

    Returns:
        A list of dicts keyed by ``GEOGRAPHY_CODE``, ``C2021_ETH_20_NAME``
        and ``OBS_VALUE``.
    """
    rows: list[dict] = []
    for leaf in GROUP_MAP:
        # Unlisted leaves fall back to 0, like NOMIS's explicit zero rows.
        observed = counts.get(leaf, 0)
        rows.append(
            {
                "GEOGRAPHY_CODE": geo,
                "C2021_ETH_20_NAME": leaf,
                "OBS_VALUE": observed,
            }
        )
    return rows
def test_ethnicity_percentages_keyed_by_lsoa_with_seven_buckets():
    """Output leads with ``lsoa21`` and has one ``% <group>`` column per group.

    Also checks that leaf counts are folded into their parent bucket (two
    White leaves -> ``% White``) and that the bucket percentages total 100.
    """
    lsoa = "E01000001"
    leaf_counts = {
        "White: English, Welsh, Scottish, Northern Irish or British": 60,
        "White: Other White": 10,
        "Asian, Asian British or Asian Welsh: Indian": 20,
        "Black, Black British, Black Welsh, Caribbean or African: African": 10,
    }
    source = pl.DataFrame(_long_rows(lsoa, leaf_counts))

    result = _ethnicity_percentages(source)

    # Shape: the key column comes first, followed only by the bucket columns.
    assert result.columns[0] == "lsoa21"
    expected_columns = {"lsoa21", *(f"% {g}" for g in OUTPUT_GROUPS)}
    assert set(result.columns) == expected_columns

    # Values: 60 + 10 White, 20 Indian, 10 African out of 100 people.
    matching = result.filter(pl.col("lsoa21") == lsoa).to_dicts()
    row = matching[0]
    assert row["% White"] == 70.0
    assert row["% South Asian"] == 20.0
    assert row["% Black"] == 10.0

    # Largest-remainder rounding means the buckets always add up to 100.
    total = sum(row[f"% {g}"] for g in OUTPUT_GROUPS)
    assert round(total, 1) == 100.0
def test_ethnicity_routes_chinese_to_east_and_other_asian_to_se():
    """Check that the Asian leaves land in three separate buckets.

    'Chinese' must count towards '% East Asian' and 'Other Asian' towards
    '% SE Asian'; neither may be folded into '% South Asian', and no merged
    '% East/SE Asian' column may be emitted.
    """
    lsoa = "E01000002"
    rows = _long_rows(
        lsoa,
        {
            "Asian, Asian British or Asian Welsh: Chinese": 30,
            "Asian, Asian British or Asian Welsh: Other Asian": 20,
            "Asian, Asian British or Asian Welsh: Indian": 50,
        },
    )

    result = _ethnicity_percentages(pl.DataFrame(rows))
    profile = result.filter(pl.col("lsoa21") == lsoa)

    # The split columns exist and the old combined column does not.
    assert "% East Asian" in result.columns
    assert "% SE Asian" in result.columns
    assert "% East/SE Asian" not in result.columns

    # Each leaf's share shows up in exactly its own bucket.
    observed = profile.select(
        "% East Asian", "% SE Asian", "% South Asian"
    ).to_dicts()
    expected = [{"% East Asian": 30.0, "% SE Asian": 20.0, "% South Asian": 50.0}]
    assert observed == expected
def test_ethnicity_percentages_independent_per_lsoa():
    """Each LSOA's profile is derived from its own rows only.

    Two single-group LSOAs are passed through in one frame; per-LSOA
    granularity is the whole point, so each must come out at 100% for its own
    group and 0% for the other's.
    """
    counts_by_lsoa = {
        "E01000010": {"White: Other White": 100},
        "E01000011": {"Asian, Asian British or Asian Welsh: Pakistani": 100},
    }
    frames = [
        pl.DataFrame(_long_rows(code, leaf_counts))
        for code, leaf_counts in counts_by_lsoa.items()
    ]
    combined = pl.concat(frames)

    # Sort by LSOA code so the expected lists below have a fixed order.
    result = _ethnicity_percentages(combined).sort("lsoa21")

    assert result["% White"].to_list() == [100.0, 0.0]
    assert result["% South Asian"].to_list() == [0.0, 100.0]
def test_ethnicity_percentages_rejects_unexpected_category():
    """An extra, unrecognised leaf label must raise ``ValueError``."""
    lsoa = "E01000003"
    # A row whose category label is not one of the expected leaves.
    unexpected_row = {
        "GEOGRAPHY_CODE": lsoa,
        "C2021_ETH_20_NAME": "White: A Brand New Census Category",
        "OBS_VALUE": 5,
    }
    rows = [*_long_rows(lsoa, {"White: Other White": 10}), unexpected_row]

    with pytest.raises(ValueError, match="do not match the expected"):
        _ethnicity_percentages(pl.DataFrame(rows))
def test_ethnicity_percentages_rejects_missing_category():
    """A download lacking one leaf category must raise ``ValueError``."""
    dropped_leaf = "Other ethnic group: Arab"
    # Remove the leaf's row altogether rather than zeroing it: with the row
    # gone, its people would vanish from the denominator.
    rows = [
        row
        for row in _long_rows("E01000004", {"White: Other White": 10})
        if row["C2021_ETH_20_NAME"] != dropped_leaf
    ]

    with pytest.raises(ValueError, match="missing"):
        _ethnicity_percentages(pl.DataFrame(rows))