# Source listing metadata: 119 lines, 4 KiB, Python.
import polars as pl
|
|
import pytest
|
|
|
|
from pipeline.download.ethnicity import GROUP_MAP, OUTPUT_GROUPS, _ethnicity_percentages
|
|
|
|
|
|
def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
    """Return NOMIS-shaped long-format rows for a single LSOA.

    One row is produced for every leaf label in ``GROUP_MAP``. The download
    is expected to carry all 19 leaf categories (NOMIS emits a 0-count row
    when an LSOA has nobody in a category), so any label absent from
    ``counts`` is filled in with 0 to mirror that.

    Args:
        geo: Geography code written to the ``GEOGRAPHY_CODE`` field.
        counts: Mapping of leaf label to observed count; may be partial.

    Returns:
        A list of dicts with the keys ``GEOGRAPHY_CODE``,
        ``C2021_ETH_20_NAME`` and ``OBS_VALUE``, in ``GROUP_MAP`` order.
    """
    rows: list[dict] = []
    for leaf in GROUP_MAP:
        observed = counts.get(leaf, 0)
        rows.append(
            {
                "GEOGRAPHY_CODE": geo,
                "C2021_ETH_20_NAME": leaf,
                "OBS_VALUE": observed,
            }
        )
    return rows
|
|
|
|
|
|
def test_ethnicity_percentages_keyed_by_lsoa_with_seven_buckets():
    """The result is keyed by ``lsoa21`` with one ``% <group>`` column per output group."""
    lsoa = "E01000001"
    leaf_counts = {
        "White: English, Welsh, Scottish, Northern Irish or British": 60,
        "White: Other White": 10,
        "Asian, Asian British or Asian Welsh: Indian": 20,
        "Black, Black British, Black Welsh, Caribbean or African: African": 10,
    }
    df = pl.DataFrame(_long_rows(lsoa, leaf_counts))

    result = _ethnicity_percentages(df)

    pct_columns = [f"% {g}" for g in OUTPUT_GROUPS]

    # The key column comes first, followed by exactly one column per group.
    assert result.columns[0] == "lsoa21"
    assert set(result.columns) == {"lsoa21", *pct_columns}

    row = result.filter(pl.col("lsoa21") == lsoa).to_dicts()[0]
    # The two White leaves (60 + 10) fold into a single bucket.
    assert row["% White"] == 70.0
    assert row["% South Asian"] == 20.0
    assert row["% Black"] == 10.0
    # Percentages always sum to exactly 100 (largest-remainder rounding).
    assert round(sum(row[col] for col in pct_columns), 1) == 100.0
|
|
|
|
|
|
def test_ethnicity_routes_chinese_to_east_and_other_asian_to_se():
    """'Chinese' lands in '% East Asian' and 'Other Asian' in '% SE Asian'.

    Neither may leak into '% South Asian', and the two buckets must stay
    separate columns rather than a combined '% East/SE Asian'.
    """
    lsoa = "E01000002"
    leaf_counts = {
        "Asian, Asian British or Asian Welsh: Chinese": 30,
        "Asian, Asian British or Asian Welsh: Other Asian": 20,
        "Asian, Asian British or Asian Welsh: Indian": 50,
    }
    df = pl.DataFrame(_long_rows(lsoa, leaf_counts))

    result = _ethnicity_percentages(df)
    area = result.filter(pl.col("lsoa21") == lsoa)

    columns = result.columns
    assert "% East Asian" in columns
    assert "% SE Asian" in columns
    assert "% East/SE Asian" not in columns

    expected = [{"% East Asian": 30.0, "% SE Asian": 20.0, "% South Asian": 50.0}]
    observed = area.select("% East Asian", "% SE Asian", "% South Asian").to_dicts()
    assert observed == expected
|
|
|
|
|
|
def test_ethnicity_percentages_independent_per_lsoa():
    """Two LSOAs get independent profiles — the LSOA granularity is the point."""
    all_white = pl.DataFrame(
        _long_rows("E01000010", {"White: Other White": 100}),
    )
    all_pakistani = pl.DataFrame(
        _long_rows(
            "E01000011",
            {"Asian, Asian British or Asian Welsh: Pakistani": 100},
        ),
    )
    df = pl.concat([all_white, all_pakistani])

    # Sort by the key so the row order below is deterministic.
    result = _ethnicity_percentages(df).sort("lsoa21")

    white = result["% White"].to_list()
    south_asian = result["% South Asian"].to_list()
    assert white == [100.0, 0.0]
    assert south_asian == [0.0, 100.0]
|
|
|
|
|
|
def test_ethnicity_percentages_rejects_unexpected_category():
    """A leaf label outside the expected set raises ``ValueError``."""
    lsoa = "E01000003"
    unknown_leaf = {
        "GEOGRAPHY_CODE": lsoa,
        "C2021_ETH_20_NAME": "White: A Brand New Census Category",
        "OBS_VALUE": 5,
    }
    # The full expected set of rows, plus one extra unrecognised category.
    rows = [*_long_rows(lsoa, {"White: Other White": 10}), unknown_leaf]

    with pytest.raises(ValueError, match="do not match the expected"):
        _ethnicity_percentages(pl.DataFrame(rows))
|
|
|
|
|
|
def test_ethnicity_percentages_rejects_missing_category():
    """A download lacking one leaf category raises ``ValueError``."""
    dropped_leaf = "Other ethnic group: Arab"
    complete = _long_rows("E01000004", {"White: Other White": 10})
    # Drop one leaf entirely: its people would vanish from the denominator.
    rows = [
        entry
        for entry in complete
        if entry["C2021_ETH_20_NAME"] != dropped_leaf
    ]

    with pytest.raises(ValueError, match="missing"):
        _ethnicity_percentages(pl.DataFrame(rows))
|