"""Tests for ``_ethnicity_percentages`` from ``pipeline.download.ethnicity``."""

import polars as pl
import pytest

from pipeline.download.ethnicity import (
    GROUP_MAP,
    OUTPUT_GROUPS,
    _ethnicity_percentages,
)


def _long_rows(geo: str, counts: dict[str, int]) -> list[dict[str, object]]:
    """Build NOMIS-shaped long rows for one LSOA from {leaf_label: count}.

    Every one of the 19 leaf categories must be present in the download
    (NOMIS emits a 0-count row when an LSOA has none), so categories not
    given default to 0 to mirror that.

    Args:
        geo: Value written to ``GEOGRAPHY_CODE`` on every row.
        counts: Mapping of leaf label to ``OBS_VALUE``; labels omitted here
            get 0.

    Returns:
        One row dict per label in ``GROUP_MAP``, in ``GROUP_MAP`` order.

    Raises:
        KeyError: If ``counts`` contains a label that is not in ``GROUP_MAP``.
    """
    # Rows are generated from GROUP_MAP, so a label that is not in it would
    # never be emitted and its count would silently vanish. Fail loudly so a
    # typo in a test fixture cannot masquerade as a zero-count category.
    unknown = sorted(set(counts) - set(GROUP_MAP))
    if unknown:
        raise KeyError(f"labels not in GROUP_MAP: {unknown}")

    return [
        {
            "GEOGRAPHY_CODE": geo,
            "C2021_ETH_20_NAME": label,
            "OBS_VALUE": counts.get(label, 0),
        }
        for label in GROUP_MAP
    ]


def test_ethnicity_percentages_keyed_by_lsoa_with_seven_buckets():
    """Output leads with ``lsoa21`` and has one ``% <group>`` column per group."""
    df = pl.DataFrame(
        _long_rows(
            "E01000001",
            {
                "White: English, Welsh, Scottish, Northern Irish or British": 60,
                "White: Other White": 10,
                "Asian, Asian British or Asian Welsh: Indian": 20,
                "Black, Black British, Black Welsh, Caribbean or African: African": 10,
            },
        )
    )

    result = _ethnicity_percentages(df)

    assert result.columns[0] == "lsoa21"
    assert set(result.columns) == {"lsoa21", *(f"% {g}" for g in OUTPUT_GROUPS)}

    row = result.filter(pl.col("lsoa21") == "E01000001").to_dicts()[0]
    assert row["% White"] == 70.0
    assert row["% South Asian"] == 20.0
    assert row["% Black"] == 10.0
    # Percentages always sum to exactly 100 (largest-remainder rounding).
    assert round(sum(row[f"% {g}"] for g in OUTPUT_GROUPS), 1) == 100.0


def test_ethnicity_routes_chinese_to_east_and_other_asian_to_se():
    """'Chinese' folds into '% East Asian' and 'Other Asian' into '% SE Asian'
    (neither into '% South Asian'), keeping the two Asian buckets distinct."""
    df = pl.DataFrame(
        _long_rows(
            "E01000002",
            {
                "Asian, Asian British or Asian Welsh: Chinese": 30,
                "Asian, Asian British or Asian Welsh: Other Asian": 20,
                "Asian, Asian British or Asian Welsh: Indian": 50,
            },
        )
    )

    result = _ethnicity_percentages(df)
    area = result.filter(pl.col("lsoa21") == "E01000002")

    assert "% East Asian" in result.columns
    assert "% SE Asian" in result.columns
    assert "% East/SE Asian" not in result.columns
    assert area.select("% East Asian", "% SE Asian", "% South Asian").to_dicts() == [
        {"% East Asian": 30.0, "% SE Asian": 20.0, "% South Asian": 50.0}
    ]


def test_ethnicity_percentages_independent_per_lsoa():
    """Two LSOAs get independent profiles — the LSOA granularity is the point."""
    df = pl.concat(
        [
            pl.DataFrame(
                _long_rows(
                    "E01000010",
                    {"White: Other White": 100},
                )
            ),
            pl.DataFrame(
                _long_rows(
                    "E01000011",
                    {"Asian, Asian British or Asian Welsh: Pakistani": 100},
                )
            ),
        ]
    )

    # Sort so the list assertions below line up with E01000010, E01000011.
    result = _ethnicity_percentages(df).sort("lsoa21")

    assert result["% White"].to_list() == [100.0, 0.0]
    assert result["% South Asian"].to_list() == [0.0, 100.0]


def test_ethnicity_percentages_rejects_unexpected_category():
    """An extra row with a label outside the generated set raises ValueError."""
    rows = _long_rows("E01000003", {"White: Other White": 10})
    # Appended by hand: _long_rows only ever emits labels from GROUP_MAP.
    rows.append(
        {
            "GEOGRAPHY_CODE": "E01000003",
            "C2021_ETH_20_NAME": "White: A Brand New Census Category",
            "OBS_VALUE": 5,
        }
    )

    with pytest.raises(ValueError, match="do not match the expected"):
        _ethnicity_percentages(pl.DataFrame(rows))


def test_ethnicity_percentages_rejects_missing_category():
    """Input lacking the 'Other ethnic group: Arab' row raises ValueError."""
    # Drop one leaf entirely: its people would vanish from the denominator.
    rows = [
        r
        for r in _long_rows("E01000004", {"White: Other White": 10})
        if r["C2021_ETH_20_NAME"] != "Other ethnic group: Arab"
    ]

    with pytest.raises(ValueError, match="missing"):
        _ethnicity_percentages(pl.DataFrame(rows))