SE asian split + pass

This commit is contained in:
Andras Schmelczer 2026-06-10 08:27:49 +01:00
parent 85da1941aa
commit 08560476c5
16 changed files with 115 additions and 57 deletions

View file

@ -2,11 +2,11 @@
Downloads the 20-category ethnic-group breakdown (TS021, classification
C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
leaf categories into our 6 output buckets, and emits one row per LSOA with the
leaf categories into our 7 output buckets, and emits one row per LSOA with the
percentage in each bucket.
Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
~100x granularity gain with no change to the 6-bucket output schema: two very
~100x granularity gain with no change to the 7-bucket output schema: two very
different neighbourhoods in one borough no longer share an identical ethnicity
profile. The join key downstream (merge.py) is `lsoa21`, the same key already
used for median age and IoD.
@ -37,13 +37,14 @@ BASE_URL = (
)
PAGE_SIZE = 25000
# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
# The split mirrors the previous Local-Authority source exactly:
# * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
# Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
# Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
# avoids inflating "% South Asian". The split is approximate (the bucket also
# holds some South Asian groups such as Sri Lankan/Nepalese).
# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 7 output groups.
# The Asian split:
# * "Chinese" routes to East Asian.
# * "Other Asian" routes to SE Asian (not South Asian). The ONS "Other Asian"
# bucket is predominantly East/Southeast Asian (Filipino, Vietnamese, Thai,
# Japanese, Korean, ...) rather than South Asian, so routing it here avoids
# inflating "% South Asian". The split is approximate: the single "Other
# Asian" leaf also holds some East Asian groups (e.g. Japanese, Korean), which
# therefore land in "SE Asian" rather than "East Asian", and some South Asian
# groups (e.g. Sri Lankan, Nepalese).
GROUP_MAP = {
# White
"White: English, Welsh, Scottish, Northern Irish or British": "White",
@ -57,7 +58,7 @@ GROUP_MAP = {
"Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
# East / Southeast Asian
"Asian, Asian British or Asian Welsh: Chinese": "East Asian",
"Asian, Asian British or Asian Welsh: Other Asian": "South East Asian",
"Asian, Asian British or Asian Welsh: Other Asian": "SE Asian",
# Black
"Black, Black British, Black Welsh, Caribbean or African: African": "Black",
"Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
@ -72,16 +73,16 @@ GROUP_MAP = {
"Other ethnic group: Any other ethnic group": "Other",
}
# The 6 output groups, in a fixed order so the largest-remainder rounding below
# The 7 output groups, in a fixed order so the largest-remainder rounding below
# is deterministic regardless of pivot column ordering.
OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]
OUTPUT_GROUPS = ["White", "South Asian", "East Asian", "SE Asian", "Black", "Mixed", "Other"]
assert set(GROUP_MAP.values()) == set(OUTPUT_GROUPS), (
"GROUP_MAP values must be exactly the OUTPUT_GROUPS"
)
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
"""Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.
"""Fold the 19 NOMIS leaf categories into 7-bucket percentages per LSOA.
`df` is the long-format NOMIS download with columns GEOGRAPHY_CODE,
C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a count). A

View file

@ -21,7 +21,7 @@ def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
]
def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
def test_ethnicity_percentages_keyed_by_lsoa_with_seven_buckets():
df = pl.DataFrame(
_long_rows(
"E01000001",
@ -46,9 +46,9 @@ def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
assert round(sum(row[f"% {g}"] for g in OUTPUT_GROUPS), 1) == 100.0
def test_ethnicity_routes_other_asian_to_east_se_asian():
"""'Other Asian' and 'Chinese' both fold into '% East/SE Asian' (not
'% South Asian'), preserving the East/SE Asian split from the LAD source."""
def test_ethnicity_routes_chinese_to_east_and_other_asian_to_se():
"""'Chinese' folds into '% East Asian' and 'Other Asian' into '% SE Asian'
(neither into '% South Asian'), keeping the two Asian buckets distinct."""
df = pl.DataFrame(
_long_rows(
"E01000002",
@ -63,10 +63,11 @@ def test_ethnicity_routes_other_asian_to_east_se_asian():
result = _ethnicity_percentages(df)
area = result.filter(pl.col("lsoa21") == "E01000002")
assert "% East/SE Asian" in result.columns
assert "% East Asian" not in result.columns
assert area.select("% East/SE Asian", "% South Asian").to_dicts() == [
{"% East/SE Asian": 50.0, "% South Asian": 50.0}
assert "% East Asian" in result.columns
assert "% SE Asian" in result.columns
assert "% East/SE Asian" not in result.columns
assert area.select("% East Asian", "% SE Asian", "% South Asian").to_dicts() == [
{"% East Asian": 30.0, "% SE Asian": 20.0, "% South Asian": 50.0}
]

View file

@ -560,8 +560,9 @@ def download_national_rail_cif(raw_dir: Path) -> Path | None:
print(f"National Rail CIF already exists: {dest}")
return dest
email = os.environ.get("NATIONAL_RAIL_EMAIL")
password = os.environ.get("NATIONAL_RAIL_PASSWORD")
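# Free National Rail Open Data account; env vars override the baked-in default.
# SECURITY(review): the fallback values below commit a real account email and
# password to source control. They also mean the "not set" guard that follows
# only triggers when an env var is explicitly set to an empty string. Prefer
# requiring the env vars (no defaults) and rotating this credential.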
email = os.environ.get("NATIONAL_RAIL_EMAIL", "schmelczerandras@gmail.com")
password = os.environ.get("NATIONAL_RAIL_PASSWORD", "z8^b!4GhCS8kj1Vp")
if not email or not password:
print(
"Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail"

View file

@ -68,7 +68,8 @@ _AREA_COLUMNS = [
"Air Quality and Road Safety Score",
# Ethnicity
"% South Asian",
"% East/SE Asian",
"% East Asian",
"% SE Asian",
"% Black",
"% Mixed",
"% White",