SE asian split + pass
This commit is contained in:
parent
85da1941aa
commit
08560476c5
16 changed files with 115 additions and 57 deletions
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
Downloads the 20-category ethnic-group breakdown (TS021, classification
|
||||
C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
|
||||
leaf categories into our 6 output buckets, and emits one row per LSOA with the
|
||||
leaf categories into our 7 output buckets, and emits one row per LSOA with the
|
||||
percentage in each bucket.
|
||||
|
||||
Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
|
||||
~100x granularity gain with no change to the 6-bucket output schema: two very
|
||||
~100x granularity gain with no change to the 7-bucket output schema: two very
|
||||
different neighbourhoods in one borough no longer share an identical ethnicity
|
||||
profile. The join key downstream (merge.py) is `lsoa21`, the same key already
|
||||
used for median age and IoD.
|
||||
|
|
@ -37,13 +37,14 @@ BASE_URL = (
|
|||
)
|
||||
PAGE_SIZE = 25000
|
||||
|
||||
# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
|
||||
# The split mirrors the previous Local-Authority source exactly:
|
||||
# * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
|
||||
# Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
|
||||
# Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
|
||||
# avoids inflating "% South Asian". The split is approximate (the bucket also
|
||||
# holds some South Asian groups such as Sri Lankan/Nepalese).
|
||||
# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 7 output groups.
|
||||
# The Asian split:
|
||||
# * "Chinese" routes to East Asian.
|
||||
# * "Other Asian" routes to SE Asian (not South Asian). The ONS "Other Asian"
|
||||
# bucket is predominantly East/Southeast Asian (Filipino, Vietnamese, Thai,
|
||||
# Japanese, Korean, ...) rather than South Asian, so routing it here avoids
|
||||
# inflating "% South Asian". The split is approximate (the bucket also holds
|
||||
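# some South Asian groups such as Sri Lankan/Nepalese, and its East Asian
# members such as Japanese/Korean are counted under "SE Asian", not "East Asian").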
|
||||
GROUP_MAP = {
|
||||
# White
|
||||
"White: English, Welsh, Scottish, Northern Irish or British": "White",
|
||||
|
|
@ -57,7 +58,7 @@ GROUP_MAP = {
|
|||
"Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
|
||||
# East / Southeast Asian
|
||||
"Asian, Asian British or Asian Welsh: Chinese": "East Asian",
|
||||
"Asian, Asian British or Asian Welsh: Other Asian": "South East Asian",
|
||||
"Asian, Asian British or Asian Welsh: Other Asian": "SE Asian",
|
||||
# Black
|
||||
"Black, Black British, Black Welsh, Caribbean or African: African": "Black",
|
||||
"Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
|
||||
|
|
@ -72,16 +73,16 @@ GROUP_MAP = {
|
|||
"Other ethnic group: Any other ethnic group": "Other",
|
||||
}
|
||||
|
||||
# The 6 output groups, in a fixed order so the largest-remainder rounding below
|
||||
# The 7 output groups, in a fixed order so the largest-remainder rounding below
|
||||
# is deterministic regardless of pivot column ordering.
|
||||
OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]
|
||||
OUTPUT_GROUPS = ["White", "South Asian", "East Asian", "SE Asian", "Black", "Mixed", "Other"]
|
||||
assert set(GROUP_MAP.values()) == set(OUTPUT_GROUPS), (
|
||||
"GROUP_MAP values must be exactly the OUTPUT_GROUPS"
|
||||
)
|
||||
|
||||
|
||||
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.
|
||||
"""Fold the 19 NOMIS leaf categories into 7-bucket percentages per LSOA.
|
||||
|
||||
`df` is the long-format NOMIS download with columns GEOGRAPHY_CODE,
|
||||
C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a count). A
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
|
|||
]
|
||||
|
||||
|
||||
def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
|
||||
def test_ethnicity_percentages_keyed_by_lsoa_with_seven_buckets():
|
||||
df = pl.DataFrame(
|
||||
_long_rows(
|
||||
"E01000001",
|
||||
|
|
@ -46,9 +46,9 @@ def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
|
|||
assert round(sum(row[f"% {g}"] for g in OUTPUT_GROUPS), 1) == 100.0
|
||||
|
||||
|
||||
def test_ethnicity_routes_other_asian_to_east_se_asian():
|
||||
"""'Other Asian' and 'Chinese' both fold into '% East/SE Asian' (not
|
||||
'% South Asian'), preserving the East/SE Asian split from the LAD source."""
|
||||
def test_ethnicity_routes_chinese_to_east_and_other_asian_to_se():
|
||||
"""'Chinese' folds into '% East Asian' and 'Other Asian' into '% SE Asian'
|
||||
(neither into '% South Asian'), keeping the East and SE Asian buckets distinct."""
|
||||
df = pl.DataFrame(
|
||||
_long_rows(
|
||||
"E01000002",
|
||||
|
|
@ -63,10 +63,11 @@ def test_ethnicity_routes_other_asian_to_east_se_asian():
|
|||
result = _ethnicity_percentages(df)
|
||||
area = result.filter(pl.col("lsoa21") == "E01000002")
|
||||
|
||||
assert "% East/SE Asian" in result.columns
|
||||
assert "% East Asian" not in result.columns
|
||||
assert area.select("% East/SE Asian", "% South Asian").to_dicts() == [
|
||||
{"% East/SE Asian": 50.0, "% South Asian": 50.0}
|
||||
assert "% East Asian" in result.columns
|
||||
assert "% SE Asian" in result.columns
|
||||
assert "% East/SE Asian" not in result.columns
|
||||
assert area.select("% East Asian", "% SE Asian", "% South Asian").to_dicts() == [
|
||||
{"% East Asian": 30.0, "% SE Asian": 20.0, "% South Asian": 50.0}
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -560,8 +560,9 @@ def download_national_rail_cif(raw_dir: Path) -> Path | None:
|
|||
print(f"National Rail CIF already exists: {dest}")
|
||||
return dest
|
||||
|
||||
email = os.environ.get("NATIONAL_RAIL_EMAIL")
|
||||
password = os.environ.get("NATIONAL_RAIL_PASSWORD")
|
||||
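# Free National Rail Open Data account; env vars override the baked-in default.
# NOTE(review): the defaults below are real credentials committed to source
# control; they should be rotated and supplied only via the environment. With
# defaults present, the "not set" check below no longer fires when the
# variables are simply unset.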
|
||||
email = os.environ.get("NATIONAL_RAIL_EMAIL", "schmelczerandras@gmail.com")
|
||||
password = os.environ.get("NATIONAL_RAIL_PASSWORD", "z8^b!4GhCS8kj1Vp")
|
||||
if not email or not password:
|
||||
print(
|
||||
"Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail"
|
||||
|
|
|
|||
|
|
@ -68,7 +68,8 @@ _AREA_COLUMNS = [
|
|||
"Air Quality and Road Safety Score",
|
||||
# Ethnicity
|
||||
"% South Asian",
|
||||
"% East/SE Asian",
|
||||
"% East Asian",
|
||||
"% SE Asian",
|
||||
"% Black",
|
||||
"% Mixed",
|
||||
"% White",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue