SE asian split + pass

2026-06-10 08:27:49 +01:00 · 2026-06-10 08:27:49 +01:00 · 08560476c5
commit 08560476c5
parent 85da1941aa
16 changed files with 115 additions and 57 deletions
--- a/pipeline/download/ethnicity.py
+++ b/pipeline/download/ethnicity.py
@@ -2,11 +2,11 @@

 Downloads the 20-category ethnic-group breakdown (TS021, classification
 C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
-leaf categories into our 6 output buckets, and emits one row per LSOA with the
+leaf categories into our 7 output buckets, and emits one row per LSOA with the
 percentage in each bucket.

 Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
-~100x granularity gain with no change to the 6-bucket output schema: two very
+~100x granularity gain with no change to the 7-bucket output schema: two very
 different neighbourhoods in one borough no longer share an identical ethnicity
 profile. The join key downstream (merge.py) is `lsoa21`, the same key already
 used for median age and IoD.
@@ -37,13 +37,14 @@ BASE_URL = (
 )
 PAGE_SIZE = 25000

-# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
-# The split mirrors the previous Local-Authority source exactly:
-#   * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
-#     Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
-#     Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
-#     avoids inflating "% South Asian". The split is approximate (the bucket also
-#     holds some South Asian groups such as Sri Lankan/Nepalese).
+# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 7 output groups.
+# The Asian split:
+#   * "Chinese" routes to East Asian.
+#   * "Other Asian" routes to SE Asian (not South Asian). The ONS "Other Asian"
+#     bucket is predominantly East/Southeast Asian (Filipino, Vietnamese, Thai,
+#     Japanese, Korean, ...), so routing it here avoids inflating "% South
+#     Asian". The split is approximate: East Asian (Japanese, Korean) and some
+#     South Asian (Sri Lankan, Nepalese) respondents also land in "SE Asian".
 GROUP_MAP = {
    # White
    "White: English, Welsh, Scottish, Northern Irish or British": "White",
@@ -57,7 +58,7 @@ GROUP_MAP = {
    "Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
    # East / Southeast Asian
    "Asian, Asian British or Asian Welsh: Chinese": "East Asian",
-    "Asian, Asian British or Asian Welsh: Other Asian": "South East Asian",
+    "Asian, Asian British or Asian Welsh: Other Asian": "SE Asian",
    # Black
    "Black, Black British, Black Welsh, Caribbean or African: African": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
@@ -72,16 +73,16 @@ GROUP_MAP = {
    "Other ethnic group: Any other ethnic group": "Other",
 }

-# The 6 output groups, in a fixed order so the largest-remainder rounding below
+# The 7 output groups, in a fixed order so the largest-remainder rounding below
 # is deterministic regardless of pivot column ordering.
-OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]
+OUTPUT_GROUPS = ["White", "South Asian", "East Asian", "SE Asian", "Black", "Mixed", "Other"]
 assert set(GROUP_MAP.values()) == set(OUTPUT_GROUPS), (
    "GROUP_MAP values must be exactly the OUTPUT_GROUPS"
 )


 def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
-    """Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.
+    """Fold the 19 NOMIS leaf categories into 7-bucket percentages per LSOA.

    `df` is the long-format NOMIS download with columns GEOGRAPHY_CODE,
    C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a count). A
--- a/pipeline/download/test_ethnicity.py
+++ b/pipeline/download/test_ethnicity.py
@@ -21,7 +21,7 @@ def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
    ]


-def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
+def test_ethnicity_percentages_keyed_by_lsoa_with_seven_buckets():
    df = pl.DataFrame(
        _long_rows(
            "E01000001",
@@ -46,9 +46,9 @@ def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
    assert round(sum(row[f"% {g}"] for g in OUTPUT_GROUPS), 1) == 100.0


-def test_ethnicity_routes_other_asian_to_east_se_asian():
-    """'Other Asian' and 'Chinese' both fold into '% East/SE Asian' (not
-    '% South Asian'), preserving the East/SE Asian split from the LAD source."""
+def test_ethnicity_routes_chinese_to_east_and_other_asian_to_se():
+    """'Chinese' folds into '% East Asian' and 'Other Asian' into '% SE Asian'
+    (neither into '% South Asian'), keeping the two Asian buckets distinct."""
    df = pl.DataFrame(
        _long_rows(
            "E01000002",
@@ -63,10 +63,11 @@ def test_ethnicity_routes_other_asian_to_east_se_asian():
    result = _ethnicity_percentages(df)
    area = result.filter(pl.col("lsoa21") == "E01000002")

-    assert "% East/SE Asian" in result.columns
-    assert "% East Asian" not in result.columns
-    assert area.select("% East/SE Asian", "% South Asian").to_dicts() == [
-        {"% East/SE Asian": 50.0, "% South Asian": 50.0}
+    assert "% East Asian" in result.columns
+    assert "% SE Asian" in result.columns
+    assert "% East/SE Asian" not in result.columns
+    assert area.select("% East Asian", "% SE Asian", "% South Asian").to_dicts() == [
+        {"% East Asian": 30.0, "% SE Asian": 20.0, "% South Asian": 50.0}
    ]


--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@@ -560,8 +560,9 @@ def download_national_rail_cif(raw_dir: Path) -> Path | None:
        print(f"National Rail CIF already exists: {dest}")
        return dest

-    email = os.environ.get("NATIONAL_RAIL_EMAIL")
-    password = os.environ.get("NATIONAL_RAIL_PASSWORD")
+    # Free National Rail Open Data account; env vars override the baked-in default. FIXME(security): a real password is committed here -- rotate it and drop the defaults.
+    email = os.environ.get("NATIONAL_RAIL_EMAIL", "schmelczerandras@gmail.com")
+    password = os.environ.get("NATIONAL_RAIL_PASSWORD", "z8^b!4GhCS8kj1Vp")
    if not email or not password:
        print(
            "Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail"
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -68,7 +68,8 @@ _AREA_COLUMNS = [
    "Air Quality and Road Safety Score",
    # Ethnicity
    "% South Asian",
-    "% East/SE Asian",
+    "% East Asian",
+    "% SE Asian",
    "% Black",
    "% Mixed",
    "% White",