# perfect-postcode/pipeline/download/ethnicity.py

"""Download Census 2021 ethnic group (TS021) by LSOA.

Downloads the 20-category ethnic-group breakdown (TS021, classification
C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
leaf categories into our 6 output buckets, and emits one row per LSOA with the
percentage in each bucket.

Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
~100x granularity gain with no change to the 6-bucket output schema: two very
different neighbourhoods in one borough no longer share an identical ethnicity
profile. The join key downstream (merge.py) is `lsoa21`, the same key already
used for median age and IoD.

Source: NOMIS (ONS Census 2021 — TS021 dataset, NM_2041_1)
License: Open Government Licence v3.0
"""

import argparse
from io import BytesIO
from pathlib import Path

import httpx
import polars as pl

# Lift polars' cap on the number of columns shown when a DataFrame is printed
# (a negative value displays all of them).
pl.Config.set_tbl_cols(-1)

# NOMIS endpoint for Census 2021 TS021 (ethnic group, 20 categories) at LSOA
# 2021 level (TYPE151). The query is assembled from its individual parameters:
#   * c2021_eth_20 = 1..19 picks the 19 detailed leaf categories, leaving out
#     the 5 broad aggregates (1001-1005) and the Total (0), which we re-derive.
#   * measures = 20100 picks the absolute count.
#   * select trims the CSV to the three columns we consume.
BASE_URL = "&".join(
    [
        "https://www.nomisweb.co.uk/api/v01/dataset/NM_2041_1.data.csv"
        "?geography=TYPE151",
        "c2021_eth_20=" + ",".join(str(code) for code in range(1, 20)),
        "measures=20100",
        "select=GEOGRAPHY_CODE,C2021_ETH_20_NAME,OBS_VALUE",
    ]
)
# Rows per page: the step applied to `recordoffset` between requests, and the
# row count below which a page is treated as the final one.
PAGE_SIZE = 25_000

# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
# Every value here must be one of OUTPUT_GROUPS (checked at import time below).
# The split mirrors the previous Local-Authority source exactly:
#   * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
#     Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
#     Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
#     avoids inflating "% South Asian". The split is approximate (the bucket also
#     holds some South Asian groups such as Sri Lankan/Nepalese).
GROUP_MAP = {
    # White
    "White: English, Welsh, Scottish, Northern Irish or British": "White",
    "White: Irish": "White",
    "White: Gypsy or Irish Traveller": "White",
    "White: Roma": "White",
    "White: Other White": "White",
    # South Asian
    "Asian, Asian British or Asian Welsh: Indian": "South Asian",
    "Asian, Asian British or Asian Welsh: Pakistani": "South Asian",
    "Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
    # East / Southeast Asian (a single output bucket)
    "Asian, Asian British or Asian Welsh: Chinese": "East/SE Asian",
    "Asian, Asian British or Asian Welsh: Other Asian": "East/SE Asian",
    # Black
    "Black, Black British, Black Welsh, Caribbean or African: African": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Other Black": "Black",
    # Mixed
    "Mixed or Multiple ethnic groups: White and Asian": "Mixed",
    "Mixed or Multiple ethnic groups: White and Black African": "Mixed",
    "Mixed or Multiple ethnic groups: White and Black Caribbean": "Mixed",
    "Mixed or Multiple ethnic groups: Other Mixed or Multiple ethnic groups": "Mixed",
    # Other
    "Other ethnic group: Arab": "Other",
    "Other ethnic group: Any other ethnic group": "Other",
}

# The 6 output groups, in a fixed order so the rounding adjustment in
# _ethnicity_percentages (which picks the largest group by position) is
# deterministic regardless of pivot column ordering.
OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]
# Import-time invariant: a GROUP_MAP value outside OUTPUT_GROUPS would create a
# pivot column that is never normalised or emitted.
assert set(GROUP_MAP.values()) == set(OUTPUT_GROUPS), (
    "GROUP_MAP values must be exactly the OUTPUT_GROUPS"
)


def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
    """Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.

    A missing/extra/relabelled leaf category would silently drop people from the
    denominator, so the category set is validated against GROUP_MAP first and
    the function fails loudly otherwise.

    Args:
        df: Long-format NOMIS download with columns GEOGRAPHY_CODE,
            C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a count).

    Returns:
        One row per GEOGRAPHY_CODE, sorted by `lsoa21` (the renamed
        GEOGRAPHY_CODE), followed by one "% <group>" column per OUTPUT_GROUPS
        entry, in OUTPUT_GROUPS order. Percentages are rounded to 1 decimal,
        with the largest group adjusted so the six values add up to 100.

    Raises:
        ValueError: If the set of C2021_ETH_20_NAME labels in `df` differs from
            the keys of GROUP_MAP.
    """
    found = set(df["C2021_ETH_20_NAME"].unique().to_list())
    expected = set(GROUP_MAP)
    if found != expected:
        missing = sorted(expected - found)
        # key=str keeps the report sortable even when a null label is present
        # alongside string labels.
        unexpected = sorted(found - expected, key=str)
        raise ValueError(
            "Census ethnic-group categories do not match the expected NOMIS "
            "TS021 C2021_ETH_20 leaf set.\n"
            f"  expected {len(expected)} categories, found {len(found)}\n"
            f"  missing:    {missing}\n"
            f"  unexpected: {unexpected}\n"
            "Refusing to compute percentages against an unrecognised breakdown."
        )

    # Map each leaf to its output group and sum counts per (LSOA, group). Summing
    # counts (not rounded percentages) keeps the denominator exact.
    grouped = (
        df.with_columns(
            pl.col("C2021_ETH_20_NAME").replace_strict(GROUP_MAP).alias("group"),
            pl.col("OBS_VALUE").cast(pl.Float64, strict=False).alias("_count"),
        )
        .group_by("GEOGRAPHY_CODE", "group")
        .agg(pl.col("_count").sum())
    )
    wide = grouped.pivot(on="group", index="GEOGRAPHY_CODE", values="_count").rename(
        {"GEOGRAPHY_CODE": "lsoa21"}
    )

    # A group with no rows for an LSOA leaves a null after the pivot; treat it as
    # 0 before normalising.
    wide = wide.with_columns(pl.col(OUTPUT_GROUPS).fill_null(0.0))

    # Convert counts to percentages of the row total.
    row_total = sum(pl.col(c) for c in OUTPUT_GROUPS)
    wide = wide.with_columns(
        [(pl.col(c) / row_total * 100.0).alias(c) for c in OUTPUT_GROUPS]
    )

    # Round each value to 1 decimal. Independent rounding of 6 values can drift
    # the row sum by up to +/-0.3, so the largest group (first one in
    # OUTPUT_GROUPS order on ties) absorbs the residual.
    wide = wide.with_columns([pl.col(c).round(1).alias(c) for c in OUTPUT_GROUPS])
    rounded_sum = sum(pl.col(c) for c in OUTPUT_GROUPS)
    residual = (100.0 - rounded_sum).round(1)
    largest_col = pl.concat_list(OUTPUT_GROUPS).list.arg_max()
    wide = wide.with_columns(
        [
            pl.when(largest_col == i)
            # Re-round: adding two 1-decimal floats can produce artefacts such
            # as 45.300000000000004.
            .then((pl.col(c) + residual).round(1))
            .otherwise(pl.col(c))
            .alias(c)
            for i, c in enumerate(OUTPUT_GROUPS)
        ]
    )

    # group_by/pivot give no ordering guarantee, so pin both the column order
    # and the row order to make the output reproducible between runs.
    wide = wide.select("lsoa21", *OUTPUT_GROUPS).sort("lsoa21")

    rename_map = {col: f"% {col}" for col in OUTPUT_GROUPS}
    return wide.rename(rename_map)


def download_and_convert(output_path: Path) -> None:
    """Download TS021 from NOMIS page by page and write per-LSOA percentages.

    Pages are requested from BASE_URL with an increasing `recordoffset` until a
    page is empty or holds fewer than PAGE_SIZE rows. The combined rows are
    filtered to E-prefixed geography codes, folded into percentages by
    `_ethnicity_percentages`, and written as a zstd-compressed parquet file.

    Args:
        output_path: Destination parquet path; missing parent directories are
            created.

    Raises:
        httpx.HTTPStatusError: If any page request returns an error status.
        ValueError: If NOMIS returns no rows at all, or if
            `_ethnicity_percentages` rejects the category set.
    """
    print("Downloading Census 2021 ethnic group (TS021) by LSOA from NOMIS...")
    frames = []
    offset = 0
    while True:
        url = f"{BASE_URL}&recordoffset={offset}"
        response = httpx.get(url, follow_redirects=True, timeout=120)
        response.raise_for_status()
        # An empty body cannot be parsed as CSV; treat it as the end of the data.
        if not response.content:
            break
        chunk = pl.read_csv(BytesIO(response.content))
        if chunk.height == 0:
            break
        frames.append(chunk)
        print(f"  Fetched {chunk.height} rows (offset={offset})")
        # A short page means there is nothing left to fetch.
        if chunk.height < PAGE_SIZE:
            break
        offset += PAGE_SIZE

    # pl.concat rejects an empty list with an opaque message; say what happened.
    if not frames:
        raise ValueError(
            "NOMIS returned no rows for Census 2021 TS021; nothing to convert."
        )

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    # Filter to England only (E-prefixed LSOA codes); the merge joins on the
    # English postcode universe and the IoD coverage check is England-wide.
    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))

    wide = _ethnicity_percentages(df)

    print(f"England LSOAs: {wide.height}")
    print(f"Columns: {wide.columns}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Command-line entry point: read --output and run the download."""
    cli = argparse.ArgumentParser(
        description="Download Census 2021 ethnic group (TS021) by LSOA"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    destination = cli.parse_args().output
    download_and_convert(destination)


if __name__ == "__main__":
    main()