perfect-postcode/pipeline/download/ethnicity.py
2026-06-10 07:54:25 +01:00

198 lines
7.9 KiB
Python

"""Download Census 2021 ethnic group (TS021) by LSOA.
Downloads the 20-category ethnic-group breakdown (TS021, classification
C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
leaf categories into our 6 output buckets, and emits one row per LSOA with the
percentage in each bucket.
Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
~100x granularity gain with no change to the 6-bucket output schema: two very
different neighbourhoods in one borough no longer share an identical ethnicity
profile. The join key downstream (merge.py) is `lsoa21`, the same key already
used for median age and IoD.
Source: NOMIS (ONS Census 2021 — TS021 dataset, NM_2041_1)
License: Open Government Licence v3.0
"""
import argparse
from io import BytesIO
from pathlib import Path
import httpx
import polars as pl
# Module-level display setting for polars: a negative column limit asks polars
# to show every column when a DataFrame is printed, rather than eliding the
# middle ones. It affects printed output only, not the data that is written.
pl.Config.set_tbl_cols(-1)
# NOMIS API endpoint for Census 2021 TS021 (ethnic group, 20 categories) at
# LSOA 2021 level (geography TYPE151). The query string is assembled from its
# individual parameters:
#   * c2021_eth_20=1..19 picks the 19 detailed leaf categories, leaving out the
#     5 broad aggregates (1001-1005) and the Total (0), which we re-derive.
#   * measures=20100 picks the absolute count.
#   * select= trims the CSV to the three columns this module reads.
BASE_URL = "&".join(
    [
        "https://www.nomisweb.co.uk/api/v01/dataset/NM_2041_1.data.csv"
        "?geography=TYPE151",
        "c2021_eth_20=" + ",".join(map(str, range(1, 20))),
        "measures=20100",
        "select=GEOGRAPHY_CODE,C2021_ETH_20_NAME,OBS_VALUE",
    ]
)

# Rows per page of the paginated download: the record offset advances by this
# amount and a page shorter than this is treated as the final one.
# NOTE(review): assumed to equal the number of rows NOMIS returns per request
# -- confirm against the NOMIS API limits.
PAGE_SIZE = 25_000
# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
# The split mirrors the previous Local-Authority source exactly:
#   * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
#     Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
#     Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
#     avoids inflating "% South Asian". The split is approximate (the bucket also
#     holds some South Asian groups such as Sri Lankan/Nepalese).
#
# Every value must be spelled exactly as it appears in OUTPUT_GROUPS: the values
# become pivot column names, so a variant spelling (e.g. "East Asian") would
# create a seventh bucket instead of feeding "East/SE Asian".
GROUP_MAP = {
    # White
    "White: English, Welsh, Scottish, Northern Irish or British": "White",
    "White: Irish": "White",
    "White: Gypsy or Irish Traveller": "White",
    "White: Roma": "White",
    "White: Other White": "White",
    # South Asian
    "Asian, Asian British or Asian Welsh: Indian": "South Asian",
    "Asian, Asian British or Asian Welsh: Pakistani": "South Asian",
    "Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
    # East / Southeast Asian
    "Asian, Asian British or Asian Welsh: Chinese": "East/SE Asian",
    "Asian, Asian British or Asian Welsh: Other Asian": "East/SE Asian",
    # Black
    "Black, Black British, Black Welsh, Caribbean or African: African": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Other Black": "Black",
    # Mixed
    "Mixed or Multiple ethnic groups: White and Asian": "Mixed",
    "Mixed or Multiple ethnic groups: White and Black African": "Mixed",
    "Mixed or Multiple ethnic groups: White and Black Caribbean": "Mixed",
    "Mixed or Multiple ethnic groups: Other Mixed or Multiple ethnic groups": "Mixed",
    # Other
    "Other ethnic group: Arab": "Other",
    "Other ethnic group: Any other ethnic group": "Other",
}

# The 6 output groups, in a fixed order so the rounding adjustment applied when
# computing percentages is deterministic regardless of pivot column ordering.
OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]

# Import-time guard: the mapping and the output schema must name the same
# buckets, otherwise a bucket would be silently dropped or never populated.
assert set(GROUP_MAP.values()) == set(OUTPUT_GROUPS), (
    "GROUP_MAP values must be exactly the OUTPUT_GROUPS"
)
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
    """Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.

    A missing/extra/relabelled leaf category would silently drop people from
    the denominator, so the category set is validated against GROUP_MAP first
    and the function fails loudly otherwise.

    Args:
        df: Long-format NOMIS download with columns GEOGRAPHY_CODE,
            C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a
            count).

    Returns:
        One row per GEOGRAPHY_CODE, with that code in column ``lsoa21`` and one
        ``% <group>`` column per entry of OUTPUT_GROUPS, rounded to 1 decimal
        place. A row whose counts sum to zero gets nulls rather than NaN.

    Raises:
        ValueError: If the set of C2021_ETH_20_NAME values in ``df`` is not
            exactly the set of GROUP_MAP keys (this includes an empty ``df``).
    """
    found = set(df["C2021_ETH_20_NAME"].unique().to_list())
    expected = set(GROUP_MAP)
    if found != expected:
        missing = sorted(expected - found)
        unexpected = sorted(found - expected)
        raise ValueError(
            "Census ethnic-group categories do not match the expected NOMIS "
            "TS021 C2021_ETH_20 leaf set.\n"
            f" expected {len(expected)} categories, found {len(found)}\n"
            f" missing: {missing}\n"
            f" unexpected: {unexpected}\n"
            "Refusing to compute percentages against an unrecognised breakdown."
        )

    # Map each leaf to its output group and sum counts per (LSOA, group).
    # Summing counts (not rounded percentages) keeps the denominator exact.
    # NOTE(review): the non-strict cast turns an unparseable OBS_VALUE into a
    # null, which the sum then skips -- confirm that leniency is intended.
    grouped = (
        df.with_columns(
            pl.col("C2021_ETH_20_NAME").replace_strict(GROUP_MAP).alias("group"),
            pl.col("OBS_VALUE").cast(pl.Float64, strict=False).alias("_count"),
        )
        .group_by("GEOGRAPHY_CODE", "group")
        .agg(pl.col("_count").sum())
    )
    wide = grouped.pivot(on="group", index="GEOGRAPHY_CODE", values="_count").rename(
        {"GEOGRAPHY_CODE": "lsoa21"}
    )

    # A group with no rows for an LSOA leaves a null after the pivot; treat it
    # as 0 before normalising.
    wide = wide.with_columns(pl.col(OUTPUT_GROUPS).fill_null(0.0))

    # Convert counts to percentages of the row total. A zero total would give
    # 0/0 = NaN in every bucket, so such rows are left null instead.
    row_total = sum(pl.col(c) for c in OUTPUT_GROUPS)
    wide = wide.with_columns(
        [
            pl.when(row_total > 0)
            .then(pl.col(c) / row_total * 100.0)
            .otherwise(None)
            .alias(c)
            for c in OUTPUT_GROUPS
        ]
    )

    # Round each bucket to 1 decimal. Independent rounding of 6 values can
    # drift the row sum by up to +/-0.3, so the residual is then added to the
    # largest bucket so the row sums to 100 again.
    wide = wide.with_columns([pl.col(c).round(1).alias(c) for c in OUTPUT_GROUPS])
    rounded_sum = sum(pl.col(c) for c in OUTPUT_GROUPS)
    residual = (100.0 - rounded_sum).round(1)
    largest_col = pl.concat_list(OUTPUT_GROUPS).list.arg_max()
    wide = wide.with_columns(
        [
            pl.when(largest_col == i)
            # Re-round after the addition so float noise from the sum (e.g.
            # 85.30000000000001) does not leak into the 1-decimal output.
            .then((pl.col(c) + residual).round(1))
            .otherwise(pl.col(c))
            .alias(c)
            for i, c in enumerate(OUTPUT_GROUPS)
        ]
    )

    rename_map = {col: f"% {col}" for col in OUTPUT_GROUPS}
    return wide.rename(rename_map)
def download_and_convert(output_path: Path) -> None:
    """Download TS021 by LSOA from NOMIS and write 6-bucket percentages.

    Pages through the NOMIS CSV endpoint PAGE_SIZE rows at a time, keeps the
    rows whose GEOGRAPHY_CODE starts with "E", folds them into per-LSOA
    percentages with `_ethnicity_percentages`, and writes the result as a
    zstd-compressed parquet file. Progress is printed to stdout.

    Args:
        output_path: Destination parquet path; missing parent directories are
            created.

    Raises:
        httpx.HTTPStatusError: If any page request returns an error status.
        ValueError: If NOMIS returns no rows at all, or if the downloaded
            categories do not match the expected leaf set.
    """
    print("Downloading Census 2021 ethnic group (TS021) by LSOA from NOMIS...")

    frames = []
    offset = 0
    # One client for the whole download so the connection is reused across
    # pages instead of being re-established for every request.
    with httpx.Client(follow_redirects=True, timeout=120) as client:
        while True:
            # Request the page size explicitly: the short-page check below is
            # only a valid end-of-data signal if each page holds PAGE_SIZE rows.
            url = f"{BASE_URL}&recordlimit={PAGE_SIZE}&recordoffset={offset}"
            response = client.get(url)
            response.raise_for_status()
            if not response.content:
                break
            chunk = pl.read_csv(BytesIO(response.content))
            if chunk.height == 0:
                break
            frames.append(chunk)
            print(f" Fetched {chunk.height} rows (offset={offset})")
            if chunk.height < PAGE_SIZE:
                break
            offset += PAGE_SIZE

    if not frames:
        # Without this guard pl.concat fails on the empty list with a message
        # that says nothing about the download being the cause.
        raise ValueError(
            "NOMIS returned no rows for Census 2021 TS021 by LSOA; nothing to convert."
        )

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    # Filter to England only (E-prefixed LSOA codes); the merge joins on the
    # English postcode universe and the IoD coverage check is England-wide.
    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))

    wide = _ethnicity_percentages(df)
    print(f"England LSOAs: {wide.height}")
    print(f"Columns: {wide.columns}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """Command-line entry point.

    Reads the required ``--output`` parquet path from the command line and
    passes it to `download_and_convert`.
    """
    cli = argparse.ArgumentParser(
        description="Download Census 2021 ethnic group (TS021) by LSOA"
    )
    cli.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Output parquet file path",
    )
    destination: Path = cli.parse_args().output
    download_and_convert(destination)


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()