198 lines
7.9 KiB
Python
198 lines
7.9 KiB
Python
"""Download Census 2021 ethnic group (TS021) by LSOA.

Downloads the 20-category ethnic-group breakdown (TS021, classification
C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
leaf categories into our 6 output buckets, and emits one row per LSOA with the
percentage in each bucket.

Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
~100x granularity gain with no change to the 6-bucket output schema: two very
different neighbourhoods in one borough no longer share an identical ethnicity
profile. The join key downstream (merge.py) is `lsoa21`, the same key already
used for median age and IoD.

Source: NOMIS (ONS Census 2021 — TS021 dataset, NM_2041_1)
License: Open Government Licence v3.0
"""
|
|
|
|
import argparse
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import polars as pl
|
|
|
|
# Display-only setting: a negative column count tells polars not to truncate
# columns when a DataFrame is printed. It has no effect on the data written out.
pl.Config.set_tbl_cols(-1)
|
|
|
|
# NOMIS API request for Census 2021 TS021 (ethnic group, 20 categories) at
# LSOA 2021 level (geography TYPE151).
#
# * c2021_eth_20=1..19 asks for the 19 detailed leaf categories only. The 5
#   broad aggregates (1001-1005) and the total (0) are left out because the
#   totals are re-derived from the leaves.
# * measures=20100 asks for the absolute count.
# * select=... restricts the CSV to the three columns this script reads.
_NOMIS_ENDPOINT = "https://www.nomisweb.co.uk/api/v01/dataset/NM_2041_1.data.csv"
_NOMIS_QUERY = (
    "geography=TYPE151",
    "c2021_eth_20=" + ",".join(str(code) for code in range(1, 20)),
    "measures=20100",
    "select=GEOGRAPHY_CODE,C2021_ETH_20_NAME,OBS_VALUE",
)
BASE_URL = _NOMIS_ENDPOINT + "?" + "&".join(_NOMIS_QUERY)

# Step used to advance `recordoffset` between requests; a response with fewer
# rows than this is treated as the last page.
PAGE_SIZE = 25_000
|
|
|
|
# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
# Every value must be one of the 6 output group labels ("White", "South Asian",
# "East/SE Asian", "Black", "Mixed", "Other").
#
# The split mirrors the previous Local-Authority source exactly:
#   * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
#     Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
#     Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
#     avoids inflating "% South Asian". The split is approximate (the bucket also
#     holds some South Asian groups such as Sri Lankan/Nepalese).
GROUP_MAP = {
    # White
    "White: English, Welsh, Scottish, Northern Irish or British": "White",
    "White: Irish": "White",
    "White: Gypsy or Irish Traveller": "White",
    "White: Roma": "White",
    "White: Other White": "White",
    # South Asian
    "Asian, Asian British or Asian Welsh: Indian": "South Asian",
    "Asian, Asian British or Asian Welsh: Pakistani": "South Asian",
    "Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
    # East / Southeast Asian (a single combined output bucket)
    "Asian, Asian British or Asian Welsh: Chinese": "East/SE Asian",
    "Asian, Asian British or Asian Welsh: Other Asian": "East/SE Asian",
    # Black
    "Black, Black British, Black Welsh, Caribbean or African: African": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Other Black": "Black",
    # Mixed
    "Mixed or Multiple ethnic groups: White and Asian": "Mixed",
    "Mixed or Multiple ethnic groups: White and Black African": "Mixed",
    "Mixed or Multiple ethnic groups: White and Black Caribbean": "Mixed",
    "Mixed or Multiple ethnic groups: Other Mixed or Multiple ethnic groups": "Mixed",
    # Other
    "Other ethnic group: Arab": "Other",
    "Other ethnic group: Any other ethnic group": "Other",
}
|
|
|
|
# The 6 output groups, in a fixed order. _ethnicity_percentages indexes groups
# by their position in this list when it picks the largest group to absorb the
# rounding residual, so a fixed order keeps that choice deterministic regardless
# of pivot column ordering.
OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]

# Import-time invariant: every GROUP_MAP value is an output group and every
# output group is reachable. Raised explicitly (rather than via `assert`) so
# the check still runs under `python -O`; the exception type is unchanged.
_mapped_groups = set(GROUP_MAP.values())
if _mapped_groups != set(OUTPUT_GROUPS):
    raise AssertionError(
        "GROUP_MAP values must be exactly the OUTPUT_GROUPS"
        f" (unmapped output groups: {sorted(set(OUTPUT_GROUPS) - _mapped_groups)};"
        f" unknown GROUP_MAP values: {sorted(_mapped_groups - set(OUTPUT_GROUPS))})"
    )
|
|
|
|
|
|
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
    """Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.

    A missing/extra/relabelled leaf category would silently drop people from
    the denominator, so the category set is validated against GROUP_MAP first
    and the function fails loudly otherwise.

    Args:
        df: Long-format NOMIS download with columns GEOGRAPHY_CODE,
            C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a count).

    Returns:
        One row per GEOGRAPHY_CODE, sorted by it, with column `lsoa21` followed
        by one `% <group>` column per entry of OUTPUT_GROUPS, in that order.
        Percentages are rounded to 1 decimal place and adjusted so each row
        totals 100.0. A row whose counts total zero has null percentages
        (the share is undefined) instead of NaN.

    Raises:
        ValueError: If the set of leaf labels found in `df` is not exactly the
            set of GROUP_MAP keys.
    """
    found = set(df["C2021_ETH_20_NAME"].unique().to_list())
    expected = set(GROUP_MAP)
    if found != expected:
        missing = sorted(expected - found)
        unexpected = sorted(found - expected)
        raise ValueError(
            "Census ethnic-group categories do not match the expected NOMIS "
            "TS021 C2021_ETH_20 leaf set.\n"
            f"  expected {len(expected)} categories, found {len(found)}\n"
            f"  missing: {missing}\n"
            f"  unexpected: {unexpected}\n"
            "Refusing to compute percentages against an unrecognised breakdown."
        )

    # Map each leaf to its output group and sum counts per (LSOA, group).
    # Summing counts (not rounded percentages) keeps the denominator exact.
    # The non-strict cast turns an unparseable OBS_VALUE into null, which the
    # sum then skips.
    grouped = (
        df.with_columns(
            pl.col("C2021_ETH_20_NAME").replace_strict(GROUP_MAP).alias("group"),
            pl.col("OBS_VALUE").cast(pl.Float64, strict=False).alias("_count"),
        )
        .group_by("GEOGRAPHY_CODE", "group")
        .agg(pl.col("_count").sum())
    )

    # group_by does not guarantee row order, so the pivot's column order and
    # row order can vary between runs. Pin both: columns follow OUTPUT_GROUPS
    # and rows are sorted by the join key.
    wide = (
        grouped.pivot(on="group", index="GEOGRAPHY_CODE", values="_count")
        .rename({"GEOGRAPHY_CODE": "lsoa21"})
        .select("lsoa21", *OUTPUT_GROUPS)
        .sort("lsoa21")
    )

    # A group with no rows for an LSOA is left null by the pivot; treat it as
    # 0 before normalising.
    wide = wide.with_columns(pl.col(OUTPUT_GROUPS).fill_null(0.0))

    # Normalise each row to percentages of its own total. A zero total would
    # give 0/0 = NaN in every column, so emit null for such rows instead.
    row_total = sum(pl.col(c) for c in OUTPUT_GROUPS)
    wide = wide.with_columns(
        [
            pl.when(row_total > 0)
            .then(pl.col(c) / row_total * 100.0)
            .otherwise(None)
            .alias(c)
            for c in OUTPUT_GROUPS
        ]
    )

    # Round to 1 decimal. Independent rounding of 6 values can drift the row
    # sum by up to +/-0.3, so the largest group absorbs the whole residual
    # (ties go to the earliest group in OUTPUT_GROUPS).
    wide = wide.with_columns([pl.col(c).round(1).alias(c) for c in OUTPUT_GROUPS])
    rounded_sum = sum(pl.col(c) for c in OUTPUT_GROUPS)
    residual = (100.0 - rounded_sum).round(1)
    largest_col = pl.concat_list(OUTPUT_GROUPS).list.arg_max()
    wide = wide.with_columns(
        [
            pl.when(largest_col == i)
            # Re-round after the addition to strip binary float noise such as
            # 45.300000000000004.
            .then((pl.col(c) + residual).round(1))
            .otherwise(pl.col(c))
            .alias(c)
            for i, c in enumerate(OUTPUT_GROUPS)
        ]
    )

    rename_map = {col: f"% {col}" for col in OUTPUT_GROUPS}
    return wide.rename(rename_map)
|
|
|
|
|
|
def download_and_convert(output_path: Path) -> None:
    """Download TS021 from NOMIS page by page and write per-LSOA percentages.

    Requests BASE_URL repeatedly, advancing `recordoffset` by PAGE_SIZE, until
    a response is empty or shorter than PAGE_SIZE. Rows whose GEOGRAPHY_CODE
    does not start with "E" are dropped, the rest are converted by
    _ethnicity_percentages, and the result is written as zstd-compressed
    parquet.

    Args:
        output_path: Destination parquet file. Missing parent directories are
            created.

    Raises:
        httpx.HTTPStatusError: If any page request returns an error status.
        ValueError: If the API returned no rows at all, or if the downloaded
            categories fail validation in _ethnicity_percentages.
    """
    print("Downloading Census 2021 ethnic group (TS021) by LSOA from NOMIS...")
    frames = []
    offset = 0
    # One client for all pages so the connection is reused between requests.
    with httpx.Client(follow_redirects=True, timeout=120) as client:
        while True:
            url = f"{BASE_URL}&recordoffset={offset}"
            response = client.get(url)
            response.raise_for_status()
            if not response.content:
                break
            chunk = pl.read_csv(BytesIO(response.content))
            if chunk.height == 0:
                break
            frames.append(chunk)
            print(f"  Fetched {chunk.height} rows (offset={offset})")
            # A short page marks the end of the result set.
            # NOTE(review): this assumes a full page holds PAGE_SIZE rows; the
            # request itself does not set a row limit, so confirm against the
            # NOMIS API paging rules.
            if chunk.height < PAGE_SIZE:
                break
            offset += PAGE_SIZE

    # Without this guard an empty download surfaces as an obscure error from
    # concatenating an empty list.
    if not frames:
        raise ValueError(
            "NOMIS returned no rows for the TS021 LSOA query; nothing to convert."
        )

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    # Filter to England only (E-prefixed LSOA codes); the merge joins on the
    # English postcode universe and the IoD coverage check is England-wide.
    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))

    wide = _ethnicity_percentages(df)

    print(f"England LSOAs: {wide.height}")
    print(f"Columns: {wide.columns}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point.

    Reads the required `--output` path from the command line and hands it to
    download_and_convert.
    """
    cli = argparse.ArgumentParser(
        description="Download Census 2021 ethnic group (TS021) by LSOA",
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    destination = cli.parse_args().output
    download_and_convert(destination)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|