perfect-postcode/pipeline/download/lsoa_children.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
SPlit up
2026-06-12 21:51:37 +01:00

79 lines
2.6 KiB
Python

"""Download Census 2021 children by five-year age band per LSOA.

Source: NOMIS (ONS Census 2021 — TS007A dataset, age by five-year bands)
License: Open Government Licence v3.0

Used to estimate how many primary-age (4-10) and secondary-age (11-15)
children live in each LSOA, which drives the school catchment model. Census
bands don't align with school phases, so phase totals take fractional shares
of the 0-4, 10-14 and 15-19 bands (one fifth per single year of age).
"""
import argparse
from pathlib import Path
import polars as pl
from pipeline.utils import ENGLAND_LSOA_COUNT_2021, download_nomis_csv
# NOMIS API query for Census 2021 TS007A (age in five-year bands) at
# LSOA 2021 geography (TYPE151), restricted to the four youngest bands and
# to the three columns this script reads.
BASE_URL = "".join(
    (
        "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv",
        "?date=latest&geography=TYPE151&measures=20100&c2021_age_19=1,2,3,4",
        "&select=GEOGRAPHY_CODE,C2021_AGE_19,OBS_VALUE",
    )
)

# Output column name for each c2021_age_19 code requested above, in code
# order: 1 = 0-4, 2 = 5-9, 3 = 10-14, 4 = 15-19.
AGE_BAND_COLUMNS = dict(
    enumerate(
        ("aged_0_4", "aged_5_9", "aged_10_14", "aged_15_19"),
        start=1,
    )
)
def download_and_convert(output_path: Path) -> None:
    """Download the NOMIS age-band table and save one row per England LSOA.

    The long-format response (one row per LSOA and age-band code) is pivoted
    into one column per band in ``AGE_BAND_COLUMNS``, restricted to LSOA
    codes starting with "E", sorted by LSOA code and written as a
    zstd-compressed parquet file.

    Args:
        output_path: Destination parquet file; missing parent directories
            are created.

    Raises:
        ValueError: If the response lacks any expected age band, or if the
            number of England LSOAs differs from
            ``ENGLAND_LSOA_COUNT_2021``.
    """
    print("Downloading Census 2021 LSOA age bands from NOMIS...")
    # NOTE(review): presumably returns a polars DataFrame (it is used as
    # one below) — confirm against pipeline.utils.
    df = download_nomis_csv(BASE_URL)
    print(f"Total rows: {df.height}")

    pivoted = df.rename({"GEOGRAPHY_CODE": "lsoa21"}).pivot(
        on="C2021_AGE_19", index="lsoa21", values="OBS_VALUE"
    )

    # Validate before renaming/casting: both of those steps fail with a
    # generic column-not-found error when a band is absent, which would
    # pre-empt this clearer message. Pivoted columns are named after the
    # stringified band codes.
    missing = [
        name
        for code, name in AGE_BAND_COLUMNS.items()
        if str(code) not in pivoted.columns
    ]
    if missing:
        raise ValueError(f"NOMIS response missing age bands: {missing}")

    result = (
        pivoted.rename({str(code): name for code, name in AGE_BAND_COLUMNS.items()})
        .with_columns(
            pl.col(name).cast(pl.UInt32) for name in AGE_BAND_COLUMNS.values()
        )
        .filter(pl.col("lsoa21").str.starts_with("E"))
        .sort("lsoa21")
    )

    print(f"England LSOAs: {result.height}")
    # A short count most likely means the download was cut off, so refuse
    # to write a partial file.
    if result.height != ENGLAND_LSOA_COUNT_2021:
        raise ValueError(
            f"Expected {ENGLAND_LSOA_COUNT_2021} England LSOAs, "
            f"got {result.height}: truncated NOMIS download?"
        )

    for name in AGE_BAND_COLUMNS.values():
        print(f"  {name}: total {result[name].sum():,}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """Parse the required ``--output`` path and run the download."""
    cli = argparse.ArgumentParser(
        description="Download Census 2021 age bands (children) by LSOA"
    )
    cli.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Output parquet file path",
    )
    destination = cli.parse_args().output
    download_and_convert(destination)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()