"""Download Census 2021 median age by LSOA.

Downloads five-year age band counts (TS007A) from the NOMIS API, then computes
the median age per LSOA using linear interpolation within the median class.

Source: NOMIS (ONS Census 2021 — TS007A dataset, NM_2020_1)
License: Open Government Licence v3.0
"""

import argparse
from io import BytesIO
from pathlib import Path

import httpx
import polars as pl

# NOMIS API query for Census 2021 TS007A (age by five-year bands), one row per
# LSOA 2021 (geography TYPE151) and age band. Query parameters:
#   c2021_age_19  codes 1..18 = the 18 five-year bands (code 0 is the Total)
#   measures      20100 = absolute count
#   select        only the three columns this script reads
BASE_URL = (
    "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv"
    "?date=latest"
    "&geography=TYPE151"
    "&c2021_age_19=" + ",".join(str(code) for code in range(1, 19)) +
    "&measures=20100"
    "&select=GEOGRAPHY_CODE,C2021_AGE_19_NAME,OBS_VALUE"
)
# Rows per page; the download loop advances the record offset by this amount.
PAGE_SIZE = 25000

# Five-year age bands in ascending order, each stored as (lower_bound, width)
# for use when interpolating inside the median band:
#   (0, 5) = ages 0-4, (5, 5) = ages 5-9, ..., (80, 5) = ages 80-84.
# The final band (85 and over) is open-ended; for median purposes it is treated
# as 85-89, i.e. given the same width of 5 as every other band.
AGE_BANDS = [(lower_bound, 5) for lower_bound in range(0, 90, 5)]

# Canonical NOMIS TS007A (C2021_AGE_19_NAME) band labels, listed in the SAME
# order as AGE_BANDS so that EXPECTED_BAND_NAMES[i] describes AGE_BANDS[i].
# The pivoted columns are validated against this list and ordered by it (rather
# than by parsing label text), so a stray, relabelled or missing band fails
# loudly instead of silently pairing counts with the wrong lower bound.
#
# Generated labels: "Aged 0 to 4 years", "Aged 5 to 9 years", ...,
# "Aged 80 to 84 years", followed by the open-ended "Aged 85 years and over".
EXPECTED_BAND_NAMES = [
    f"Aged {lower} to {lower + 4} years" for lower in range(0, 85, 5)
] + ["Aged 85 years and over"]
assert len(AGE_BANDS) == len(EXPECTED_BAND_NAMES), (
    "EXPECTED_BAND_NAMES and AGE_BANDS must stay aligned 1:1"
)

# NOMIS may label a band with a wording variant that denotes the SAME age range
# as a canonical label (e.g. "Aged 4 years and under" for ages 0-4). Such
# known-equivalent labels are mapped back to the canonical name BEFORE
# validation, so a cosmetic relabel of an identical range does not block the
# build while a genuine change of bands still fails loudly.
# Only add a label here when it covers exactly the same ages as its target; a
# label for a different range (e.g. a 90-and-over top band) is a real band
# change and must NOT be aliased.
BAND_NAME_ALIASES = {
    "Aged 4 years and under": "Aged 0 to 4 years",
}
assert all(
    canonical in EXPECTED_BAND_NAMES for canonical in BAND_NAME_ALIASES.values()
), "BAND_NAME_ALIASES must map to canonical EXPECTED_BAND_NAMES"


def compute_median_age(
    counts: list[int],
    bands: "list[tuple[float, float]] | None" = None,
) -> float:
    """Estimate the median age from grouped band counts by linear interpolation.

    The median band is the first band at which the running total reaches half
    of the overall population. Within that band people are assumed to be
    spread evenly, so the median is placed proportionally between the band's
    lower bound and ``lower bound + width``.

    Args:
        counts: Population count per band, in ascending age order. ``counts[i]``
            is paired with ``bands[i]``.
        bands: ``(lower_bound, width)`` for each band, in the same order as
            ``counts``. Defaults to the module-level ``AGE_BANDS`` table, which
            keeps existing callers unchanged.

    Returns:
        The interpolated median age, or NaN when the total population is zero.
    """
    # Resolve the default lazily so the table is only looked up when needed.
    if bands is None:
        bands = AGE_BANDS

    total = sum(counts)
    if total == 0:
        # No population: a median is undefined.
        return float("nan")

    half = total / 2
    cumulative = 0
    for i, count in enumerate(counts):
        if cumulative + count >= half:
            lower_bound, width = bands[i]
            # Fraction of this band's population needed to reach the halfway
            # point, scaled by the band width.
            return lower_bound + ((half - cumulative) / count) * width
        cumulative += count

    # Defensive fallback: not reached for non-negative counts with total > 0.
    return float("nan")


def _bands_to_median_table(pivoted: pl.DataFrame) -> pl.DataFrame:
    """Validate the pivoted age-band columns, then compute median age per LSOA.

    The pivot must contain exactly the canonical NOMIS TS007A bands; a
    missing/extra/relabelled band would otherwise silently mis-align counts
    against the wrong AGE_BANDS lower bound, so we fail loudly instead.

    Args:
        pivoted: One row per LSOA with a ``GEOGRAPHY_CODE`` column plus one
            count column per age band (named by the NOMIS band label).

    Returns:
        A frame with ``lsoa21`` (the input ``GEOGRAPHY_CODE`` values) and
        ``median_age`` (Float32, rounded to one decimal place; NaN where the
        LSOA has zero population). A zero-row input yields a zero-row frame
        with the same two columns.

    Raises:
        ValueError: If the band columns (after alias normalisation) are not
            exactly ``EXPECTED_BAND_NAMES``.
    """
    # Normalise known-equivalent NOMIS label variants to their canonical name
    # before validating (renaming onto an already-present canonical column would
    # collide, so polars raises loudly in that genuinely ambiguous case).
    rename_map = {
        c: BAND_NAME_ALIASES[c] for c in pivoted.columns if c in BAND_NAME_ALIASES
    }
    if rename_map:
        pivoted = pivoted.rename(rename_map)

    # Validate the pivoted age-band columns against the canonical NOMIS set
    # BEFORE computing anything.
    band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
    found = set(band_cols)
    expected = set(EXPECTED_BAND_NAMES)
    if found != expected:
        missing = sorted(expected - found)
        unexpected = sorted(found - expected)
        raise ValueError(
            "Census age-band columns do not match the expected NOMIS TS007A bands.\n"
            f"  expected {len(EXPECTED_BAND_NAMES)} bands, found {len(band_cols)}\n"
            f"  missing:    {missing}\n"
            f"  unexpected: {unexpected}\n"
            "Refusing to compute medians against misaligned bands."
        )

    # Use the canonical order (guaranteed aligned with AGE_BANDS), not positional
    # string parsing, and treat a null band (zero-population) as 0 rather than
    # crashing on sum().
    band_cols = list(EXPECTED_BAND_NAMES)
    pivoted = pivoted.with_columns(pl.col(band_cols).fill_null(0))

    print(f"Age bands found: {len(band_cols)}")
    print(f"  First: {band_cols[0]}")
    print(f"  Last:  {band_cols[-1]}")

    # Each row tuple follows band_cols order, i.e. the AGE_BANDS order that
    # compute_median_age expects.
    medians = [
        round(compute_median_age(list(counts)), 1)
        for counts in pivoted.select(band_cols).rows()
    ]

    # Build the result from explicitly typed columns so that a zero-row input
    # still produces both columns (a frame inferred from an empty list of
    # records would have no "median_age" column to cast).
    return pl.DataFrame(
        {
            "lsoa21": pivoted.get_column("GEOGRAPHY_CODE"),
            "median_age": pl.Series("median_age", medians, dtype=pl.Float32),
        }
    )


def download_and_convert(output_path: Path) -> None:
    """Download the NOMIS age-band counts and write median age per LSOA.

    Pages through the NOMIS CSV endpoint, keeps English LSOAs (geography codes
    starting with "E"), pivots to one row per LSOA, computes the median age and
    writes the result as a zstd-compressed parquet file.

    Args:
        output_path: Destination parquet path; parent directories are created
            if they do not exist.

    Raises:
        httpx.HTTPStatusError: If any page request returns an error status.
        ValueError: If NOMIS returns no data rows, or the age-band columns do
            not match the expected bands.
    """
    print("Downloading Census 2021 age by five-year bands from NOMIS...")
    frames = []
    offset = 0
    # One client for the whole download so the connection is reused across
    # pages instead of being re-established for every request.
    with httpx.Client(follow_redirects=True, timeout=120) as client:
        while True:
            # Request the page size explicitly: the loop's stop condition and
            # offset arithmetic both assume pages of exactly PAGE_SIZE rows.
            url = f"{BASE_URL}&recordoffset={offset}&recordlimit={PAGE_SIZE}"
            response = client.get(url)
            response.raise_for_status()
            if not response.content.strip():
                break
            chunk = pl.read_csv(BytesIO(response.content))
            if chunk.height == 0:
                break
            frames.append(chunk)
            print(f"  Fetched {chunk.height} rows (offset={offset})")
            if chunk.height < PAGE_SIZE:
                # A short page is the last page.
                break
            offset += PAGE_SIZE

    if not frames:
        # Fail with a clear message rather than an opaque concat error.
        raise ValueError("NOMIS returned no data rows; cannot compute median ages.")

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    # Filter to England only
    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))

    # Pivot: one row per LSOA, columns = age band names, values = counts
    pivoted = df.pivot(
        on="C2021_AGE_19_NAME",
        index="GEOGRAPHY_CODE",
        values="OBS_VALUE",
    )

    result = _bands_to_median_table(pivoted)

    print(f"England LSOAs: {result.height}")
    print(
        f"Median age range: {result['median_age'].min()} - {result['median_age'].max()}"
    )
    print(f"Mean of medians: {result['median_age'].mean():.1f}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Command-line entry point: read ``--output`` and run the download."""
    cli = argparse.ArgumentParser(description="Download Census 2021 median age by LSOA")
    cli.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Output parquet file path",
    )
    destination = cli.parse_args().output
    download_and_convert(destination)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()