good changes

2026-03-25 08:04:48 +00:00 · 2026-03-25 08:04:48 +00:00 · c997ea46a5
commit c997ea46a5
parent 160283f1a1
26 changed files with 991 additions and 288 deletions
--- a/pipeline/download/geosure.py
+++ b/pipeline/download/geosure.py
@ -1,44 +0,0 @@
-"""Download OS GeoSure ground stability data (5km hex grid).
-
-Downloads the GB-Hex-5km-GeoSure dataset from Ordnance Survey as an ESRI
-Shapefile and extracts it.
-
-Source: https://osdatahub.os.uk/downloads/open/GeoSure
-License: Open Government Licence v3.0
-"""
-
-import argparse
-import tempfile
-from pathlib import Path
-
-from pipeline.utils import download, extract_zip
-
-URL = "https://api.os.uk/downloads/v1/products/GB-Hex-5km-GeoSure/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Download OS GeoSure ground stability data"
-    )
-    parser.add_argument(
-        "--output",
-        type=Path,
-        required=True,
-        help="Output directory for extracted shapefile",
-    )
-    args = parser.parse_args()
-
-    with tempfile.TemporaryDirectory() as cache_dir:
-        zip_path = Path(cache_dir) / "geosure.zip"
-
-        download(URL, zip_path, timeout=300)
-        extract_zip(zip_path, args.output)
-
-        shp_files = list(args.output.rglob("*.shp"))
-        print(f"Extracted {len(shp_files)} shapefiles to {args.output}")
-        for f in shp_files:
-            print(f"  {f.relative_to(args.output)}")
-
-
-if __name__ == "__main__":
-    main()
--- a/pipeline/download/median_age.py
+++ b/pipeline/download/median_age.py
@ -0,0 +1,140 @@
+"""Download Census 2021 median age by LSOA.
+
+Downloads five-year age band counts (TS007A) from the NOMIS API, then computes
+the median age per LSOA using linear interpolation within the median class.
+
+Source: NOMIS (ONS Census 2021 — TS007A dataset, NM_2020_1)
+License: Open Government Licence v3.0
+"""
+
+import argparse
+from io import BytesIO
+from pathlib import Path
+
+import httpx
+import polars as pl
+
+# NOMIS API: Census 2021 TS007A (age by five-year bands) by LSOA 2021 (TYPE151)
+# c2021_age_19=1..18 selects 18 five-year bands (excluding 0 = Total)
+# measures=20100 selects absolute count
+# select=... limits the CSV to the three columns read by download_and_convert.
+# The paging parameter (recordoffset) is appended per request, not stored here.
+BASE_URL = "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv?date=latest&geography=TYPE151&c2021_age_19=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18&measures=20100&select=GEOGRAPHY_CODE,C2021_AGE_19_NAME,OBS_VALUE"
+# Rows expected in a full page. download_and_convert advances recordoffset by
+# this amount and stops once a page comes back with fewer rows.
+# NOTE(review): BASE_URL does not send an explicit record limit, so this value
+# presumably mirrors the server-side rows-per-request cap — confirm against NOMIS.
+PAGE_SIZE = 25000
+
+# (lower_bound, width) for each of the 18 five-year age bands, youngest first:
+# 0-4, 5-9, 10-14, ..., 80-84, followed by the open-ended "85 and over" band.
+# The open-ended band has no real upper limit; it is given the same five-year
+# width (i.e. treated as 85-89) so a median that falls there can be interpolated.
+AGE_BANDS = [(lower_bound, 5) for lower_bound in range(0, 90, 5)]
+
+
+def compute_median_age(counts: list[int]) -> float:
+    """Estimate the median age from per-band population counts.
+
+    ``counts`` is ordered like ``AGE_BANDS`` (youngest band first). The median
+    band is the first one at which the running total reaches half of the
+    population; the result is interpolated linearly inside that band.
+
+    Returns NaN when the population is zero.
+    """
+    population = sum(counts)
+    if population == 0:
+        return float("nan")
+
+    target = population / 2
+    below = 0  # people in all bands younger than the one being examined
+    for band_index, in_band in enumerate(counts):
+        reached = below + in_band
+        if reached >= target:
+            start_age, span = AGE_BANDS[band_index]
+            # Fraction of this band's population needed to reach the midpoint.
+            return start_age + ((target - below) / in_band) * span
+        below = reached
+
+    return float("nan")
+
+
+def download_and_convert(output_path: Path) -> None:
+    """Download TS007A age-band counts from NOMIS and save median age per LSOA.
+
+    Pages through the NOMIS CSV endpoint, keeps geography codes starting with
+    "E" (England), pivots to one row per LSOA and computes the interpolated
+    median age with ``compute_median_age``.
+
+    Args:
+        output_path: Destination parquet file; parent directories are created.
+            The file has the columns ``lsoa21`` and ``median_age`` (Float32,
+            rounded to one decimal place).
+
+    Raises:
+        httpx.HTTPStatusError: If a page request returns an error status.
+        ValueError: If NOMIS returns no rows at all, or the number of age-band
+            columns does not match ``AGE_BANDS``.
+    """
+    print("Downloading Census 2021 age by five-year bands from NOMIS...")
+    frames = []
+    offset = 0
+    # One client for every page so the connection is reused between requests.
+    with httpx.Client(follow_redirects=True, timeout=120) as client:
+        while True:
+            response = client.get(f"{BASE_URL}&recordoffset={offset}")
+            response.raise_for_status()
+            if not response.content:
+                break
+            chunk = pl.read_csv(BytesIO(response.content))
+            if chunk.height == 0:
+                break
+            frames.append(chunk)
+            print(f"  Fetched {chunk.height} rows (offset={offset})")
+            # A short page means the end of the dataset has been reached.
+            if chunk.height < PAGE_SIZE:
+                break
+            offset += PAGE_SIZE
+
+    # Fail with a clear message instead of an opaque concat error.
+    if not frames:
+        raise ValueError("NOMIS returned no rows for the TS007A age-band query")
+
+    df = pl.concat(frames)
+    print(f"Total rows: {df.height}")
+
+    # Filter to England only
+    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))
+
+    # Pivot: one row per LSOA, columns = age band names, values = counts
+    pivoted = df.pivot(
+        on="C2021_AGE_19_NAME",
+        index="GEOGRAPHY_CODE",
+        values="OBS_VALUE",
+    )
+
+    # Band names are expected to look like "Aged 5 to 9 years" or
+    # "Aged 85 years and over": the second word is a number that grows with
+    # the band, so it doubles as the sort key.
+    band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
+    band_cols.sort(key=lambda c: int(c.split()[1]))
+
+    # compute_median_age maps counts to AGE_BANDS by position, so a different
+    # number of bands would silently yield wrong medians.
+    if len(band_cols) != len(AGE_BANDS):
+        raise ValueError(
+            f"Expected {len(AGE_BANDS)} age bands, found {len(band_cols)}: {band_cols}"
+        )
+
+    print(f"Age bands found: {len(band_cols)}")
+    print(f"  First: {band_cols[0]}")
+    print(f"  Last:  {band_cols[-1]}")
+
+    # Compute median age per LSOA
+    rows = pivoted.select("GEOGRAPHY_CODE", *band_cols).to_dicts()
+    medians = []
+    for row in rows:
+        counts = [row[col] for col in band_cols]
+        median = compute_median_age(counts)
+        medians.append({"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)})
+
+    result = pl.DataFrame(medians).with_columns(
+        pl.col("median_age").cast(pl.Float32),
+    )
+
+    print(f"England LSOAs: {result.height}")
+    print(f"Median age range: {result['median_age'].min()} - {result['median_age'].max()}")
+    print(f"Mean of medians: {result['median_age'].mean():.1f}")
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    result.write_parquet(output_path, compression="zstd")
+    print(f"Saved to {output_path}")
+
+
+def main() -> None:
+    """Command-line entry point: read ``--output`` and run the download."""
+    cli = argparse.ArgumentParser(
+        description="Download Census 2021 median age by LSOA",
+    )
+    cli.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Output parquet file path",
+    )
+    output_path = cli.parse_args().output
+    download_and_convert(output_path)
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/os_greenspace.py
+++ b/pipeline/download/os_greenspace.py
@ -0,0 +1,127 @@
+"""Download OS Open Greenspace and extract site centroids.
+
+Downloads the OS Open Greenspace dataset as ESRI Shapefile, computes
+WGS84 centroids for each greenspace site polygon, and outputs a parquet
+with lat/lng/category columns compatible with the POI proximity pipeline.
+
+Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
+License: Open Government Licence v3.0
+"""
+
+import argparse
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+import shapefile as shp
+from pyproj import Transformer
+from shapely.geometry import shape as to_shapely
+
+from pipeline.utils.download import download, extract_zip
+
+# OS Data Hub download endpoint for the GB-wide OpenGreenspace product in
+# ESRI Shapefile format; fetched as a zip by download_greenspace.
+URL = "https://api.os.uk/downloads/v1/products/OpenGreenspace/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
+
+# Reprojects EPSG:27700 coordinates to EPSG:4326 (WGS84). always_xy=True fixes
+# the axis order to (x, y) on both sides, so transform() takes
+# (easting, northing) and returns (lng, lat) — the order download_greenspace
+# unpacks. Built once at import time and shared by every call.
+_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
+
+
+def download_greenspace(output: Path) -> None:
+    """Download OS Open Greenspace and write one centroid row per site.
+
+    The zip is downloaded and extracted into a temporary directory, the
+    GreenspaceSite shapefile is streamed record by record, and each polygon's
+    centroid is reprojected to WGS84. Sites whose geometry is empty, invalid
+    or cannot be converted are skipped; the number skipped is printed.
+
+    Args:
+        output: Destination parquet file; parent directories are created.
+            Columns: ``lat``, ``lng`` (float64), ``category`` (the shapefile's
+            "function" attribute) and ``name`` (the attribute whose field name
+            contains "distname", or "" when absent or empty).
+
+    Raises:
+        FileNotFoundError: If the archive holds no usable site shapefile.
+        ValueError: If the shapefile has no field whose name contains "funct".
+    """
+    output.parent.mkdir(parents=True, exist_ok=True)
+
+    lats = []
+    lngs = []
+    categories = []
+    names = []
+    skipped = 0
+
+    with tempfile.TemporaryDirectory() as cache_dir:
+        zip_path = Path(cache_dir) / "greenspace.zip"
+        extract_dir = Path(cache_dir) / "extracted"
+
+        download(URL, zip_path, timeout=300)
+        extract_zip(zip_path, extract_dir)
+
+        # Find the GreenspaceSite shapefile (not the AccessPoint one)
+        shp_files = list(extract_dir.rglob("*GreenspaceSite*.shp"))
+        if not shp_files:
+            shp_files = [
+                f
+                for f in extract_dir.rglob("*.shp")
+                if "AccessPoint" not in f.name
+            ]
+        if not shp_files:
+            raise FileNotFoundError(
+                "No GreenspaceSite shapefile found in download"
+            )
+
+        print(f"Reading {shp_files[0].name}...")
+        # The context manager closes the reader's file handles before the
+        # temporary directory is deleted.
+        with shp.Reader(str(shp_files[0]), encoding="latin-1") as reader:
+            field_names = [f[0] for f in reader.fields[1:]]  # skip deletion flag
+
+            # Index of the "function" field (greenspace type); required.
+            func_idx = next(
+                (i for i, name in enumerate(field_names) if "funct" in name.lower()),
+                None,
+            )
+            if func_idx is None:
+                raise ValueError(
+                    f"No 'function' field found. Available: {field_names}"
+                )
+
+            # Index of a site-name field; optional.
+            name_idx = next(
+                (i for i, name in enumerate(field_names) if "distname" in name.lower()),
+                None,
+            )
+
+            # Stream records instead of loading the whole dataset into memory.
+            for sr in reader.iterShapeRecords():
+                func = sr.record[func_idx]
+                site_name = sr.record[name_idx] if name_idx is not None else ""
+
+                try:
+                    geom = to_shapely(sr.shape.__geo_interface__)
+                    if geom.is_empty or not geom.is_valid:
+                        skipped += 1
+                        continue
+                    centroid = geom.centroid
+                    lng, lat = _to_wgs84.transform(centroid.x, centroid.y)
+                except Exception:
+                    # Best effort: one unreadable geometry must not abort the
+                    # whole run, but it is counted so the loss is visible.
+                    skipped += 1
+                    continue
+
+                lats.append(lat)
+                lngs.append(lng)
+                categories.append(func)
+                names.append(site_name or "")
+
+    if skipped:
+        print(f"Skipped {skipped:,} sites with empty, invalid or unreadable geometry")
+
+    df = pl.DataFrame(
+        {
+            "lat": np.array(lats, dtype=np.float64),
+            "lng": np.array(lngs, dtype=np.float64),
+            "category": categories,
+            "name": names,
+        }
+    )
+
+    df.write_parquet(output)
+    size_mb = output.stat().st_size / (1024 * 1024)
+    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} greenspace sites)")
+
+    counts = df.group_by("category").len().sort("len", descending=True)
+    for row in counts.iter_rows(named=True):
+        print(f"  {row['category']}: {row['len']:,}")
+
+
+def main() -> None:
+    """Command-line entry point: read ``--output`` and build the parquet file."""
+    cli = argparse.ArgumentParser(
+        description="Download OS Open Greenspace site centroids",
+    )
+    cli.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Output parquet file path",
+    )
+    output_path = cli.parse_args().output
+    download_greenspace(output_path)
+
+
+if __name__ == "__main__":
+    main()