Add postcode mapping

2026-02-07 19:28:57 +00:00 · 2026-02-07 19:28:57 +00:00 · 4506263e5b
commit 4506263e5b
parent e7f2d1ffc3
5 changed files with 966 additions and 0 deletions
--- a/pipeline/download/inspire.py
+++ b/pipeline/download/inspire.py
@ -0,0 +1,97 @@
+"""Download INSPIRE Index Polygons from HM Land Registry.
+
+Downloads GML files for all local authorities from the INSPIRE download page.
+Each ZIP contains a GML file with title extent polygons for that authority.
+
+Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
+License: INSPIRE End User Licence
+"""
+
+import argparse
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+import httpx
+from tqdm import tqdm
+
+# Scheme and host of the HM Land Registry data service. The hrefs scraped
+# from the index page are site-relative and get this prefix prepended.
+BASE = "https://use-land-property-data.service.gov.uk"
+# Page that lists the INSPIRE ZIP downloads.
+INDEX_URL = f"{BASE}/datasets/inspire/download"
+
+
+def get_zip_urls() -> list[str]:
+    """Return the absolute URL of every ZIP linked from the INSPIRE index page.
+
+    The page is fetched once, scanned for site-relative ``.zip`` hrefs under
+    ``/datasets/inspire/download/``, and the distinct matches are returned
+    in sorted order with ``BASE`` prepended.
+
+    Raises:
+        httpx.HTTPStatusError: if the index page returns an error status.
+    """
+    link_re = re.compile(r'href="(/datasets/inspire/download/[^"]+\.zip)"')
+
+    # A Client (not a one-off request) is used because the site needs a
+    # cookie jar across the redirects, otherwise it loops.
+    with httpx.Client(
+        follow_redirects=True,
+        timeout=httpx.Timeout(30.0, read=60),
+        headers={"User-Agent": "Mozilla/5.0", "Accept": "text/html"},
+    ) as session:
+        response = session.get(INDEX_URL)
+        response.raise_for_status()
+        page = response.text
+
+    # Collapse duplicate links before ordering them.
+    unique_paths = {match.group(1) for match in link_re.finditer(page)}
+    return [BASE + path for path in sorted(unique_paths)]
+
+
+def download_one(url: str, output_dir: Path, client: httpx.Client) -> str:
+    """Download a single ZIP file into ``output_dir``.
+
+    The file name is the last path segment of ``url``. If a file of that
+    name already exists in ``output_dir`` nothing is downloaded.
+
+    The body is streamed to a sibling ``<name>.part`` file and renamed to
+    its final name only after the whole response has been written. Because
+    existence of the final file is what marks a download as complete, a
+    failed or interrupted transfer must never leave a truncated file under
+    that name.
+
+    Args:
+        url: Absolute URL of the ZIP to fetch.
+        output_dir: Existing directory to write into.
+        client: Shared HTTP client used to issue the request.
+
+    Returns:
+        The file name, or ``"<name> (skipped, exists)"`` when the file was
+        already present.
+
+    Raises:
+        httpx.HTTPStatusError: if the server responds with an error status.
+    """
+    name = url.rsplit("/", 1)[-1]
+    dest = output_dir / name
+    if dest.exists():
+        return f"{name} (skipped, exists)"
+
+    partial = dest.with_name(f"{name}.part")
+    try:
+        # Stream in chunks so a large archive is never held fully in memory.
+        with client.stream("GET", url) as resp:
+            resp.raise_for_status()
+            with partial.open("wb") as fh:
+                for chunk in resp.iter_bytes():
+                    fh.write(chunk)
+        # Publish under the final name only once the file is complete.
+        partial.replace(dest)
+    finally:
+        # No-op after a successful rename; removes the leftover on failure.
+        partial.unlink(missing_ok=True)
+    return name
+
+
+def main() -> None:
+    """Command-line entry point: scrape the index, then download every ZIP.
+
+    Parses ``--output`` (required directory, created if missing) and
+    ``--workers`` (number of download threads, default 8), collects the ZIP
+    URLs and downloads them concurrently while a progress bar tracks
+    completion. An exception raised by any download propagates to the
+    caller.
+    """
+    cli = argparse.ArgumentParser(
+        description="Download INSPIRE Index Polygon GML files"
+    )
+    cli.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Output directory for downloaded ZIPs",
+    )
+    cli.add_argument(
+        "--workers",
+        type=int,
+        default=8,
+        help="Number of parallel downloads (default: 8)",
+    )
+    opts = cli.parse_args()
+
+    target_dir = opts.output
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    print("Fetching download index...")
+    zip_urls = get_zip_urls()
+    total = len(zip_urls)
+    print(f"Found {total} files to download")
+
+    # One client is shared by all worker threads; the pool is the innermost
+    # manager so it is shut down before the progress bar and client close.
+    with (
+        httpx.Client(
+            follow_redirects=True,
+            timeout=httpx.Timeout(30.0, read=120),
+            headers={"User-Agent": "Mozilla/5.0"},
+        ) as http,
+        tqdm(total=total, unit="file") as progress,
+        ThreadPoolExecutor(max_workers=opts.workers) as executor,
+    ):
+        pending = [
+            executor.submit(download_one, zip_url, target_dir, http)
+            for zip_url in zip_urls
+        ]
+        for finished in as_completed(pending):
+            # result() re-raises whatever the worker raised.
+            label = finished.result()
+            progress.set_postfix_str(label[:40])
+            progress.update(1)
+
+    print(f"Done. {total} files in {target_dir}")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/oa_boundaries.py
+++ b/pipeline/download/oa_boundaries.py
@ -0,0 +1,36 @@
+"""Download Output Areas (December 2021) Boundaries EW BGC (V2).
+
+Generalised clipped (20m) boundary polygons for 2021 Census Output Areas
+covering England and Wales.
+
+Source: https://open-geography-portalx-ons.hub.arcgis.com/datasets/ons::output-areas-december-2021-boundaries-ew-bgc-v2
+License: Open Government Licence v3.0
+"""
+
+import argparse
+from pathlib import Path
+
+from pipeline.utils import download
+
+# Download endpoint for the boundaries item; as written, the path asks for
+# the geoPackage format and the query string selects layer 0.
+URL = "https://open-geography-portalx-ons.hub.arcgis.com/api/download/v1/items/6beafcfd9b9c4c9993a06b6b199d7e6d/geoPackage?layers=0"
+
+
+def main() -> None:
+    """Command-line entry point: download the OA boundaries to ``--output``.
+
+    The parent directory of the output path is created if missing, the file
+    at ``URL`` is fetched with a 600 second timeout, and the destination is
+    printed once the download helper returns.
+    """
+    cli = argparse.ArgumentParser(
+        description="Download OA 2021 boundary polygons (England & Wales)"
+    )
+    cli.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Output GeoPackage file path",
+    )
+    destination = cli.parse_args().output
+
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    download(URL, destination, timeout=600)
+    print(f"Saved to {destination}")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/uprn_lookup.py
+++ b/pipeline/download/uprn_lookup.py
@ -0,0 +1,77 @@
+"""Download National Statistics UPRN Lookup (December 2025, Epoch 123).
+
+Maps Unique Property Reference Numbers (UPRNs) to administrative and
+statistical geographies (wards, output areas, LSOAs, etc.) across GB.
+
+Source: https://geoportal.statistics.gov.uk/datasets/ons::national-statistics-uprn-lookup-december-2025-epoch-123
+License: Contains Royal Mail, Ordnance Survey, and ONS data (OGL v3.0)
+"""
+
+import argparse
+import tempfile
+from pathlib import Path
+
+import polars as pl
+
+from pipeline.utils import download, extract_zip
+
+# ArcGIS item data endpoint for the lookup. main() treats the response as a
+# ZIP archive: it is saved as uprn_lookup.zip and handed to extract_zip.
+URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
+
+
+def find_csvs(extract_path: Path) -> list[Path]:
+    """Locate the CSV files inside the extracted archive.
+
+    Files named ``NSUL_*.csv`` anywhere below ``extract_path`` are
+    preferred; if there are none, every ``*.csv`` is used instead. The
+    result is sorted, and the count and file names are printed.
+
+    Raises:
+        FileNotFoundError: if no CSV file exists below ``extract_path``.
+    """
+    # Try the expected naming first, then fall back to any CSV at all.
+    for glob_pattern in ("NSUL_*.csv", "*.csv"):
+        matches = sorted(extract_path.rglob(glob_pattern))
+        if matches:
+            break
+    else:
+        raise FileNotFoundError(f"No CSV files found in {extract_path}")
+
+    print(f"Found {len(matches)} CSV(s):")
+    for csv_file in matches:
+        print(f"  {csv_file.name}")
+    return matches
+
+
+def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
+    """Concatenate the CSVs lazily and write them as one zstd Parquet file.
+
+    The combined column names are printed before writing, and the row count
+    of the written file is printed afterwards. The parent directory of
+    ``parquet_path`` is created if it does not exist.
+    """
+    # Type inference disagrees between regional files for some columns
+    # (e.g. ruc21ind is String in most but Int64 in YH). Pin these columns
+    # to String so the schemas line up when the frames are concatenated.
+    forced_strings = dict.fromkeys(("ruc21ind", "oac21ind", "imd19ind"), pl.String)
+
+    lazy_frames = [
+        pl.scan_csv(csv_file, try_parse_dates=True, schema_overrides=forced_strings)
+        for csv_file in csv_paths
+    ]
+    combined = pl.concat(lazy_frames)
+
+    column_names = combined.collect_schema().names()
+    print(f"Columns: {column_names}")
+
+    parquet_path.parent.mkdir(parents=True, exist_ok=True)
+    combined.sink_parquet(parquet_path, compression="zstd")
+
+    # Count rows from the file just written rather than from the inputs.
+    written = pl.scan_parquet(parquet_path)
+    row_count = written.select(pl.len()).collect().item()
+    print(f"Saved {row_count:,} rows to {parquet_path}")
+
+
+def main() -> None:
+    """Command-line entry point: download the lookup and convert it to Parquet.
+
+    The archive is downloaded and unpacked inside a temporary directory that
+    is removed on exit; only the Parquet file at ``--output`` is kept.
+    """
+    cli = argparse.ArgumentParser(
+        description="Download National Statistics UPRN Lookup"
+    )
+    cli.add_argument(
+        "--output", type=Path, required=True, help="Output parquet file path"
+    )
+    parquet_out = cli.parse_args().output
+
+    with tempfile.TemporaryDirectory() as scratch:
+        workdir = Path(scratch)
+        archive = workdir / "uprn_lookup.zip"
+        unpacked = workdir / "uprn_extracted"
+
+        download(URL, archive, timeout=600)
+        extract_zip(archive, unpacked)
+
+        convert_to_parquet(find_csvs(unpacked), parquet_out)
+
+
+if __name__ == "__main__":
+    main()