Add postcode mapping

This commit is contained in:
Andras Schmelczer 2026-02-07 19:28:57 +00:00
parent e7f2d1ffc3
commit 4506263e5b
5 changed files with 966 additions and 0 deletions

View file

@ -0,0 +1,97 @@
"""Download INSPIRE Index Polygons from HM Land Registry.
Downloads GML files for all local authorities from the INSPIRE download page.
Each ZIP contains a GML file with title extent polygons for that authority.
Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
License: INSPIRE End User Licence
"""
import argparse
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import httpx
from tqdm import tqdm
BASE = "https://use-land-property-data.service.gov.uk"
INDEX_URL = f"{BASE}/datasets/inspire/download"
def get_zip_urls() -> list[str]:
    """Return the absolute URL of every ZIP linked from the INSPIRE index page.

    Fetches ``INDEX_URL``, collects each ``href`` that points at a ``.zip``
    below ``/datasets/inspire/download/``, de-duplicates and sorts the paths,
    and prefixes each one with ``BASE``.

    Returns:
        Sorted list of absolute download URLs (empty if no link matched).

    Raises:
        httpx.HTTPStatusError: If the index page answers with an error status.
    """
    zip_href = r'href="(/datasets/inspire/download/[^"]+\.zip)"'
    request_timeout = httpx.Timeout(30.0, read=60)
    request_headers = {"User-Agent": "Mozilla/5.0", "Accept": "text/html"}

    # A Client is used so cookies persist across the followed redirects;
    # the site needs a cookie jar to avoid redirect loops.
    with httpx.Client(
        follow_redirects=True,
        timeout=request_timeout,
        headers=request_headers,
    ) as session:
        index_page = session.get(INDEX_URL)
        index_page.raise_for_status()
        unique_paths = set(re.findall(zip_href, index_page.text))

    return [f"{BASE}{path}" for path in sorted(unique_paths)]
def download_one(url: str, output_dir: Path, client: httpx.Client) -> str:
    """Download a single ZIP file into ``output_dir``.

    The body is streamed to a sibling ``<name>.part`` file and only renamed to
    its final name once the transfer has completed. This keeps memory use flat
    and guarantees that an interrupted run never leaves a truncated ZIP that a
    later run would mistake for a finished download and skip.

    Args:
        url: Absolute URL of the ZIP; the last path segment is the file name.
        output_dir: Existing directory that receives the file.
        client: Shared HTTP client used to issue the request.

    Returns:
        The file name, or ``"<name> (skipped, exists)"`` when the destination
        file is already present.

    Raises:
        httpx.HTTPError: On transport failures or an error status code.
        OSError: If the file cannot be written or renamed.
    """
    name = url.rsplit("/", 1)[-1]
    dest = output_dir / name
    if dest.exists():
        return f"{name} (skipped, exists)"

    partial = dest.with_name(f"{name}.part")
    try:
        with client.stream("GET", url) as resp:
            resp.raise_for_status()
            with partial.open("wb") as fh:
                for chunk in resp.iter_bytes():
                    fh.write(chunk)
        # Same-directory rename: the final name appears only when complete.
        partial.replace(dest)
    except BaseException:
        # Also covers KeyboardInterrupt so no half-written file is left behind.
        partial.unlink(missing_ok=True)
        raise
    return name
def main() -> None:
    """Parse CLI arguments and download every INSPIRE ZIP in parallel.

    Scrapes the download index, then fetches all ZIPs into ``--output`` using
    ``--workers`` threads that share one HTTP client. A failed download does
    not abort the others: failures are collected per URL, listed at the end,
    and turned into a non-zero exit status so an incomplete dataset is never
    reported as "Done".

    Raises:
        SystemExit: If ``--workers`` is below 1, if the index page yields no
            ZIP links, or if at least one download failed.
    """
    parser = argparse.ArgumentParser(
        description="Download INSPIRE Index Polygon GML files"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for downloaded ZIPs",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=8,
        help="Number of parallel downloads (default: 8)",
    )
    args = parser.parse_args()
    if args.workers < 1:
        # ThreadPoolExecutor would otherwise fail later with a bare ValueError.
        parser.error("--workers must be at least 1")
    args.output.mkdir(parents=True, exist_ok=True)

    print("Fetching download index...")
    urls = get_zip_urls()
    print(f"Found {len(urls)} files to download")
    if not urls:
        # Zero links means the scrape failed, not that there is nothing to do.
        raise SystemExit(f"No ZIP links found at {INDEX_URL}")

    failures: list[tuple[str, Exception]] = []
    with (
        httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=120),
            headers={"User-Agent": "Mozilla/5.0"},
        ) as client,
        tqdm(total=len(urls), unit="file") as pbar,
    ):
        with ThreadPoolExecutor(max_workers=args.workers) as pool:
            futures = {
                pool.submit(download_one, url, args.output, client): url
                for url in urls
            }
            for future in as_completed(futures):
                url = futures[future]
                try:
                    result = future.result()
                except (httpx.HTTPError, OSError) as exc:
                    # Keep going; remember which URL failed and why.
                    failures.append((url, exc))
                    result = f"{url.rsplit('/', 1)[-1]} (failed)"
                pbar.set_postfix_str(result[:40])
                pbar.update(1)

    if failures:
        print(f"{len(failures)} of {len(urls)} downloads failed:")
        for url, exc in failures:
            print(f"  {url}: {exc}")
        raise SystemExit(1)
    print(f"Done. {len(urls)} files in {args.output}")
# Run the downloader only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,36 @@
"""Download Output Areas (December 2021) Boundaries EW BGC (V2).
Generalised clipped (20m) boundary polygons for 2021 Census Output Areas
covering England and Wales.
Source: https://open-geography-portalx-ons.hub.arcgis.com/datasets/ons::output-areas-december-2021-boundaries-ew-bgc-v2
License: Open Government Licence v3.0
"""
import argparse
from pathlib import Path
from pipeline.utils import download
URL = "https://open-geography-portalx-ons.hub.arcgis.com/api/download/v1/items/6beafcfd9b9c4c9993a06b6b199d7e6d/geoPackage?layers=0"
def main() -> None:
    """Download the OA 2021 boundary GeoPackage to the path given by ``--output``.

    Creates the parent directory of the output path if needed, fetches ``URL``
    through the shared ``download`` helper with a 600 timeout, and prints the
    destination once the helper returns.
    """
    cli = argparse.ArgumentParser(
        description="Download OA 2021 boundary polygons (England & Wales)"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output GeoPackage file path"
    )
    target: Path = cli.parse_args().output

    target.parent.mkdir(parents=True, exist_ok=True)
    download(URL, target, timeout=600)
    print(f"Saved to {target}")
# Run the download only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,77 @@
"""Download National Statistics UPRN Lookup (December 2025, Epoch 123).
Maps Unique Property Reference Numbers (UPRNs) to administrative and
statistical geographies (wards, output areas, LSOAs, etc.) across GB.
Source: https://geoportal.statistics.gov.uk/datasets/ons::national-statistics-uprn-lookup-december-2025-epoch-123
License: Contains Royal Mail, Ordnance Survey, and ONS data (OGL v3.0)
"""
import argparse
import tempfile
from pathlib import Path
import polars as pl
from pipeline.utils import download, extract_zip
URL = "https://www.arcgis.com/sharing/rest/content/items/4e0b4b3fbc2540caae27e7be532e61be/data"
def find_csvs(extract_path: Path) -> list[Path]:
    """Locate the CSV files inside the extracted archive.

    Searches ``extract_path`` recursively for ``NSUL_*.csv`` first; if nothing
    matches, falls back to any ``*.csv``. The files found are printed by name.

    Args:
        extract_path: Directory the archive was extracted into.

    Returns:
        The matching paths in sorted order.

    Raises:
        FileNotFoundError: If neither pattern matches any file.
    """
    # Preferred pattern first; the generic one is only a fallback.
    for pattern in ("NSUL_*.csv", "*.csv"):
        matches = sorted(extract_path.rglob(pattern))
        if matches:
            break
    else:
        raise FileNotFoundError(f"No CSV files found in {extract_path}")

    print(f"Found {len(matches)} CSV(s):")
    for match in matches:
        print(f" {match.name}")
    return matches
def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
    """Combine the regional CSVs into a single zstd-compressed Parquet file.

    Each CSV is scanned lazily, the scans are concatenated, and the result is
    written with ``sink_parquet``. The column names and the final row count
    (obtained by re-scanning the written file) are printed.

    Args:
        csv_paths: CSV files to combine, in the order they should be stacked.
        parquet_path: Destination file; its parent directory is created.
    """
    # Type inference differs between regional files for the same column
    # (e.g. ruc21ind comes out as String in most but Int64 in YH), so the
    # code columns are pinned to String to keep the schemas compatible.
    string_overrides = {
        "ruc21ind": pl.String,
        "oac21ind": pl.String,
        "imd19ind": pl.String,
    }

    def scan_region(csv_path: Path):
        return pl.scan_csv(
            csv_path, try_parse_dates=True, schema_overrides=string_overrides
        )

    combined = pl.concat([scan_region(csv_path) for csv_path in csv_paths])

    column_names = combined.collect_schema().names()
    print(f"Columns: {column_names}")

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.sink_parquet(parquet_path, compression="zstd")

    row_count = pl.scan_parquet(parquet_path).select(pl.len()).collect().item()
    print(f"Saved {row_count:,} rows to {parquet_path}")
def main() -> None:
    """Download the NSUL archive and convert it to the ``--output`` Parquet file.

    The ZIP is downloaded and extracted inside a temporary directory that is
    removed when the function finishes; only the Parquet output is kept.
    """
    cli = argparse.ArgumentParser(
        description="Download National Statistics UPRN Lookup"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    args = cli.parse_args()

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        archive = workdir / "uprn_lookup.zip"
        unpacked = workdir / "uprn_extracted"

        download(URL, archive, timeout=600)
        extract_zip(archive, unpacked)
        # Conversion must finish before the temporary directory is cleaned up.
        convert_to_parquet(find_csvs(unpacked), args.output)
# Run the download and conversion only when this file is executed as a
# script, so that importing the module has no side effects.
if __name__ == "__main__":
    main()