Add postcode mapping
This commit is contained in:
parent
e7f2d1ffc3
commit
4506263e5b
5 changed files with 966 additions and 0 deletions
97
pipeline/download/inspire.py
Normal file
97
pipeline/download/inspire.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
"""Download INSPIRE Index Polygons from HM Land Registry.
|
||||
|
||||
Downloads GML files for all local authorities from the INSPIRE download page.
|
||||
Each ZIP contains a GML file with title extent polygons for that authority.
|
||||
|
||||
Source: https://use-land-property-data.service.gov.uk/datasets/inspire/download
|
||||
License: INSPIRE End User Licence
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from tqdm import tqdm
|
||||
|
||||
# Scheme and host of the HM Land Registry service that hosts the downloads.
BASE = "https://use-land-property-data.service.gov.uk"
# Page that links to one ZIP per local authority.
INDEX_URL = BASE + "/datasets/inspire/download"
|
||||
|
||||
|
||||
def get_zip_urls() -> list[str]:
    """Return the absolute URL of every ZIP linked from the INSPIRE index page.

    Fetches ``INDEX_URL``, pulls out each site-relative ``href`` under
    ``/datasets/inspire/download/`` that ends in ``.zip``, de-duplicates and
    sorts those paths, and prefixes each one with ``BASE``.

    Any error raised by ``raise_for_status()`` for an unsuccessful response
    propagates to the caller.
    """
    request_headers = {"User-Agent": "Mozilla/5.0", "Accept": "text/html"}
    limits = httpx.Timeout(30.0, read=60)

    # The site requires a cookie jar to avoid redirect loops, hence a Client
    # with redirects enabled.
    with httpx.Client(
        follow_redirects=True,
        timeout=limits,
        headers=request_headers,
    ) as session:
        response = session.get(INDEX_URL)
        response.raise_for_status()
        page = response.text

    href_re = re.compile(r'href="(/datasets/inspire/download/[^"]+\.zip)"')
    unique_paths = {match.group(1) for match in href_re.finditer(page)}
    return [BASE + path for path in sorted(unique_paths)]
|
||||
|
||||
|
||||
def download_one(url: str, output_dir: Path, client: httpx.Client) -> str:
    """Download a single ZIP file into ``output_dir``.

    Args:
        url: Absolute URL of the ZIP; its last path segment becomes the
            local file name.
        output_dir: Existing directory that receives the file.
        client: Shared HTTP client used to issue the request.

    Returns:
        The file name, or ``"<name> (skipped, exists)"`` when a file with
        that name is already present in ``output_dir``.

    Raises:
        Whatever the client raises for transport errors or, via
        ``raise_for_status()``, for an unsuccessful response; ``OSError``
        if the file cannot be written.
    """
    name = url.rsplit("/", 1)[-1]
    dest = output_dir / name
    if dest.exists():
        return f"{name} (skipped, exists)"

    # Write to a sibling ".part" file and rename only once the body is
    # complete. Otherwise an interrupted run would leave a truncated ZIP at
    # ``dest`` that the exists() check above would skip forever.
    partial = dest.with_name(f"{name}.part")
    try:
        # Stream the body chunk by chunk so several parallel workers do not
        # each hold a whole archive in memory.
        with client.stream("GET", url) as resp:
            resp.raise_for_status()
            with partial.open("wb") as fh:
                for chunk in resp.iter_bytes():
                    fh.write(chunk)
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
    return name
|
||||
|
||||
|
||||
def main() -> None:
    """Download every INSPIRE ZIP listed on the index page into ``--output``.

    Command-line options:
        --output   Directory that receives the ZIPs (created if missing).
        --workers  Number of parallel download threads (default 8, minimum 1).

    Every URL is attempted even when some downloads fail. Failures are
    collected while the progress bar keeps advancing, listed once all
    downloads have finished, and reported through a non-zero exit status.
    """
    parser = argparse.ArgumentParser(
        description="Download INSPIRE Index Polygon GML files"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for downloaded ZIPs",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=8,
        help="Number of parallel downloads (default: 8)",
    )
    args = parser.parse_args()
    # ThreadPoolExecutor rejects a non-positive worker count with a bare
    # ValueError; fail early with a normal usage error instead.
    if args.workers < 1:
        parser.error("--workers must be at least 1")

    args.output.mkdir(parents=True, exist_ok=True)

    print("Fetching download index...")
    urls = get_zip_urls()
    print(f"Found {len(urls)} files to download")

    failures: list[tuple[str, Exception]] = []
    with (
        httpx.Client(
            follow_redirects=True,
            timeout=httpx.Timeout(30.0, read=120),
            headers={"User-Agent": "Mozilla/5.0"},
        ) as client,
        tqdm(total=len(urls), unit="file") as pbar,
    ):
        with ThreadPoolExecutor(max_workers=args.workers) as pool:
            futures = {
                pool.submit(download_one, url, args.output, client): url
                for url in urls
            }
            for future in as_completed(futures):
                url = futures[future]
                try:
                    result = future.result()
                except (httpx.HTTPError, OSError) as exc:
                    # Record the failure and carry on so one bad file does
                    # not hide the outcome of all the others.
                    failures.append((url, exc))
                    result = f"FAILED {url.rsplit('/', 1)[-1]}"
                pbar.set_postfix_str(result[:40])
                pbar.update(1)

    if failures:
        for url, exc in failures:
            print(f"FAILED {url}: {exc!r}")
        raise SystemExit(f"{len(failures)} of {len(urls)} downloads failed")

    print(f"Done. {len(urls)} files in {args.output}")


if __name__ == "__main__":
    main()
|
||||
36
pipeline/download/oa_boundaries.py
Normal file
36
pipeline/download/oa_boundaries.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
"""Download Output Areas (December 2021) Boundaries EW BGC (V2).
|
||||
|
||||
Generalised clipped (20m) boundary polygons for 2021 Census Output Areas
|
||||
covering England and Wales.
|
||||
|
||||
Source: https://open-geography-portalx-ons.hub.arcgis.com/datasets/ons::output-areas-december-2021-boundaries-ew-bgc-v2
|
||||
License: Open Government Licence v3.0
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from pipeline.utils import download
|
||||
|
||||
# ArcGIS Hub download endpoint for the boundaries item, requested as a
# GeoPackage (layer 0).
URL = (
    "https://open-geography-portalx-ons.hub.arcgis.com"
    "/api/download/v1/items/6beafcfd9b9c4c9993a06b6b199d7e6d/geoPackage"
    "?layers=0"
)
|
||||
|
||||
|
||||
def main() -> None:
    """Parse ``--output`` and download the OA 2021 boundaries file to it.

    The parent directory of the output path is created if needed, the file
    at ``URL`` is fetched with a 600-second timeout through the shared
    ``download`` helper, and the destination path is printed afterwards.
    """
    cli = argparse.ArgumentParser(
        description="Download OA 2021 boundary polygons (England & Wales)"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output GeoPackage file path",
    )
    target: Path = cli.parse_args().output

    # Create the destination directory up front.
    target.parent.mkdir(parents=True, exist_ok=True)
    download(URL, target, timeout=600)
    print(f"Saved to {target}")


if __name__ == "__main__":
    main()
|
||||
77
pipeline/download/uprn_lookup.py
Normal file
77
pipeline/download/uprn_lookup.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
"""Download National Statistics UPRN Lookup (December 2025, Epoch 123).
|
||||
|
||||
Maps Unique Property Reference Numbers (UPRNs) to administrative and
|
||||
statistical geographies (wards, output areas, LSOAs, etc.) across GB.
|
||||
|
||||
Source: https://geoportal.statistics.gov.uk/datasets/ons::national-statistics-uprn-lookup-december-2025-epoch-123
|
||||
License: Contains Royal Mail, Ordnance Survey, and ONS data (OGL v3.0)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils import download, extract_zip
|
||||
|
||||
# ArcGIS item "data" endpoint; main() saves the response as a ZIP and
# extracts it.
URL = (
    "https://www.arcgis.com/sharing/rest/content/items/"
    "4e0b4b3fbc2540caae27e7be532e61be/data"
)
|
||||
|
||||
|
||||
def find_csvs(extract_path: Path) -> list[Path]:
    """Locate the CSV files inside the extracted archive.

    Searches ``extract_path`` recursively for ``NSUL_*.csv`` first; if none
    match, falls back to every ``*.csv``. The sorted matches are listed on
    stdout and returned.

    Raises:
        FileNotFoundError: if neither pattern matches any file.
    """
    matches: list[Path] = []
    for glob_pattern in ("NSUL_*.csv", "*.csv"):
        matches = sorted(extract_path.rglob(glob_pattern))
        if matches:
            break
    else:
        # Neither the specific nor the generic pattern found anything.
        raise FileNotFoundError(f"No CSV files found in {extract_path}")

    print(f"Found {len(matches)} CSV(s):")
    for csv_file in matches:
        print(f" {csv_file.name}")
    return matches
|
||||
|
||||
|
||||
def convert_to_parquet(csv_paths: list[Path], parquet_path: Path) -> None:
    """Concatenate the given CSVs lazily and write one zstd-compressed Parquet file.

    Prints the combined column names before writing, creates the output's
    parent directory if needed, and afterwards prints the row count read
    back from the written file.
    """
    # Some regional files infer different types for the same column (e.g.
    # ruc21ind is String in most but Int64 in YH), so these code columns are
    # pinned to String to avoid schema mismatches when concatenating.
    string_overrides = {
        "ruc21ind": pl.String,
        "oac21ind": pl.String,
        "imd19ind": pl.String,
    }

    lazy_frames = [
        pl.scan_csv(
            csv_path,
            try_parse_dates=True,
            schema_overrides=string_overrides,
        )
        for csv_path in csv_paths
    ]
    combined = pl.concat(lazy_frames)

    column_names = combined.collect_schema().names()
    print(f"Columns: {column_names}")

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.sink_parquet(parquet_path, compression="zstd")

    # Count rows from the file just written rather than from the lazy plan.
    row_count = pl.scan_parquet(parquet_path).select(pl.len()).collect().item()
    print(f"Saved {row_count:,} rows to {parquet_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Download the UPRN lookup archive and convert its CSVs to Parquet.

    Parses ``--output``, then, inside a temporary directory, downloads the
    ZIP at ``URL`` (600-second timeout), extracts it, locates the CSVs with
    ``find_csvs`` and hands them to ``convert_to_parquet``.
    """
    cli = argparse.ArgumentParser(
        description="Download National Statistics UPRN Lookup"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    options = cli.parse_args()

    # The archive and its extracted contents are intermediates only; the
    # temporary directory is removed when the block exits.
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        archive = scratch_dir / "uprn_lookup.zip"
        unpacked = scratch_dir / "uprn_extracted"

        download(URL, archive, timeout=600)
        extract_zip(archive, unpacked)

        convert_to_parquet(find_csvs(unpacked), options.output)


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue