scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@ -22,6 +22,12 @@ def main() -> None:
description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
)
parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
parser.add_argument(
"--arcgis",
type=Path,
default=None,
help="Optional ArcGIS postcode parquet used to remap terminated postcodes",
)
parser.add_argument(
"--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
)
@ -46,7 +52,7 @@ def main() -> None:
print("=" * 60)
oa_geoms = load_oa_boundaries(args.oa_boundaries)
uprn_df, uprn_offsets = load_uprns(args.uprn)
uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
# Phase 2: Parse/load INSPIRE
print()

View file

@ -0,0 +1,105 @@
"""Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons.
The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per
postcode district under ``units/{district}.geojson``, each feature carrying a
``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms
that test points against postcode geometry want those polygons back in British
National Grid (EPSG:27700) so buffers/distances are in metres.
:func:`load_postcode_polygons` reads the files, reprojects WGS84 -> EPSG:27700, repairs
invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by
postcode so callers can use the array index as a stable postcode id -- the same
"buffer index == postcode index" convention used by ``tree_density``.
"""
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import shapely
from pyproj import Transformer
def _read_district(
path: Path, transformer: Transformer
) -> tuple[np.ndarray, np.ndarray]:
"""Return (postcodes, polygons_27700) for one district GeoJSON."""
with path.open() as file:
collection = json.load(file)
features = collection.get("features", [])
if not features:
return np.empty(0, dtype=object), np.empty(0, dtype=object)
postcodes = np.array(
[feature["properties"]["postcodes"] for feature in features], dtype=object
)
geom_json = np.array(
[json.dumps(feature["geometry"]) for feature in features], dtype=object
)
geoms = shapely.from_geojson(geom_json)
# Reproject every vertex in a single pyproj call, then rebuild the polygons.
coords = shapely.get_coordinates(geoms)
if coords.size:
x, y = transformer.transform(coords[:, 0], coords[:, 1])
geoms = shapely.set_coordinates(geoms, np.column_stack([x, y]))
invalid = ~shapely.is_valid(geoms)
if invalid.any():
geoms[invalid] = shapely.make_valid(geoms[invalid])
return postcodes, geoms
def load_postcode_polygons(
    units_dir: Path, max_postcodes: int | None = None
) -> tuple[np.ndarray, np.ndarray]:
    """Load all postcode polygons under ``units_dir`` reprojected to EPSG:27700.

    Returns ``(postcodes, polygons)`` parallel object arrays sorted by postcode.
    ``max_postcodes`` (testing) keeps only the lexicographically-first N
    postcodes, reading just enough district files to reach the cap.

    Raises:
        FileNotFoundError: ``units_dir`` contains no ``*.geojson`` files.
        ValueError: none of the files read contained any features.
    """
    units_dir = Path(units_dir)
    files = sorted(units_dir.glob("*.geojson"))
    if not files:
        raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {units_dir}")

    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)

    postcode_chunks: list[np.ndarray] = []
    geom_chunks: list[np.ndarray] = []
    # Distinct postcodes read so far. Only maintained when a cap is set: the
    # cap must count unique postcodes, because duplicates are dropped below and
    # counting raw features could stop reading before N distinct ones exist.
    # Assumes the ``postcodes`` property values are hashable strings.
    seen: set[str] = set()
    for path in files:
        postcodes, geoms = _read_district(path, transformer)
        if len(postcodes) == 0:
            continue
        postcode_chunks.append(postcodes)
        geom_chunks.append(geoms)
        if max_postcodes is not None:
            seen.update(postcodes.tolist())
            if len(seen) >= max_postcodes:
                break

    if not postcode_chunks:
        raise ValueError(f"No postcode features found in {units_dir}")

    postcodes = np.concatenate(postcode_chunks)
    geoms = np.concatenate(geom_chunks)

    # Stable postcode order makes "index == postcode id" deterministic; dedupe
    # defensively (a postcode lives in exactly one district file).
    order = np.argsort(postcodes, kind="stable")
    postcodes = postcodes[order]
    geoms = geoms[order]
    _, first = np.unique(postcodes, return_index=True)
    postcodes = postcodes[first]
    geoms = geoms[first]

    # The last file read may overshoot the cap; trim to exactly N.
    if max_postcodes is not None and len(postcodes) > max_postcodes:
        postcodes = postcodes[:max_postcodes]
        geoms = geoms[:max_postcodes]

    print(f"Loaded {len(postcodes):,} postcode polygons from {units_dir}")
    return postcodes, geoms

View file

@ -121,6 +121,50 @@ class TestWhitespacePostcodes:
loaded_df, _ = load_uprns(path)
assert len(loaded_df) == 0
def test_non_english_oas_excluded(self, tmp_path):
    """Only rows whose OA code is the ``E...`` one survive ``load_uprns``."""
    uprn_file = tmp_path / "uprn.parquet"
    # One row with an "E" OA code and one with a "W" OA code.
    pl.DataFrame(
        {
            "GRIDGB1E": [500010, 300010],
            "GRIDGB1N": [180010, 220010],
            "PCDS": ["AA1 1AA", "CF1 1AA"],
            "OA21CD": ["E00000001", "W00000001"],
        }
    ).write_parquet(uprn_file)

    result_df, oa_offsets = load_uprns(uprn_file)

    kept_postcodes = result_df["PCDS"].to_list()
    assert set(oa_offsets) == {"E00000001"}
    assert kept_postcodes == ["AA1 1AA"]
def test_terminated_postcodes_are_remapped(self, tmp_path):
    """A UPRN on a terminated postcode is reported under the replacement."""
    uprn_path = tmp_path / "uprn.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Lower-case input postcode; the expected output below is upper-case.
    pl.DataFrame(
        {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["aa1 1aa"],
            "OA21CD": ["E00000001"],
        }
    ).write_parquet(uprn_path)

    # "AA1 1AA" carries a termination date; "AA1 1AB" has none.
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "east1m": [500010, 500030],
            "north1m": [180010, 180020],
            "doterm": ["2020-01-01", None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    ).write_parquet(arcgis_path)

    result_df, _unused_offsets = load_uprns(uprn_path, arcgis_path)

    remapped = result_df["PCDS"].to_list()
    assert remapped == ["AA1 1AB"]
# ---------------------------------------------------------------------------
# Bug 3: Voronoi deduplication is first-seen-wins

View file

@ -4,11 +4,18 @@ import numpy as np
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.utils.postcode_mapping import build_postcode_mapping
from .memory import release_memory
def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
def _canonical_postcode_expr(name: str) -> pl.Expr:
    """Build an expression that trims and upper-cases the column ``name``."""
    trimmed = pl.col(name).str.strip_chars()
    return trimmed.str.to_uppercase()
def load_uprns(
uprn_path: Path, arcgis_path: Path | None = None
) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
"""Load UPRNs as a sorted polars DataFrame with OA offset lookup.
Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
@ -17,29 +24,46 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
import tempfile
print("Loading UPRN lookup...")
mapping = None
if arcgis_path is not None:
mapping = (
build_postcode_mapping(arcgis_path)
.with_columns(
_canonical_postcode_expr("old_postcode").alias("old_postcode"),
_canonical_postcode_expr("new_postcode").alias("new_postcode"),
)
.unique("old_postcode")
)
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
with tempfile.NamedTemporaryFile(
suffix=".parquet", delete=False, dir=local_tmp_dir()
) as tmp:
tmp_path = Path(tmp.name)
(
uprns = (
pl.scan_parquet(uprn_path)
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
.filter(~pl.col("OA21CD").str.starts_with("S"))
.filter(pl.col("OA21CD").str.starts_with("E"))
.filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
.with_columns(pl.col("PCDS").str.strip_chars())
.with_columns(_canonical_postcode_expr("PCDS").alias("PCDS"))
.filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
.sort("OA21CD")
.sink_parquet(tmp_path)
)
if mapping is not None and mapping.height > 0:
uprns = (
uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
)
uprns.sort("OA21CD").sink_parquet(tmp_path)
release_memory()
# Read the sorted data — only one copy in memory (~2GB)
df = pl.read_parquet(tmp_path)
tmp_path.unlink()
n = len(df)
print(f" Loaded {n:,} UPRNs (England & Wales)")
print(f" Loaded {n:,} UPRNs (England)")
# Compute OA group offsets using polars (avoids 37M Python string creation)
boundary_df = (