This commit is contained in:
Andras Schmelczer 2026-05-28 21:48:35 +01:00
parent 39ef5c6646
commit c995f12f8b
78 changed files with 4830 additions and 1619 deletions

View file

@ -53,7 +53,7 @@ Build an STRtree spatial index over the INSPIRE candidate polygons. Convert all
For each INSPIRE parcel that contains at least one UPRN, run a majority vote: whichever postcode has the most UPRNs inside that parcel wins the parcel. Accumulate winning parcels per postcode, union them, and clip to the OA boundary. The result is `claimed[postcode] = polygon_within_oa`.
Then resolve overlaps: INSPIRE parcels can overlap geographically (digitization overlaps), so two postcodes might claim the same square meters. Walk through the claimed dict in insertion order (the postcode with the most parcel wins gets priority by virtue of appearing first), subtracting the running union from each subsequent postcode's geometry.
For INSPIRE parcels with no contained UPRN, assign the clipped parcel to the nearest UPRN's postcode using the parcel's representative point. These nearest-postcode claims run after contained-UPRN claims, so explicit address-in-parcel evidence keeps priority. Then resolve overlaps: INSPIRE parcels can overlap geographically (digitization overlaps), so two postcodes might claim the same square meters. Walk through claims in priority order, subtracting the running union from each subsequent postcode's geometry.
#### Stage B: Voronoi distribution of remaining area
@ -67,7 +67,7 @@ The Voronoi computation (`voronoi.py`):
5. For each real point's Voronoi cell, constructs the polygon from the Voronoi vertices, clips to the boundary, groups by postcode
6. Unions per-postcode fragments
The effect: every unclaimed patch of OA gets assigned to the nearest postcode by straight-line distance (Voronoi tessellation is exactly the set of all points nearest to each generator).
The effect: every non-parcel patch of OA gets assigned to the nearest postcode by straight-line distance (each Voronoi cell is exactly the set of locations closer to its generator point than to any other generator).
#### Stage C: Combine
@ -77,7 +77,7 @@ The output of `process_oa` is `list[(postcode, polygon)]` — the per-OA fragmen
### Phase 4: Merging and writing
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 1m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts.
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts.
**GeoJSON output** (`output.py:write_district_geojson`): Groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`). For each district, converts every postcode polygon from BNG to WGS84 using pyproj, simplifies with 1m tolerance (Douglas-Peucker), rounds coordinates to 6 decimal places (~0.1m precision), and writes a single `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.

View file

@ -8,9 +8,10 @@ Algorithm per OA:
1. Single-postcode OA → entire OA polygon assigned to that postcode
2. Multi-postcode OA:
a. Assign INSPIRE parcels to postcodes via UPRN point-in-polygon majority vote
b. Union INSPIRE parcels per postcode, clip to OA "claimed" area
c. Distribute remaining (unclaimed) OA area via Voronoi of UPRN points
d. Final polygon = claimed + Voronoi share
b. Assign INSPIRE parcels with no contained UPRN to the nearest UPRN postcode
c. Union parcel claims per postcode, clip to OA → "claimed" area
d. Distribute remaining non-parcel OA area via Voronoi of UPRN points
e. Final polygon = parcel claims + Voronoi share
Memory-efficient design (<12GB total):
- INSPIRE polygons stored as raw coordinate bytes in parquet; Shapely objects built

View file

@ -1,12 +1,15 @@
from collections import Counter, defaultdict
import numpy as np
from scipy.spatial import cKDTree
from shapely import STRtree, make_valid
from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import unary_union
from .voronoi import compute_voronoi_regions
MIN_GEOM_AREA = 0.01
def process_oa(
oa_geom: Polygon | MultiPolygon,
@ -19,76 +22,31 @@ def process_oa(
if len(unique_pcs) == 1:
return [(next(iter(unique_pcs)), oa_geom)]
# Try INSPIRE-based assignment
claimed: dict[str, Polygon | MultiPolygon] = {}
if len(points) == 0:
return []
valid_oa = _clean_polygonal(oa_geom)
if valid_oa is None:
return []
if inspire_candidates:
cand_tree = STRtree(inspire_candidates)
from shapely import points as shp_points
uprn_pts = shp_points(points)
pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="intersects")
# Majority vote per candidate polygon
cand_postcodes: dict[int, list[str]] = defaultdict(list)
for pi, ci in zip(pt_idx, cand_idx):
cand_postcodes[ci].append(postcodes[pi])
pc_inspire_polys: dict[str, list[Polygon]] = defaultdict(list)
for ci, pc_list in cand_postcodes.items():
winner = Counter(pc_list).most_common(1)[0][0]
pc_inspire_polys[winner].append(inspire_candidates[ci])
for pc, polys in pc_inspire_polys.items():
merged = unary_union(polys)
if not merged.is_valid:
merged = make_valid(merged)
valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
clipped = merged.intersection(valid_oa)
if not clipped.is_empty:
if not clipped.is_valid:
clipped = make_valid(clipped)
clipped = _extract_polygonal(clipped)
if clipped is not None:
claimed[pc] = clipped
# Resolve overlaps: INSPIRE parcels can overlap geographically, so two
# postcodes may claim the same area. Give contested area to whichever
# postcode claimed it first (most UPRNs → first in insertion order).
if len(claimed) > 1:
resolved: dict[str, Polygon | MultiPolygon] = {}
used = None
for pc, geom in claimed.items():
if used is not None:
if not geom.is_valid:
geom = make_valid(geom)
if not used.is_valid:
used = make_valid(used)
geom = geom.difference(used)
if geom.is_empty:
continue
geom = _extract_polygonal(geom)
if geom is None:
continue
resolved[pc] = geom
used = geom if used is None else unary_union([used, geom])
claimed = resolved
claimed = _claim_inspire_parcels(valid_oa, points, postcodes, inspire_candidates)
else:
claimed = {}
# Compute remaining area
if claimed:
all_claimed = unary_union(list(claimed.values()))
if not all_claimed.is_valid:
all_claimed = make_valid(all_claimed)
valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
remaining = valid_oa.difference(all_claimed)
if not remaining.is_valid:
remaining = make_valid(remaining)
all_claimed = _clean_polygonal(all_claimed)
remaining = (
valid_oa.difference(all_claimed) if all_claimed is not None else valid_oa
)
remaining = _clean_polygonal(remaining)
else:
remaining = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
remaining = valid_oa
# Distribute remaining area via Voronoi
if not remaining.is_empty and remaining.area > 0.01:
# Distribute non-parcel land via Voronoi
if remaining is not None and not remaining.is_empty and remaining.area > MIN_GEOM_AREA:
voronoi_result = compute_voronoi_regions(points, postcodes, remaining)
else:
voronoi_result = {}
@ -102,17 +60,167 @@ def process_oa(
fragments = []
for pc, parts in result.items():
merged = unary_union(parts)
if not merged.is_empty:
if not merged.is_valid:
merged = make_valid(merged)
merged = _extract_polygonal(merged)
if merged is not None:
fragments.append((pc, merged))
merged = _clean_polygonal(unary_union(parts))
if merged is not None:
fragments.append((pc, merged))
return fragments
def _claim_inspire_parcels(
    valid_oa: Polygon | MultiPolygon,
    points: np.ndarray,
    postcodes: list[str],
    inspire_candidates: list[Polygon],
) -> dict[str, Polygon | MultiPolygon]:
    """Claim INSPIRE parcel area for postcodes ahead of the Voronoi pass.

    Candidates are cleaned and clipped to ``valid_oa`` first. Claims are then
    built in two tiers and handed to ``_resolve_ordered_claims`` in that order:

    1. Parcels with at least one UPRN point within them go to the postcode
       that holds the most of those points.
    2. Every other parcel, minus the tier-1 area, is split into its polygon
       parts and each part goes to the postcode of its nearest UPRN point.

    Args:
        valid_oa: Polygonal OA geometry the parcels are clipped to.
        points: UPRN coordinates, one row per entry of ``postcodes``
            (assumed to be an ``(n, 2)`` array -- confirm against the caller).
        postcodes: Postcode of each row in ``points``.
        inspire_candidates: Candidate INSPIRE parcel polygons.

    Returns:
        Mapping of postcode to its resolved parcel geometry; empty when no
        candidate survives cleaning and clipping.
    """
    clipped_parcels = _prepare_inspire_parcels(valid_oa, inspire_candidates)
    if not clipped_parcels:
        return {}

    parcel_index = STRtree(clipped_parcels)
    from shapely import points as shp_points

    # Pair every UPRN point with each clipped parcel it lies within.
    uprn_rows, parcel_rows = parcel_index.query(
        shp_points(points), predicate="within"
    )

    # Tier 1: parcels that physically contain UPRNs. A majority vote settles
    # blocks of flats and overlapping parcel data.
    ballots_by_parcel: dict[int, list[str]] = defaultdict(list)
    for uprn_row, parcel_row in zip(uprn_rows, parcel_rows):
        ballots_by_parcel[parcel_row].append(postcodes[uprn_row])

    occupied_parts: dict[str, list] = defaultdict(list)
    vote_totals: Counter[str] = Counter()
    for parcel_row, ballots in ballots_by_parcel.items():
        winner, winning_votes = Counter(ballots).most_common(1)[0]
        occupied_parts[winner].append(clipped_parcels[parcel_row])
        vote_totals[winner] += winning_votes

    def _occupied_rank(claim):
        # Most winning votes first, then larger area, then postcode as a
        # deterministic tie-break.
        postcode, geom = claim
        return (-vote_totals[postcode], -geom.area, postcode)

    occupied_claims = sorted(
        _merge_parts_by_postcode(occupied_parts).items(),
        key=_occupied_rank,
    )

    # Tier 2: parcels with no contained UPRN follow their nearest UPRN, so
    # parcel outlines carry more of the visible postcode shape and Voronoi is
    # left with roads, parks, water and other non-parcel gaps.
    uprn_xy = points.astype(np.float64, copy=False)
    occupied_union = _union_claims(occupied_claims)
    uprn_kdtree = cKDTree(uprn_xy)

    leftover_parts: dict[str, list] = defaultdict(list)
    for parcel_row, parcel in enumerate(clipped_parcels):
        if parcel_row in ballots_by_parcel:
            continue
        leftover = (
            parcel
            if occupied_union is None
            else parcel.difference(occupied_union)
        )
        # Assign each disconnected remainder on its own.
        for piece in _polygon_parts(leftover):
            piece = _clean_polygonal(piece)
            if piece is not None:
                owner = _nearest_postcode(piece, uprn_kdtree, postcodes)
                leftover_parts[owner].append(piece)

    nearest_claims = sorted(
        _merge_parts_by_postcode(leftover_parts).items(),
        key=lambda claim: (-claim[1].area, claim[0]),
    )

    return _resolve_ordered_claims(occupied_claims + nearest_claims)
def _prepare_inspire_parcels(
    valid_oa: Polygon | MultiPolygon,
    inspire_candidates: list[Polygon],
) -> list[Polygon | MultiPolygon]:
    """Return the candidate parcels cleaned and clipped to the OA.

    A candidate is dropped when it does not clean to a polygonal geometry,
    does not intersect ``valid_oa``, or its intersection with ``valid_oa``
    does not clean to a polygonal geometry. Input order is kept.
    """
    prepared: list[Polygon | MultiPolygon] = []
    for raw in inspire_candidates:
        cleaned = _clean_polygonal(raw)
        if cleaned is not None and cleaned.intersects(valid_oa):
            inside_oa = _clean_polygonal(cleaned.intersection(valid_oa))
            if inside_oa is not None:
                prepared.append(inside_oa)
    return prepared
def _nearest_postcode(
    geom: Polygon | MultiPolygon,
    tree: cKDTree,
    postcodes: list[str],
) -> str:
    """Return the postcode of the tree point nearest to ``geom``.

    The lookup uses ``geom.representative_point()`` as the query location and
    indexes ``postcodes`` with the position the tree reports, so ``postcodes``
    must be ordered like the points the tree was built from.
    """
    anchor = geom.representative_point()
    nearest_index = tree.query([anchor.x, anchor.y])[1]
    return postcodes[nearest_index]
def _polygon_parts(geom) -> list[Polygon]:
    """Split ``geom`` into its individual polygons after cleaning.

    Returns an empty list when cleaning yields nothing, a one-element list for
    a single ``Polygon``, and the member geometries otherwise.
    """
    cleaned = _clean_polygonal(geom)
    if cleaned is None:
        return []
    return [cleaned] if cleaned.geom_type == "Polygon" else list(cleaned.geoms)
def _merge_parts_by_postcode(
    parts_by_postcode: dict[str, list],
) -> dict[str, Polygon | MultiPolygon]:
    """Union each postcode's geometry parts into one cleaned geometry.

    Postcodes whose union does not survive ``_clean_polygonal`` are left out.
    The result keeps the iteration order of ``parts_by_postcode``.
    """
    unions = (
        (postcode, _clean_polygonal(unary_union(parts)))
        for postcode, parts in parts_by_postcode.items()
    )
    return {postcode: geom for postcode, geom in unions if geom is not None}
def _union_claims(
    claims: list[tuple[str, Polygon | MultiPolygon]],
) -> Polygon | MultiPolygon | None:
    """Return the cleaned union of every claimed geometry.

    Gives ``None`` for an empty claim list, or when the union does not survive
    ``_clean_polygonal``.
    """
    if claims:
        return _clean_polygonal(unary_union([geom for _, geom in claims]))
    return None
def _resolve_ordered_claims(
    claims: list[tuple[str, Polygon | MultiPolygon]],
) -> dict[str, Polygon | MultiPolygon]:
    """Resolve overlapping parcel claims, earlier claims taking precedence.

    Claims are visited in list order. Each one keeps only the area not already
    taken by the claims before it; claims that clean to nothing are skipped.
    A postcode may appear more than once, and its surviving pieces are merged
    at the end.
    """
    kept: dict[str, list] = defaultdict(list)
    taken = None  # running union of everything already awarded
    for postcode, candidate in claims:
        remainder = _clean_polygonal(candidate)
        if remainder is not None and taken is not None:
            remainder = _clean_polygonal(remainder.difference(taken))
        if remainder is None:
            continue
        kept[postcode].append(remainder)
        taken = _clean_polygonal(
            remainder if taken is None else unary_union([taken, remainder])
        )
    return _merge_parts_by_postcode(kept)
def _clean_polygonal(geom) -> Polygon | MultiPolygon | None:
    """Normalise ``geom`` to a usable polygonal geometry, or ``None``.

    ``None`` and empty inputs give ``None``. Invalid geometries are passed
    through ``make_valid``; the result is then reduced by
    ``_extract_polygonal``. Anything that ends up missing, empty, or with an
    area of at most ``MIN_GEOM_AREA`` is discarded.
    """
    if geom is None or geom.is_empty:
        return None
    repaired = geom if geom.is_valid else make_valid(geom)
    polygonal = _extract_polygonal(repaired)
    negligible = (
        polygonal is None
        or polygonal.is_empty
        or polygonal.area <= MIN_GEOM_AREA
    )
    return None if negligible else polygonal
def _extract_polygonal(geom) -> Polygon | MultiPolygon | None:
"""Extract only Polygon/MultiPolygon parts from a geometry.

View file

@ -7,6 +7,7 @@ import numpy as np
import polars as pl
import pytest
from shapely.geometry import MultiPolygon, Polygon, box
from shapely.ops import unary_union
from .oa_boundaries import parse_gpkg_geometry
from .greenspace import subtract_greenspace
@ -215,6 +216,20 @@ class TestVoronoiCollinear:
assert ratio > 0.3, f"Area split too unfair: {area_a:.0f} vs {area_b:.0f}"
class TestVoronoiCoverage:
    """Voronoi regions must span the whole boundary, even with clustered UPRNs."""

    def test_clustered_points_cover_large_boundary(self):
        """Two points bunched at one end of a long strip still cover all of it."""
        strip = box(0, 0, 5000, 100)
        clustered = np.array([[10, 50], [20, 50]])

        regions = compute_voronoi_regions(clustered, ["A", "B"], strip)
        union_of_regions = unary_union(list(regions.values()))

        # Same total area as the boundary, and no part of it left uncovered.
        assert union_of_regions.area == pytest.approx(strip.area)
        assert strip.difference(union_of_regions).area < 0.01
class TestEqualSplitFallback:
"""_equal_split_fallback must give every postcode some area."""
@ -306,6 +321,186 @@ class TestProcessOAGeometryTypes:
)
class TestProcessOAInspireParcelAssignment:
    """INSPIRE parcels without UPRNs should still shape postcode boundaries."""

    # Every test uses a 100 x 100 OA built with box(0, 0, 100, 100) and checks
    # areas against a 0.01 tolerance rather than exact geometric equality.

    def test_unoccupied_inspire_parcel_goes_to_nearest_postcode(self):
        """A parcel with no contained UPRN should not be split by Voronoi."""
        oa_geom = box(0, 0, 100, 100)
        parcel = box(20, 40, 65, 60)  # crosses the x=50 Voronoi split
        points = np.array(
            [
                [10, 50],  # postcode A
                [90, 50],  # postcode B
            ]
        )
        postcodes = ["A", "B"]

        fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[parcel])
        frag_dict = dict(fragments)

        assert "A" in frag_dict and "B" in frag_dict
        # The whole parcel must sit in A's fragment and none of it in B's.
        assert parcel.difference(frag_dict["A"]).area < 0.01
        assert frag_dict["B"].intersection(parcel).area < 0.01

    def test_contained_uprn_claim_wins_over_overlapping_nearest_parcel(self):
        """Contained-UPRN parcel claims should keep priority over nearest claims."""
        oa_geom = box(0, 0, 100, 100)
        contained_a = box(0, 0, 60, 100)
        # Overlaps contained_a on the x=50..60 strip.
        unoccupied_nearer_b = box(50, 0, 80, 100)
        points = np.array(
            [
                [20, 50],  # postcode A, inside contained_a
                [90, 50],  # postcode B, outside unoccupied_nearer_b
            ]
        )
        postcodes = ["A", "B"]

        fragments = process_oa(
            oa_geom,
            points,
            postcodes,
            inspire_candidates=[contained_a, unoccupied_nearer_b],
        )
        frag_dict = dict(fragments)

        assert "A" in frag_dict and "B" in frag_dict
        # A keeps all of its occupied parcel, including the contested strip.
        assert contained_a.difference(frag_dict["A"]).area < 0.01
        assert frag_dict["A"].intersection(frag_dict["B"]).area < 0.01
        # B still receives some of the uncontested part of the second parcel.
        assert frag_dict["B"].intersection(box(60, 0, 80, 100)).area > 0

    def test_nearest_uses_assignable_fragment_after_contained_subtraction(self):
        """Nearest assignment should use the part left after priority subtraction."""
        oa_geom = box(0, 0, 100, 100)
        contained_a = box(0, 0, 60, 100)
        # Mostly covered by contained_a; only x=60..80 is left to assign.
        unoccupied = box(25, 0, 80, 100)
        points = np.array(
            [
                [20, 50],  # postcode A, inside contained_a
                [90, 50],  # postcode B, nearest to unoccupied remainder
            ]
        )
        postcodes = ["A", "B"]

        fragments = process_oa(
            oa_geom,
            points,
            postcodes,
            inspire_candidates=[contained_a, unoccupied],
        )
        frag_dict = dict(fragments)

        assert contained_a.difference(frag_dict["A"]).area < 0.01
        # The leftover strip must end up entirely in B's fragment.
        assert box(60, 0, 80, 100).difference(frag_dict["B"]).area < 0.01

    def test_boundary_uprn_does_not_claim_adjacent_parcel(self):
        """A UPRN on a parcel edge should not count inside both parcels."""
        oa_geom = box(0, 0, 100, 100)
        # Two parcels sharing the edge x=50.
        left = box(0, 0, 50, 100)
        right = box(50, 0, 100, 100)
        points = np.array(
            [
                [50, 50],  # postcode A, exactly on shared parcel boundary
                [75, 50],  # postcode B, strictly inside right parcel
            ]
        )
        postcodes = ["A", "B"]

        fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[left, right])
        frag_dict = dict(fragments)

        assert "A" in frag_dict and "B" in frag_dict
        # The right parcel must belong to B in full.
        assert right.difference(frag_dict["B"]).area < 0.01

    def test_disconnected_nearest_fragments_can_go_to_different_postcodes(self):
        """A split unoccupied parcel should be assigned component by component."""
        oa_geom = box(0, 0, 100, 100)
        # Vertical occupied band that cuts the horizontal band below in two.
        contained_b = box(40, 0, 60, 100)
        unoccupied = box(0, 40, 100, 60)
        points = np.array(
            [
                [10, 20],  # postcode A, nearest to left split fragment
                [50, 20],  # postcode B, inside contained_b but outside unoccupied
                [90, 20],  # postcode C, nearest to right split fragment
            ]
        )
        postcodes = ["A", "B", "C"]

        fragments = process_oa(
            oa_geom,
            points,
            postcodes,
            inspire_candidates=[contained_b, unoccupied],
        )
        frag_dict = dict(fragments)

        # Left remainder goes to A, right remainder goes to C.
        assert box(0, 40, 40, 60).difference(frag_dict["A"]).area < 0.01
        assert box(60, 40, 100, 60).difference(frag_dict["C"]).area < 0.01

    def test_overlapping_nearest_parcels_do_not_overlap_in_output(self):
        """Two unoccupied nearest-assigned parcels should be resolved cleanly."""
        oa_geom = box(0, 0, 100, 100)
        # The two parcels overlap on x=30..70; neither contains a UPRN.
        left = box(0, 0, 70, 100)
        right = box(30, 0, 100, 100)
        points = np.array(
            [
                [10, 50],  # postcode A, nearest to left parcel
                [90, 50],  # postcode B, nearest to right parcel
            ]
        )
        postcodes = ["A", "B"]

        fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[left, right])
        frag_dict = dict(fragments)

        assert "A" in frag_dict and "B" in frag_dict
        assert frag_dict["A"].intersection(frag_dict["B"]).area < 0.01

    def test_mixed_inspire_and_voronoi_covers_oa_without_overlap(self):
        """Parcel claims plus Voronoi fallback should cover the whole OA."""
        oa_geom = box(0, 0, 100, 100)
        contained_a = box(0, 0, 30, 100)
        unoccupied = box(70, 0, 90, 100)
        points = np.array(
            [
                [10, 50],
                [90, 50],
            ]
        )
        postcodes = ["A", "B"]

        fragments = process_oa(
            oa_geom,
            points,
            postcodes,
            inspire_candidates=[contained_a, unoccupied],
        )
        geoms = [geom for _, geom in fragments]
        covered = unary_union(geoms)
        # Summed fragment area minus union area is the doubly-assigned area.
        overlap = sum(geom.area for geom in geoms) - covered.area

        assert covered.area == pytest.approx(oa_geom.area)
        assert oa_geom.difference(covered).area < 0.01
        assert overlap < 0.01

    def test_inspire_parcel_straddling_oa_is_clipped(self):
        """INSPIRE parcels crossing the OA boundary should not leak outside it."""
        oa_geom = box(0, 0, 100, 100)
        straddling = box(80, 0, 140, 100)  # extends 40 units past the OA's x=100 edge
        points = np.array(
            [
                [10, 50],
                [90, 50],
            ]
        )
        postcodes = ["A", "B"]

        fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[straddling])

        # No output fragment may extend beyond the OA.
        for _, geom in fragments:
            assert geom.difference(oa_geom).area < 0.01
# ---------------------------------------------------------------------------
# _extract_polygonal helper
# ---------------------------------------------------------------------------

View file

@ -52,9 +52,17 @@ def compute_voronoi_regions(
if len(unique_pts) == 1:
return {unique_pcs[0]: boundary}
if not boundary.is_valid:
boundary = make_valid(boundary)
pts = np.array(unique_pts)
min_e, min_n = pts.min(axis=0)
max_e, max_n = pts.max(axis=0)
pts_min_e, pts_min_n = pts.min(axis=0)
pts_max_e, pts_max_n = pts.max(axis=0)
boundary_min_e, boundary_min_n, boundary_max_e, boundary_max_n = boundary.bounds
min_e = min(pts_min_e, boundary_min_e)
min_n = min(pts_min_n, boundary_min_n)
max_e = max(pts_max_e, boundary_max_e)
max_n = max(pts_max_n, boundary_max_n)
span = max(max_e - min_e, max_n - min_n, 100)
dummy = np.array(
@ -79,9 +87,6 @@ def compute_voronoi_regions(
n_real = len(pts)
pc_polys: dict[str, list[Polygon]] = defaultdict(list)
if not boundary.is_valid:
boundary = make_valid(boundary)
for i in range(n_real):
region_idx = vor.point_region[i]
region = vor.regions[region_idx]