This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -3,8 +3,9 @@ import shutil
from collections import defaultdict
from pathlib import Path
import numpy as np
from pyproj import Transformer
from shapely import make_valid, set_precision
from shapely import STRtree, make_valid, set_precision
from shapely.errors import GEOSException
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
from shapely.ops import transform as transform_geometry
@ -41,30 +42,30 @@ def _largest_polygonal(geom) -> Polygon | None:
return None
def to_wgs84_geojson(
geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
# Output coordinate grid: 1e-6 degrees, i.e. about 0.11 m north-south and
# roughly 0.06-0.07 m east-west at UK latitudes. Polygons whose extent is
# below this in any direction snap to empty during serialization.
_OUTPUT_PRECISION_DEG = 0.000001
# Minimal BNG buffer (in metres) used to rescue sub-grid slivers into a
# representable footprint. A near-zero-area Voronoi/INSPIRE spike (e.g. three
# almost-collinear vertices) would otherwise vanish at output precision; since
# every *active* postcode must keep a boundary (validate_outputs enforces this
# with zero tolerance), we fatten it just enough to survive snapping rather
# than drop it.
_MIN_FOOTPRINT_BUFFER_M = 0.5
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
just the intermediate Shapely object: coordinate snapping during
serialization can otherwise leave a self-intersecting ring that only shows up
once the feature is read back from disk. Any such geometry is repaired with
``make_valid`` before returning so written features are always valid.
once the feature is read back from disk. Returns ``None`` if the geometry
collapses to empty (a sub-grid sliver).
"""
geom = _largest_polygonal(geom)
if geom is None:
return None
simplified = geom.simplify(tolerance, preserve_topology=True)
simplified = _largest_polygonal(simplified)
if simplified is None:
return None
transformer = _get_to_wgs84()
wgs84 = transform_geometry(transformer.transform, simplified)
wgs84 = transform_geometry(transformer.transform, geom_bng)
try:
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
wgs84 = set_precision(wgs84, _OUTPUT_PRECISION_DEG, mode="valid_output")
except GEOSException:
# Precision snapping can fail on pathological geometries; fall back to a
# plain validity repair without coordinate snapping.
@ -87,20 +88,105 @@ def to_wgs84_geojson(
return geojson_dict
def _rescue_footprint(geom_bng) -> dict | None:
    """Buffer a degenerate BNG geometry and snap the result to a GeoJSON dict.

    The input is buffered by ``_MIN_FOOTPRINT_BUFFER_M``, reduced with
    ``_largest_polygonal`` and handed to ``_snap_to_wgs84_geojson``.

    Returns:
        Whatever ``_snap_to_wgs84_geojson`` returns for the fattened shape, or
        ``None`` when buffering leaves no polygonal geometry.
    """
    fattened = geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M)
    polygon = _largest_polygonal(fattened)
    return None if polygon is None else _snap_to_wgs84_geojson(polygon)
def to_wgs84_geojson(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Simplify a BNG geometry, convert it to WGS84 and return a GeoJSON dict.

    Some geometries reduce to a sliver that snaps to empty at output
    precision. Rather than dropping such a postcode (the original author notes
    that validate_outputs rejects an active postcode without a boundary), the
    sliver is fattened into a minimal footprint:

    1. normal path: clean, simplify and snap;
    2. if snapping returns ``None``, buffer the simplified shape itself;
    3. if the input cannot be cleaned, or both steps above fail, buffer
       ``geom.representative_point()`` instead.

    Args:
        geom: Geometry in BNG coordinates.
        tolerance: Simplification tolerance passed to ``simplify``.

    Returns:
        A GeoJSON geometry dict, or ``None`` when ``geom`` is ``None`` or
        empty (or when even the representative-point rescue yields nothing).
    """
    if geom is None or geom.is_empty:
        return None

    polygon = _largest_polygonal(geom)
    if polygon is not None:
        candidate = _largest_polygonal(
            polygon.simplify(tolerance, preserve_topology=True)
        )
        if candidate is None:
            # Simplification destroyed the shape; continue with the cleaned one.
            candidate = polygon
        # Try the plain snap first, then fatten the sliver's real shape.
        for attempt in (_snap_to_wgs84_geojson, _rescue_footprint):
            feature = attempt(candidate)
            if feature is not None:
                return feature

    # Universal fallback for input too degenerate to clean or fatten in place.
    return _rescue_footprint(geom.representative_point())
def to_wgs84_geojson_multi(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Convert a (possibly multi-part) geometry to GeoJSON, keeping every part.

    Each part is converted independently via :func:`to_wgs84_geojson`; parts
    for which that returns a falsy value are left out.

    Args:
        geom: Polygon or MultiPolygon in BNG coordinates; ``None`` is accepted.
        tolerance: Simplification tolerance forwarded to
            :func:`to_wgs84_geojson`.

    Returns:
        The single part's dict when exactly one part survives, a
        ``MultiPolygon`` dict when several do, or ``None`` when ``geom`` is
        ``None``/empty or no part survives.
    """
    # Mirror to_wgs84_geojson: a missing or empty geometry is "no boundary",
    # not an AttributeError.
    if geom is None or geom.is_empty:
        return None

    parts = list(geom.geoms) if geom.geom_type == "MultiPolygon" else [geom]
    part_dicts = [d for part in parts if (d := to_wgs84_geojson(part, tolerance))]
    if not part_dicts:
        return None
    if len(part_dicts) == 1:
        return part_dicts[0]

    polygons = []
    for part_dict in part_dicts:
        if part_dict.get("type") == "MultiPolygon":
            # Defensive: if a part's conversion ever yields a MultiPolygon
            # (e.g. after a validity repair), splice its polygons in rather
            # than nesting them one level too deep.
            polygons.extend(part_dict["coordinates"])
        else:
            polygons.append(part_dict["coordinates"])
    return {"type": "MultiPolygon", "coordinates": polygons}
# Interior holes from the INSPIRE+Voronoi+make_valid chain are small artifacts and
# get filled. A hole at least this large is likely a genuinely enclosed postcode
# (kept, so we never solidify over a neighbour); the de-overlap pass is the real
# guarantee, this is defence-in-depth.
# Compared against the ring's polygon area in the geometry's own units --
# presumably m² (BNG metres); confirm against the caller.
_MAX_ARTIFACT_HOLE_AREA = 1000.0
def _fill_small_holes(poly: Polygon) -> Polygon:
    """Rebuild ``poly`` keeping only interior rings of at least threshold area.

    A ring is kept when the polygon built from it has an area of
    ``_MAX_ARTIFACT_HOLE_AREA`` or more; smaller rings are dropped, which
    fills those holes.
    """
    large_holes = []
    for ring in poly.interiors:
        if Polygon(ring).area >= _MAX_ARTIFACT_HOLE_AREA:
            large_holes.append(ring)
    return Polygon(poly.exterior, large_holes)
def _fill_holes(geom):
    """Fill small artifact interior rings; keep large (real-enclosed) holes.

    Polygons and each member of a MultiPolygon go through
    ``_fill_small_holes``; any other geometry type is returned unchanged.
    """
    # The superseded "remove every hole" statements were still interleaved
    # here and would have shadowed the threshold-aware behaviour below.
    if geom.geom_type == "Polygon":
        return _fill_small_holes(geom)
    if geom.geom_type == "MultiPolygon":
        return MultiPolygon([_fill_small_holes(p) for p in geom.geoms])
    return geom
def _largest_polygon(geom):
"""Extract the largest polygon from a MultiPolygon."""
if geom.geom_type == "MultiPolygon":
return max(geom.geoms, key=lambda g: g.area)
return geom
# A postcode genuinely split across an OA seam (by a railway, river, or main road
# wider than the merge buffer) arrives here as a MultiPolygon. Keeping only the
# largest part used to discard the rest, leaving ~1.8% of merged area as uncovered
# gaps (often 3000-5000 m² building blocks). Keep every part at least this big;
# smaller detached bits are Voronoi/clipping noise and are still dropped.
# The threshold is compared against each part's area in the geometry's own
# units -- presumably m² (BNG metres); confirm against the caller.
_MIN_DETACHED_PART_AREA = 100.0
def _keep_polygon_parts(geom):
"""Keep all MultiPolygon parts >= _MIN_DETACHED_PART_AREA (largest if none)."""
if geom.geom_type != "MultiPolygon":
return geom
parts = [g for g in geom.geoms if g.area >= _MIN_DETACHED_PART_AREA]
if not parts:
parts = [max(geom.geoms, key=lambda g: g.area)]
return parts[0] if len(parts) == 1 else MultiPolygon(parts)
def merge_fragments(
@ -126,14 +212,19 @@ def merge_fragments(
continue
if not combined.is_valid:
combined = make_valid(combined)
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
# Close tiny gaps between adjacent OA boundary edges (float mismatches).
# The closing can erode a tiny MultiPolygon (e.g. a postcode with only a
# sliver fragment) to nothing, which would leave the postcode with no
# geometry at all — keep the un-closed shape if that happens.
if combined.geom_type == "MultiPolygon":
combined = combined.buffer(5.0).buffer(-5.0)
if not combined.is_valid:
combined = make_valid(combined)
# Postcodes are contiguous delivery routes — keep only the largest
# polygon; small detached fragments are algorithm artifacts
combined = _largest_polygon(combined)
closed = combined.buffer(5.0).buffer(-5.0)
if not closed.is_valid:
closed = make_valid(closed)
if not closed.is_empty:
combined = closed
# Keep the postcode whole: the largest part plus any other substantial
# part (a genuine railway/river split), dropping only tiny noise slivers.
combined = _keep_polygon_parts(combined)
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
combined = _fill_holes(combined)
# Subtract parks/water if provided
@ -142,7 +233,7 @@ def merge_fragments(
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _largest_polygon(combined)
combined = _keep_polygon_parts(combined)
# Do NOT _fill_holes here: interior holes carved by the greenspace
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
# Filling them would re-add the removed area and negate the
@ -155,10 +246,114 @@ def merge_fragments(
return merged
def _polygonal(geom):
    """Reduce a geometry to its polygonal content.

    Returns:
        ``geom`` itself when it is a non-empty Polygon or MultiPolygon; the
        union of the non-empty polygonal members of a GeometryCollection; or
        ``None`` for ``None``, empty input, other geometry types, or a
        collection with nothing polygonal left.
    """
    polygonal_types = ("Polygon", "MultiPolygon")
    if geom is None or geom.is_empty:
        return None

    kind = geom.geom_type
    if kind in polygonal_types:
        return geom
    if kind != "GeometryCollection":
        return None

    members = [
        member
        for member in geom.geoms
        if member.geom_type in polygonal_types and not member.is_empty
    ]
    if not members:
        return None
    dissolved = unary_union(members)
    return None if dissolved.is_empty else dissolved
def _resolve_overlaps(
    items: list[tuple[str, Polygon | MultiPolygon]],
) -> list[tuple[str, Polygon | MultiPolygon]]:
    """Trim the postcode polygons so that no two cover the same ground.

    Overlap appears at OA seams (the 5m merge buffer expands each postcode
    independently), from simplifying each postcode on its own, and as genuine
    containment (a postcode fully enclosed by another). Each postcode is
    trimmed by the union of its higher-priority overlapping neighbours, where
    **priority = ascending area**: a smaller postcode wins contested ground.
    That single rule handles both cases correctly -- an enclosed postcode is
    always smaller than its container, so it keeps its area while the
    container gets a hole (an ``overlaps`` query alone would miss containment
    entirely).

    Intended to run last, on the final output geometries, so nothing
    re-introduces overlap afterwards. A postcode whose trimmed geometry would
    be empty keeps its untrimmed geometry, so it is never dropped.

    Args:
        items: ``(postcode, geometry)`` pairs.

    Returns:
        A list of ``(postcode, geometry)`` pairs in the same order as
        ``items``; ``items`` itself when it has fewer than two entries.
    """
    shapes = [geometry for _, geometry in items]
    count = len(shapes)
    if count < 2:
        return items

    # rank[i]: 0 = highest priority (smallest area). The postcode string
    # breaks ties so the ordering is deterministic.
    by_priority = sorted(range(count), key=lambda i: (shapes[i].area, items[i][0]))
    rank = {index: position for position, index in enumerate(by_priority)}

    index_tree = STRtree(shapes)
    queries = np.array(shapes, dtype=object)
    pairs: set[tuple[int, int]] = set()
    # "overlaps" gives partial overlaps; "contains" gives containment (which
    # "overlaps" excludes) -- together they cover every 2-D overlap without
    # the edge-touch explosion a plain "intersects" query would add.
    for predicate in ("overlaps", "contains"):
        sources, targets = index_tree.query(queries, predicate=predicate)
        for source, target in zip(sources.tolist(), targets.tolist()):
            if source == target:
                continue
            pairs.add((source, target) if source < target else (target, source))

    # For each loser (lower priority), the higher-priority neighbours to subtract.
    winners_of: dict[int, list[int]] = defaultdict(list)
    for first, second in pairs:
        if rank[first] < rank[second]:
            winners_of[second].append(first)
        else:
            winners_of[first].append(second)

    resolved = list(shapes)
    # Process losers from highest priority down, so every subtracted
    # neighbour is already finalised.
    for loser in sorted(winners_of, key=rank.__getitem__):
        blockers = unary_union([resolved[winner] for winner in winners_of[loser]])
        remainder = resolved[loser].difference(blockers)
        if not remainder.is_valid:
            remainder = make_valid(remainder)
        # Keep all polygonal parts: these geometries are in WGS84 degrees, so
        # an area threshold here would wrongly drop everything but the largest
        # part and re-open the very gaps the seam fix closed.
        remainder = _polygonal(remainder)
        if remainder is not None and not remainder.is_empty:
            resolved[loser] = remainder

    return [(postcode, resolved[i]) for i, (postcode, _) in enumerate(items)]
def _round_coords(coords, ndigits=6):
if coords and isinstance(coords[0], (int, float)):
return [round(coords[0], ndigits), round(coords[1], ndigits)]
return [_round_coords(c, ndigits) for c in coords]
def _geojson_geometry(geom) -> dict | None:
    """Serialize a polygon/multipolygon to a GeoJSON dict with rounded coordinates.

    An invalid input is first repaired with ``make_valid``; only the polygonal
    content (per ``_polygonal``) is serialized, and coordinates go through
    ``_round_coords`` with its default precision.

    Returns:
        ``{"type": ..., "coordinates": ...}``, or ``None`` when nothing
        polygonal remains.
    """
    repaired = geom if geom.is_valid else make_valid(geom)
    polygonal = _polygonal(repaired)
    if polygonal is None or polygonal.is_empty:
        return None
    raw = mapping(polygonal)
    return {"type": raw["type"], "coordinates": _round_coords(raw["coordinates"])}
def write_district_geojson(
postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
) -> int:
"""Group postcodes by district, write GeoJSON files. Returns file count."""
"""Group postcodes by district, write GeoJSON files. Returns file count.
Before writing, the postcode polygons are converted to their final WGS84 form
and made a partition (overlaps removed) so the output never has two postcodes
covering the same ground.
"""
units_dir = output_dir / "units"
tmp_units_dir = output_dir / "units.tmp"
output_dir.mkdir(parents=True, exist_ok=True)
@ -166,38 +361,46 @@ def write_district_geojson(
shutil.rmtree(tmp_units_dir)
tmp_units_dir.mkdir(parents=True)
skipped: list[str] = []
# Pass 1: convert every postcode to its final WGS84 geometry (simplify, snap,
# sliver-rescue, multi-part preserved). Sorted → deterministic de-overlap
# priority. to_wgs84_geojson_multi returns None only for a genuinely empty
# input, which is skipped and reported rather than aborting a multi-hour run.
converted: list[tuple[str, Polygon | MultiPolygon]] = []
for pc in sorted(postcodes):
gj = to_wgs84_geojson_multi(postcodes[pc])
if gj is None:
skipped.append(pc)
continue
converted.append((pc, shape(gj)))
# Remove overlap strips so the output is a clean partition.
converted = _resolve_overlaps(converted)
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
for pc, geom in postcodes.items():
for pc, geom in converted:
parts = pc.split()
district = parts[0] if parts else pc[:4]
by_district[district].append((pc, geom))
file_count = 0
seen_postcodes: set[str] = set()
for district, entries in tqdm(
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
):
features = []
for pc, geom in sorted(entries, key=lambda x: x[0]):
if pc in seen_postcodes:
raise ValueError(f"Duplicate postcode boundary feature: {pc}")
seen_postcodes.add(pc)
geojson_geom = to_wgs84_geojson(geom)
geojson_geom = _geojson_geometry(geom)
if geojson_geom is None:
raise ValueError(f"Postcode boundary collapsed to empty geometry: {pc}")
written_geom = shape(geojson_geom)
if written_geom.is_empty or not written_geom.is_valid:
raise ValueError(
f"Invalid postcode boundary geometry after output: {pc}"
)
mapit_code = pc.replace(" ", "")
skipped.append(pc)
continue
features.append(
{
"type": "Feature",
"geometry": geojson_geom,
"properties": {
"postcodes": pc,
"mapit_code": mapit_code,
"mapit_code": pc.replace(" ", ""),
},
}
)
@ -211,6 +414,14 @@ def write_district_geojson(
json.dump(collection, f, separators=(",", ":"))
file_count += 1
if skipped:
preview = ", ".join(skipped[:10])
suffix = "" if len(skipped) > 10 else ""
print(
f" Skipped {len(skipped)} postcode(s) with degenerate (sub-grid) "
f"geometry: {preview}{suffix}"
)
if units_dir.exists():
shutil.rmtree(units_dir)
tmp_units_dir.replace(units_dir)