428 lines
17 KiB
Python
428 lines
17 KiB
Python
import json
|
|
import shutil
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
from pyproj import Transformer
|
|
from shapely import STRtree, make_valid, set_precision
|
|
from shapely.errors import GEOSException
|
|
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
|
from shapely.ops import transform as transform_geometry
|
|
from shapely.ops import unary_union
|
|
from tqdm import tqdm
|
|
|
|
_to_wgs84 = None
|
|
|
|
|
|
def _get_to_wgs84():
|
|
global _to_wgs84
|
|
if _to_wgs84 is None:
|
|
_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
|
return _to_wgs84
|
|
|
|
|
|
def _largest_polygonal(geom) -> Polygon | None:
|
|
if geom is None or geom.is_empty:
|
|
return None
|
|
if not geom.is_valid:
|
|
geom = make_valid(geom)
|
|
if geom.geom_type == "Polygon":
|
|
return geom
|
|
if geom.geom_type == "MultiPolygon":
|
|
return max(geom.geoms, key=lambda g: g.area)
|
|
if geom.geom_type == "GeometryCollection":
|
|
polygons = [
|
|
polygon
|
|
for part in geom.geoms
|
|
if (polygon := _largest_polygonal(part)) is not None
|
|
]
|
|
if polygons:
|
|
return max(polygons, key=lambda g: g.area)
|
|
return None
|
|
|
|
|
|
# Output coordinate grid (~0.11 m at UK latitudes). Polygons whose extent is
|
|
# below this in any direction snap to empty during serialization.
|
|
_OUTPUT_PRECISION_DEG = 0.000001
|
|
# Minimal BNG buffer used to rescue sub-grid slivers into a representable
|
|
# footprint. A near-zero-area Voronoi/INSPIRE spike (e.g. three almost-collinear
|
|
# vertices) would otherwise vanish at output precision; since every *active*
|
|
# postcode must keep a boundary (validate_outputs enforces this with zero
|
|
# tolerance), we fatten it just enough to survive snapping rather than drop it.
|
|
_MIN_FOOTPRINT_BUFFER_M = 0.5
|
|
|
|
|
|
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
|
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
|
|
|
|
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
|
|
just the intermediate Shapely object: coordinate snapping during
|
|
serialization can otherwise leave a self-intersecting ring that only shows up
|
|
once the feature is read back from disk. Returns ``None`` if the geometry
|
|
collapses to empty (a sub-grid sliver).
|
|
"""
|
|
transformer = _get_to_wgs84()
|
|
wgs84 = transform_geometry(transformer.transform, geom_bng)
|
|
try:
|
|
wgs84 = set_precision(wgs84, _OUTPUT_PRECISION_DEG, mode="valid_output")
|
|
except GEOSException:
|
|
# Precision snapping can fail on pathological geometries; fall back to a
|
|
# plain validity repair without coordinate snapping.
|
|
wgs84 = make_valid(wgs84)
|
|
wgs84 = _largest_polygonal(wgs84)
|
|
if wgs84 is None:
|
|
return None
|
|
|
|
geojson_dict = mapping(wgs84)
|
|
|
|
# The geometry that actually reaches disk is the GeoJSON dict, so validate
|
|
# *that* (not the pre-serialization object) and repair if needed.
|
|
round_trip = shape(geojson_dict)
|
|
if round_trip.is_empty or not round_trip.is_valid:
|
|
round_trip = _largest_polygonal(make_valid(round_trip))
|
|
if round_trip is None or round_trip.is_empty:
|
|
return None
|
|
geojson_dict = mapping(round_trip)
|
|
|
|
return geojson_dict
|
|
|
|
|
|
def _rescue_footprint(geom_bng) -> dict | None:
|
|
"""Fatten a degenerate BNG geometry into a representable footprint and snap."""
|
|
footprint = _largest_polygonal(geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M))
|
|
if footprint is None:
|
|
return None
|
|
return _snap_to_wgs84_geojson(footprint)
|
|
|
|
|
|
def to_wgs84_geojson(
|
|
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
|
) -> dict | None:
|
|
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
|
|
|
|
A few thousand postcodes reduce to a sub-grid sliver that snaps to empty at
|
|
output precision. Dropping them would leave an active postcode with no
|
|
boundary (validate_outputs rejects that with zero tolerance), so instead they
|
|
are fattened into a minimal footprint at the right location: first by buffering
|
|
the (often elongated) sliver itself, then -- for fully-degenerate input -- a
|
|
small disc around ``representative_point()``, which lies inside any non-empty
|
|
geometry. ``None`` is returned only for a genuinely empty input.
|
|
"""
|
|
if geom is None or geom.is_empty:
|
|
return None
|
|
|
|
cleaned = _largest_polygonal(geom)
|
|
if cleaned is not None:
|
|
simplified = _largest_polygonal(
|
|
cleaned.simplify(tolerance, preserve_topology=True)
|
|
)
|
|
if simplified is None:
|
|
simplified = cleaned
|
|
# Normal path; if snapping erases a thin sliver, fatten its real shape.
|
|
result = _snap_to_wgs84_geojson(simplified)
|
|
if result is None:
|
|
result = _rescue_footprint(simplified)
|
|
if result is not None:
|
|
return result
|
|
|
|
# Universal fallback for input too degenerate to clean or fatten in place.
|
|
return _rescue_footprint(geom.representative_point())
|
|
|
|
|
|
def to_wgs84_geojson_multi(
|
|
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
|
) -> dict | None:
|
|
"""Convert a (possibly multi-part) postcode geometry to a GeoJSON dict,
|
|
preserving every part. Each part is simplified/snapped/rescued independently
|
|
via :func:`to_wgs84_geojson`; the result is a ``Polygon`` for a single part or
|
|
a ``MultiPolygon`` for several. ``None`` only if every part is degenerate.
|
|
"""
|
|
parts = list(geom.geoms) if geom.geom_type == "MultiPolygon" else [geom]
|
|
part_dicts = [d for part in parts if (d := to_wgs84_geojson(part, tolerance))]
|
|
if not part_dicts:
|
|
return None
|
|
if len(part_dicts) == 1:
|
|
return part_dicts[0]
|
|
return {
|
|
"type": "MultiPolygon",
|
|
"coordinates": [pd["coordinates"] for pd in part_dicts],
|
|
}
|
|
|
|
|
|
# Interior holes from the INSPIRE+Voronoi+make_valid chain are small artifacts and
|
|
# get filled. A hole at least this large is likely a genuinely enclosed postcode
|
|
# (kept, so we never solidify over a neighbour); the de-overlap pass is the real
|
|
# guarantee, this is defence-in-depth.
|
|
_MAX_ARTIFACT_HOLE_AREA = 1000.0
|
|
|
|
|
|
def _fill_small_holes(poly: Polygon) -> Polygon:
|
|
kept = [r for r in poly.interiors if Polygon(r).area >= _MAX_ARTIFACT_HOLE_AREA]
|
|
return Polygon(poly.exterior, kept)
|
|
|
|
|
|
def _fill_holes(geom):
|
|
"""Fill small artifact interior rings; keep large (real-enclosed) holes."""
|
|
if geom.geom_type == "Polygon":
|
|
return _fill_small_holes(geom)
|
|
elif geom.geom_type == "MultiPolygon":
|
|
return MultiPolygon([_fill_small_holes(p) for p in geom.geoms])
|
|
return geom
|
|
|
|
|
|
# A postcode genuinely split across an OA seam (by a railway, river, or main road
|
|
# wider than the merge buffer) arrives here as a MultiPolygon. Keeping only the
|
|
# largest part used to discard the rest, leaving ~1.8% of merged area as uncovered
|
|
# gaps (often 3000-5000 m² building blocks). Keep every part at least this big;
|
|
# smaller detached bits are Voronoi/clipping noise and are still dropped.
|
|
_MIN_DETACHED_PART_AREA = 100.0
|
|
|
|
|
|
def _keep_polygon_parts(geom):
|
|
"""Keep all MultiPolygon parts >= _MIN_DETACHED_PART_AREA (largest if none)."""
|
|
if geom.geom_type != "MultiPolygon":
|
|
return geom
|
|
parts = [g for g in geom.geoms if g.area >= _MIN_DETACHED_PART_AREA]
|
|
if not parts:
|
|
parts = [max(geom.geoms, key=lambda g: g.area)]
|
|
return parts[0] if len(parts) == 1 else MultiPolygon(parts)
|
|
|
|
|
|
def merge_fragments(
|
|
all_fragments: list[tuple[str, Polygon | MultiPolygon]],
|
|
greenspace_tree=None,
|
|
greenspace_geoms=None,
|
|
) -> dict[str, Polygon | MultiPolygon]:
|
|
"""Merge cross-OA fragments for postcodes spanning multiple OAs.
|
|
|
|
Args:
|
|
all_fragments: List of (postcode, geometry) pairs.
|
|
greenspace_tree: Optional STRtree of park/water polygons.
|
|
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
|
|
"""
|
|
by_postcode: dict[str, list] = defaultdict(list)
|
|
for pc, geom in all_fragments:
|
|
by_postcode[pc].append(geom)
|
|
|
|
merged = {}
|
|
for pc, parts in by_postcode.items():
|
|
combined = unary_union(parts)
|
|
if combined.is_empty:
|
|
continue
|
|
if not combined.is_valid:
|
|
combined = make_valid(combined)
|
|
# Close tiny gaps between adjacent OA boundary edges (float mismatches).
|
|
# The closing can erode a tiny MultiPolygon (e.g. a postcode with only a
|
|
# sliver fragment) to nothing, which would leave the postcode with no
|
|
# geometry at all — keep the un-closed shape if that happens.
|
|
if combined.geom_type == "MultiPolygon":
|
|
closed = combined.buffer(5.0).buffer(-5.0)
|
|
if not closed.is_valid:
|
|
closed = make_valid(closed)
|
|
if not closed.is_empty:
|
|
combined = closed
|
|
# Keep the postcode whole: the largest part plus any other substantial
|
|
# part (a genuine railway/river split), dropping only tiny noise slivers.
|
|
combined = _keep_polygon_parts(combined)
|
|
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
|
combined = _fill_holes(combined)
|
|
# Subtract parks/water if provided
|
|
if greenspace_tree is not None and greenspace_geoms is not None:
|
|
from .greenspace import subtract_greenspace
|
|
|
|
pre_green = combined
|
|
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
|
combined = _keep_polygon_parts(combined)
|
|
# Do NOT _fill_holes here: interior holes carved by the greenspace
|
|
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
|
|
# Filling them would re-add the removed area and negate the
|
|
# subtraction. Artifact holes from the INSPIRE+Voronoi+make_valid
|
|
# chain were already removed by the _fill_holes above (pre-subtraction).
|
|
# Revert if subtraction + fragment selection lost >90% of area
|
|
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
|
|
combined = pre_green
|
|
merged[pc] = combined
|
|
return merged
|
|
|
|
|
|
def _polygonal(geom):
|
|
"""Return only the polygonal part(s) of a geometry, or None if none remain."""
|
|
if geom is None or geom.is_empty:
|
|
return None
|
|
if geom.geom_type in ("Polygon", "MultiPolygon"):
|
|
return geom
|
|
if geom.geom_type == "GeometryCollection":
|
|
polys = [
|
|
g
|
|
for g in geom.geoms
|
|
if g.geom_type in ("Polygon", "MultiPolygon") and not g.is_empty
|
|
]
|
|
if not polys:
|
|
return None
|
|
merged = unary_union(polys)
|
|
return merged if not merged.is_empty else None
|
|
return None
|
|
|
|
|
|
def _resolve_overlaps(
|
|
items: list[tuple[str, Polygon | MultiPolygon]],
|
|
) -> list[tuple[str, Polygon | MultiPolygon]]:
|
|
"""Make the postcode polygons a partition: no two cover the same ground.
|
|
|
|
Overlap appears at OA seams (the 5m merge buffer expands each postcode
|
|
independently), from simplifying each postcode on its own, and as genuine
|
|
containment (a postcode fully enclosed by another). Each postcode is trimmed
|
|
by the union of its higher-priority overlapping neighbours, where **priority =
|
|
ascending area**: a smaller postcode wins contested ground. That single rule
|
|
handles both cases correctly — an enclosed postcode is always smaller than its
|
|
container, so it keeps its area while the container gets a hole (a `overlaps`
|
|
query alone would miss containment entirely). Run last, on the final output
|
|
geometries, so nothing re-introduces overlap afterwards. A postcode that would
|
|
be emptied keeps its original geometry, so an active postcode is never dropped.
|
|
"""
|
|
geoms = [g for _, g in items]
|
|
n = len(geoms)
|
|
if n < 2:
|
|
return items
|
|
|
|
# rank[i]: 0 = highest priority (smallest area). Postcode string breaks ties
|
|
# for determinism.
|
|
rank = {
|
|
idx: r
|
|
for r, idx in enumerate(
|
|
sorted(range(n), key=lambda i: (geoms[i].area, items[i][0]))
|
|
)
|
|
}
|
|
|
|
tree = STRtree(geoms)
|
|
arr = np.array(geoms, dtype=object)
|
|
pairs: set[tuple[int, int]] = set()
|
|
# "overlaps" gives partial overlaps; "contains" gives containment (which
|
|
# "overlaps" excludes) — together they cover every 2-D overlap without the
|
|
# edge-touch explosion a plain "intersects" query would add.
|
|
for predicate in ("overlaps", "contains"):
|
|
qsrc, qtgt = tree.query(arr, predicate=predicate)
|
|
for s, t in zip(qsrc.tolist(), qtgt.tolist()):
|
|
if s != t:
|
|
pairs.add((s, t) if s < t else (t, s))
|
|
|
|
# For each loser (lower priority) the higher-priority neighbours to subtract.
|
|
higher: dict[int, list[int]] = defaultdict(list)
|
|
for a, b in pairs:
|
|
winner, loser = (a, b) if rank[a] < rank[b] else (b, a)
|
|
higher[loser].append(winner)
|
|
|
|
out = list(geoms)
|
|
# Process losers from highest priority down, so every subtracted neighbour is
|
|
# already finalised.
|
|
for i in sorted(higher, key=lambda idx: rank[idx]):
|
|
cut = unary_union([out[j] for j in higher[i]])
|
|
trimmed = out[i].difference(cut)
|
|
if not trimmed.is_valid:
|
|
trimmed = make_valid(trimmed)
|
|
# Keep all polygonal parts: these geometries are in WGS84 degrees, so an
|
|
# area threshold here would wrongly drop everything but the largest part
|
|
# and re-open the very gaps the seam fix closed.
|
|
trimmed = _polygonal(trimmed)
|
|
if trimmed is not None and not trimmed.is_empty:
|
|
out[i] = trimmed
|
|
return [(pc, out[i]) for i, (pc, _) in enumerate(items)]
|
|
|
|
|
|
def _round_coords(coords, ndigits=6):
|
|
if coords and isinstance(coords[0], (int, float)):
|
|
return [round(coords[0], ndigits), round(coords[1], ndigits)]
|
|
return [_round_coords(c, ndigits) for c in coords]
|
|
|
|
|
|
def _geojson_geometry(geom) -> dict | None:
|
|
"""Serialize a WGS84 polygon/multipolygon to a 6dp GeoJSON dict, or None."""
|
|
geom = _polygonal(geom if geom.is_valid else make_valid(geom))
|
|
if geom is None or geom.is_empty:
|
|
return None
|
|
gj = mapping(geom)
|
|
return {"type": gj["type"], "coordinates": _round_coords(gj["coordinates"])}
|
|
|
|
|
|
def write_district_geojson(
|
|
postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
|
|
) -> int:
|
|
"""Group postcodes by district, write GeoJSON files. Returns file count.
|
|
|
|
Before writing, the postcode polygons are converted to their final WGS84 form
|
|
and made a partition (overlaps removed) so the output never has two postcodes
|
|
covering the same ground.
|
|
"""
|
|
units_dir = output_dir / "units"
|
|
tmp_units_dir = output_dir / "units.tmp"
|
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
if tmp_units_dir.exists():
|
|
shutil.rmtree(tmp_units_dir)
|
|
tmp_units_dir.mkdir(parents=True)
|
|
|
|
skipped: list[str] = []
|
|
|
|
# Pass 1: convert every postcode to its final WGS84 geometry (simplify, snap,
|
|
# sliver-rescue, multi-part preserved). Sorted → deterministic de-overlap
|
|
# priority. to_wgs84_geojson_multi returns None only for a genuinely empty
|
|
# input, which is skipped and reported rather than aborting a multi-hour run.
|
|
converted: list[tuple[str, Polygon | MultiPolygon]] = []
|
|
for pc in sorted(postcodes):
|
|
gj = to_wgs84_geojson_multi(postcodes[pc])
|
|
if gj is None:
|
|
skipped.append(pc)
|
|
continue
|
|
converted.append((pc, shape(gj)))
|
|
|
|
# Remove overlap strips so the output is a clean partition.
|
|
converted = _resolve_overlaps(converted)
|
|
|
|
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
|
|
for pc, geom in converted:
|
|
parts = pc.split()
|
|
district = parts[0] if parts else pc[:4]
|
|
by_district[district].append((pc, geom))
|
|
|
|
file_count = 0
|
|
for district, entries in tqdm(
|
|
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
|
|
):
|
|
features = []
|
|
for pc, geom in sorted(entries, key=lambda x: x[0]):
|
|
geojson_geom = _geojson_geometry(geom)
|
|
if geojson_geom is None:
|
|
skipped.append(pc)
|
|
continue
|
|
features.append(
|
|
{
|
|
"type": "Feature",
|
|
"geometry": geojson_geom,
|
|
"properties": {
|
|
"postcodes": pc,
|
|
"mapit_code": pc.replace(" ", ""),
|
|
},
|
|
}
|
|
)
|
|
|
|
if not features:
|
|
continue
|
|
|
|
collection = {"type": "FeatureCollection", "features": features}
|
|
out_path = tmp_units_dir / f"{district}.geojson"
|
|
with open(out_path, "w") as f:
|
|
json.dump(collection, f, separators=(",", ":"))
|
|
file_count += 1
|
|
|
|
if skipped:
|
|
preview = ", ".join(skipped[:10])
|
|
suffix = " …" if len(skipped) > 10 else ""
|
|
print(
|
|
f" Skipped {len(skipped)} postcode(s) with degenerate (sub-grid) "
|
|
f"geometry: {preview}{suffix}"
|
|
)
|
|
|
|
if units_dir.exists():
|
|
shutil.rmtree(units_dir)
|
|
tmp_units_dir.replace(units_dir)
|
|
return file_count
|