idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -3,8 +3,9 @@ import shutil
|
|||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from pyproj import Transformer
|
||||
from shapely import make_valid, set_precision
|
||||
from shapely import STRtree, make_valid, set_precision
|
||||
from shapely.errors import GEOSException
|
||||
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
||||
from shapely.ops import transform as transform_geometry
|
||||
|
|
@ -41,30 +42,30 @@ def _largest_polygonal(geom) -> Polygon | None:
|
|||
return None
|
||||
|
||||
|
||||
def to_wgs84_geojson(
|
||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
||||
) -> dict | None:
|
||||
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
|
||||
# Output coordinate grid (~0.11 m at UK latitudes). Polygons whose extent is
|
||||
# below this in any direction snap to empty during serialization.
|
||||
_OUTPUT_PRECISION_DEG = 0.000001
|
||||
# Minimal BNG buffer used to rescue sub-grid slivers into a representable
|
||||
# footprint. A near-zero-area Voronoi/INSPIRE spike (e.g. three almost-collinear
|
||||
# vertices) would otherwise vanish at output precision; since every *active*
|
||||
# postcode must keep a boundary (validate_outputs enforces this with zero
|
||||
# tolerance), we fatten it just enough to survive snapping rather than drop it.
|
||||
_MIN_FOOTPRINT_BUFFER_M = 0.5
|
||||
|
||||
|
||||
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
||||
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
|
||||
|
||||
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
|
||||
just the intermediate Shapely object: coordinate snapping during
|
||||
serialization can otherwise leave a self-intersecting ring that only shows up
|
||||
once the feature is read back from disk. Any such geometry is repaired with
|
||||
``make_valid`` before returning so written features are always valid.
|
||||
once the feature is read back from disk. Returns ``None`` if the geometry
|
||||
collapses to empty (a sub-grid sliver).
|
||||
"""
|
||||
geom = _largest_polygonal(geom)
|
||||
if geom is None:
|
||||
return None
|
||||
|
||||
simplified = geom.simplify(tolerance, preserve_topology=True)
|
||||
simplified = _largest_polygonal(simplified)
|
||||
if simplified is None:
|
||||
return None
|
||||
|
||||
transformer = _get_to_wgs84()
|
||||
wgs84 = transform_geometry(transformer.transform, simplified)
|
||||
wgs84 = transform_geometry(transformer.transform, geom_bng)
|
||||
try:
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
wgs84 = set_precision(wgs84, _OUTPUT_PRECISION_DEG, mode="valid_output")
|
||||
except GEOSException:
|
||||
# Precision snapping can fail on pathological geometries; fall back to a
|
||||
# plain validity repair without coordinate snapping.
|
||||
|
|
@ -87,20 +88,105 @@ def to_wgs84_geojson(
|
|||
return geojson_dict
|
||||
|
||||
|
||||
def _rescue_footprint(geom_bng) -> dict | None:
|
||||
"""Fatten a degenerate BNG geometry into a representable footprint and snap."""
|
||||
footprint = _largest_polygonal(geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M))
|
||||
if footprint is None:
|
||||
return None
|
||||
return _snap_to_wgs84_geojson(footprint)
|
||||
|
||||
|
||||
def to_wgs84_geojson(
|
||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
||||
) -> dict | None:
|
||||
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
|
||||
|
||||
A few thousand postcodes reduce to a sub-grid sliver that snaps to empty at
|
||||
output precision. Dropping them would leave an active postcode with no
|
||||
boundary (validate_outputs rejects that with zero tolerance), so instead they
|
||||
are fattened into a minimal footprint at the right location: first by buffering
|
||||
the (often elongated) sliver itself, then -- for fully-degenerate input -- a
|
||||
small disc around ``representative_point()``, which lies inside any non-empty
|
||||
geometry. ``None`` is returned only for a genuinely empty input.
|
||||
"""
|
||||
if geom is None or geom.is_empty:
|
||||
return None
|
||||
|
||||
cleaned = _largest_polygonal(geom)
|
||||
if cleaned is not None:
|
||||
simplified = _largest_polygonal(
|
||||
cleaned.simplify(tolerance, preserve_topology=True)
|
||||
)
|
||||
if simplified is None:
|
||||
simplified = cleaned
|
||||
# Normal path; if snapping erases a thin sliver, fatten its real shape.
|
||||
result = _snap_to_wgs84_geojson(simplified)
|
||||
if result is None:
|
||||
result = _rescue_footprint(simplified)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
# Universal fallback for input too degenerate to clean or fatten in place.
|
||||
return _rescue_footprint(geom.representative_point())
|
||||
|
||||
|
||||
def to_wgs84_geojson_multi(
|
||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
||||
) -> dict | None:
|
||||
"""Convert a (possibly multi-part) postcode geometry to a GeoJSON dict,
|
||||
preserving every part. Each part is simplified/snapped/rescued independently
|
||||
via :func:`to_wgs84_geojson`; the result is a ``Polygon`` for a single part or
|
||||
a ``MultiPolygon`` for several. ``None`` only if every part is degenerate.
|
||||
"""
|
||||
parts = list(geom.geoms) if geom.geom_type == "MultiPolygon" else [geom]
|
||||
part_dicts = [d for part in parts if (d := to_wgs84_geojson(part, tolerance))]
|
||||
if not part_dicts:
|
||||
return None
|
||||
if len(part_dicts) == 1:
|
||||
return part_dicts[0]
|
||||
return {
|
||||
"type": "MultiPolygon",
|
||||
"coordinates": [pd["coordinates"] for pd in part_dicts],
|
||||
}
|
||||
|
||||
|
||||
# Interior holes from the INSPIRE+Voronoi+make_valid chain are small artifacts and
|
||||
# get filled. A hole at least this large is likely a genuinely enclosed postcode
|
||||
# (kept, so we never solidify over a neighbour); the de-overlap pass is the real
|
||||
# guarantee, this is defence-in-depth.
|
||||
_MAX_ARTIFACT_HOLE_AREA = 1000.0
|
||||
|
||||
|
||||
def _fill_small_holes(poly: Polygon) -> Polygon:
|
||||
kept = [r for r in poly.interiors if Polygon(r).area >= _MAX_ARTIFACT_HOLE_AREA]
|
||||
return Polygon(poly.exterior, kept)
|
||||
|
||||
|
||||
def _fill_holes(geom):
|
||||
"""Remove all interior rings (holes) from a polygon or multipolygon."""
|
||||
"""Fill small artifact interior rings; keep large (real-enclosed) holes."""
|
||||
if geom.geom_type == "Polygon":
|
||||
return Polygon(geom.exterior)
|
||||
return _fill_small_holes(geom)
|
||||
elif geom.geom_type == "MultiPolygon":
|
||||
return MultiPolygon([Polygon(p.exterior) for p in geom.geoms])
|
||||
return MultiPolygon([_fill_small_holes(p) for p in geom.geoms])
|
||||
return geom
|
||||
|
||||
|
||||
def _largest_polygon(geom):
|
||||
"""Extract the largest polygon from a MultiPolygon."""
|
||||
if geom.geom_type == "MultiPolygon":
|
||||
return max(geom.geoms, key=lambda g: g.area)
|
||||
return geom
|
||||
# A postcode genuinely split across an OA seam (by a railway, river, or main road
|
||||
# wider than the merge buffer) arrives here as a MultiPolygon. Keeping only the
|
||||
# largest part used to discard the rest, leaving ~1.8% of merged area as uncovered
|
||||
# gaps (often 3000-5000 m² building blocks). Keep every part at least this big;
|
||||
# smaller detached bits are Voronoi/clipping noise and are still dropped.
|
||||
_MIN_DETACHED_PART_AREA = 100.0
|
||||
|
||||
|
||||
def _keep_polygon_parts(geom):
|
||||
"""Keep all MultiPolygon parts >= _MIN_DETACHED_PART_AREA (largest if none)."""
|
||||
if geom.geom_type != "MultiPolygon":
|
||||
return geom
|
||||
parts = [g for g in geom.geoms if g.area >= _MIN_DETACHED_PART_AREA]
|
||||
if not parts:
|
||||
parts = [max(geom.geoms, key=lambda g: g.area)]
|
||||
return parts[0] if len(parts) == 1 else MultiPolygon(parts)
|
||||
|
||||
|
||||
def merge_fragments(
|
||||
|
|
@ -126,14 +212,19 @@ def merge_fragments(
|
|||
continue
|
||||
if not combined.is_valid:
|
||||
combined = make_valid(combined)
|
||||
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
|
||||
# Close tiny gaps between adjacent OA boundary edges (float mismatches).
|
||||
# The closing can erode a tiny MultiPolygon (e.g. a postcode with only a
|
||||
# sliver fragment) to nothing, which would leave the postcode with no
|
||||
# geometry at all — keep the un-closed shape if that happens.
|
||||
if combined.geom_type == "MultiPolygon":
|
||||
combined = combined.buffer(5.0).buffer(-5.0)
|
||||
if not combined.is_valid:
|
||||
combined = make_valid(combined)
|
||||
# Postcodes are contiguous delivery routes — keep only the largest
|
||||
# polygon; small detached fragments are algorithm artifacts
|
||||
combined = _largest_polygon(combined)
|
||||
closed = combined.buffer(5.0).buffer(-5.0)
|
||||
if not closed.is_valid:
|
||||
closed = make_valid(closed)
|
||||
if not closed.is_empty:
|
||||
combined = closed
|
||||
# Keep the postcode whole: the largest part plus any other substantial
|
||||
# part (a genuine railway/river split), dropping only tiny noise slivers.
|
||||
combined = _keep_polygon_parts(combined)
|
||||
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
||||
combined = _fill_holes(combined)
|
||||
# Subtract parks/water if provided
|
||||
|
|
@ -142,7 +233,7 @@ def merge_fragments(
|
|||
|
||||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _largest_polygon(combined)
|
||||
combined = _keep_polygon_parts(combined)
|
||||
# Do NOT _fill_holes here: interior holes carved by the greenspace
|
||||
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
|
||||
# Filling them would re-add the removed area and negate the
|
||||
|
|
@ -155,10 +246,114 @@ def merge_fragments(
|
|||
return merged
|
||||
|
||||
|
||||
def _polygonal(geom):
|
||||
"""Return only the polygonal part(s) of a geometry, or None if none remain."""
|
||||
if geom is None or geom.is_empty:
|
||||
return None
|
||||
if geom.geom_type in ("Polygon", "MultiPolygon"):
|
||||
return geom
|
||||
if geom.geom_type == "GeometryCollection":
|
||||
polys = [
|
||||
g
|
||||
for g in geom.geoms
|
||||
if g.geom_type in ("Polygon", "MultiPolygon") and not g.is_empty
|
||||
]
|
||||
if not polys:
|
||||
return None
|
||||
merged = unary_union(polys)
|
||||
return merged if not merged.is_empty else None
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_overlaps(
|
||||
items: list[tuple[str, Polygon | MultiPolygon]],
|
||||
) -> list[tuple[str, Polygon | MultiPolygon]]:
|
||||
"""Make the postcode polygons a partition: no two cover the same ground.
|
||||
|
||||
Overlap appears at OA seams (the 5m merge buffer expands each postcode
|
||||
independently), from simplifying each postcode on its own, and as genuine
|
||||
containment (a postcode fully enclosed by another). Each postcode is trimmed
|
||||
by the union of its higher-priority overlapping neighbours, where **priority =
|
||||
ascending area**: a smaller postcode wins contested ground. That single rule
|
||||
handles both cases correctly — an enclosed postcode is always smaller than its
|
||||
container, so it keeps its area while the container gets a hole (a `overlaps`
|
||||
query alone would miss containment entirely). Run last, on the final output
|
||||
geometries, so nothing re-introduces overlap afterwards. A postcode that would
|
||||
be emptied keeps its original geometry, so an active postcode is never dropped.
|
||||
"""
|
||||
geoms = [g for _, g in items]
|
||||
n = len(geoms)
|
||||
if n < 2:
|
||||
return items
|
||||
|
||||
# rank[i]: 0 = highest priority (smallest area). Postcode string breaks ties
|
||||
# for determinism.
|
||||
rank = {
|
||||
idx: r
|
||||
for r, idx in enumerate(
|
||||
sorted(range(n), key=lambda i: (geoms[i].area, items[i][0]))
|
||||
)
|
||||
}
|
||||
|
||||
tree = STRtree(geoms)
|
||||
arr = np.array(geoms, dtype=object)
|
||||
pairs: set[tuple[int, int]] = set()
|
||||
# "overlaps" gives partial overlaps; "contains" gives containment (which
|
||||
# "overlaps" excludes) — together they cover every 2-D overlap without the
|
||||
# edge-touch explosion a plain "intersects" query would add.
|
||||
for predicate in ("overlaps", "contains"):
|
||||
qsrc, qtgt = tree.query(arr, predicate=predicate)
|
||||
for s, t in zip(qsrc.tolist(), qtgt.tolist()):
|
||||
if s != t:
|
||||
pairs.add((s, t) if s < t else (t, s))
|
||||
|
||||
# For each loser (lower priority) the higher-priority neighbours to subtract.
|
||||
higher: dict[int, list[int]] = defaultdict(list)
|
||||
for a, b in pairs:
|
||||
winner, loser = (a, b) if rank[a] < rank[b] else (b, a)
|
||||
higher[loser].append(winner)
|
||||
|
||||
out = list(geoms)
|
||||
# Process losers from highest priority down, so every subtracted neighbour is
|
||||
# already finalised.
|
||||
for i in sorted(higher, key=lambda idx: rank[idx]):
|
||||
cut = unary_union([out[j] for j in higher[i]])
|
||||
trimmed = out[i].difference(cut)
|
||||
if not trimmed.is_valid:
|
||||
trimmed = make_valid(trimmed)
|
||||
# Keep all polygonal parts: these geometries are in WGS84 degrees, so an
|
||||
# area threshold here would wrongly drop everything but the largest part
|
||||
# and re-open the very gaps the seam fix closed.
|
||||
trimmed = _polygonal(trimmed)
|
||||
if trimmed is not None and not trimmed.is_empty:
|
||||
out[i] = trimmed
|
||||
return [(pc, out[i]) for i, (pc, _) in enumerate(items)]
|
||||
|
||||
|
||||
def _round_coords(coords, ndigits=6):
|
||||
if coords and isinstance(coords[0], (int, float)):
|
||||
return [round(coords[0], ndigits), round(coords[1], ndigits)]
|
||||
return [_round_coords(c, ndigits) for c in coords]
|
||||
|
||||
|
||||
def _geojson_geometry(geom) -> dict | None:
|
||||
"""Serialize a WGS84 polygon/multipolygon to a 6dp GeoJSON dict, or None."""
|
||||
geom = _polygonal(geom if geom.is_valid else make_valid(geom))
|
||||
if geom is None or geom.is_empty:
|
||||
return None
|
||||
gj = mapping(geom)
|
||||
return {"type": gj["type"], "coordinates": _round_coords(gj["coordinates"])}
|
||||
|
||||
|
||||
def write_district_geojson(
|
||||
postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
|
||||
) -> int:
|
||||
"""Group postcodes by district, write GeoJSON files. Returns file count."""
|
||||
"""Group postcodes by district, write GeoJSON files. Returns file count.
|
||||
|
||||
Before writing, the postcode polygons are converted to their final WGS84 form
|
||||
and made a partition (overlaps removed) so the output never has two postcodes
|
||||
covering the same ground.
|
||||
"""
|
||||
units_dir = output_dir / "units"
|
||||
tmp_units_dir = output_dir / "units.tmp"
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
|
@ -166,38 +361,46 @@ def write_district_geojson(
|
|||
shutil.rmtree(tmp_units_dir)
|
||||
tmp_units_dir.mkdir(parents=True)
|
||||
|
||||
skipped: list[str] = []
|
||||
|
||||
# Pass 1: convert every postcode to its final WGS84 geometry (simplify, snap,
|
||||
# sliver-rescue, multi-part preserved). Sorted → deterministic de-overlap
|
||||
# priority. to_wgs84_geojson_multi returns None only for a genuinely empty
|
||||
# input, which is skipped and reported rather than aborting a multi-hour run.
|
||||
converted: list[tuple[str, Polygon | MultiPolygon]] = []
|
||||
for pc in sorted(postcodes):
|
||||
gj = to_wgs84_geojson_multi(postcodes[pc])
|
||||
if gj is None:
|
||||
skipped.append(pc)
|
||||
continue
|
||||
converted.append((pc, shape(gj)))
|
||||
|
||||
# Remove overlap strips so the output is a clean partition.
|
||||
converted = _resolve_overlaps(converted)
|
||||
|
||||
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
|
||||
for pc, geom in postcodes.items():
|
||||
for pc, geom in converted:
|
||||
parts = pc.split()
|
||||
district = parts[0] if parts else pc[:4]
|
||||
by_district[district].append((pc, geom))
|
||||
|
||||
file_count = 0
|
||||
seen_postcodes: set[str] = set()
|
||||
for district, entries in tqdm(
|
||||
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
|
||||
):
|
||||
features = []
|
||||
for pc, geom in sorted(entries, key=lambda x: x[0]):
|
||||
if pc in seen_postcodes:
|
||||
raise ValueError(f"Duplicate postcode boundary feature: {pc}")
|
||||
seen_postcodes.add(pc)
|
||||
geojson_geom = to_wgs84_geojson(geom)
|
||||
geojson_geom = _geojson_geometry(geom)
|
||||
if geojson_geom is None:
|
||||
raise ValueError(f"Postcode boundary collapsed to empty geometry: {pc}")
|
||||
written_geom = shape(geojson_geom)
|
||||
if written_geom.is_empty or not written_geom.is_valid:
|
||||
raise ValueError(
|
||||
f"Invalid postcode boundary geometry after output: {pc}"
|
||||
)
|
||||
mapit_code = pc.replace(" ", "")
|
||||
skipped.append(pc)
|
||||
continue
|
||||
features.append(
|
||||
{
|
||||
"type": "Feature",
|
||||
"geometry": geojson_geom,
|
||||
"properties": {
|
||||
"postcodes": pc,
|
||||
"mapit_code": mapit_code,
|
||||
"mapit_code": pc.replace(" ", ""),
|
||||
},
|
||||
}
|
||||
)
|
||||
|
|
@ -211,6 +414,14 @@ def write_district_geojson(
|
|||
json.dump(collection, f, separators=(",", ":"))
|
||||
file_count += 1
|
||||
|
||||
if skipped:
|
||||
preview = ", ".join(skipped[:10])
|
||||
suffix = " …" if len(skipped) > 10 else ""
|
||||
print(
|
||||
f" Skipped {len(skipped)} postcode(s) with degenerate (sub-grid) "
|
||||
f"geometry: {preview}{suffix}"
|
||||
)
|
||||
|
||||
if units_dir.exists():
|
||||
shutil.rmtree(units_dir)
|
||||
tmp_units_dir.replace(units_dir)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue