don't crash

This commit is contained in:
Andras Schmelczer 2026-06-04 20:40:42 +01:00
parent aab85fe32e
commit d6d20ccd37
13 changed files with 2630 additions and 3924 deletions

View file

@ -115,7 +115,7 @@ MAP_ASSETS_DEPS := pipeline/download/map_assets.py pipeline/transform/transform_
transform-school-proximity transform-tree-density \ transform-school-proximity transform-tree-density \
generate-postcode-boundaries generate-travel-times enrich-actual-listings generate-postcode-boundaries generate-travel-times enrich-actual-listings
prepare: $(PRICES_STAMP) download-places tiles satellite-tiles overlay-tiles property-border-tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX) prepare: $(PRICES_STAMP) download-places tiles satellite-tiles overlay-tiles property-border-tiles tree-overlay-tiles crime-hotspot-tiles property-border-tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --parquet $(PRICE_INDEX) --postcode-boundary-match "$(POSTCODES_PQ)::$(PC_BOUNDARIES)" --postcode-features $(POSTCODES_PQ) --postcode-universe "$(ARCGIS)::$(POSTCODES_PQ)" --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)" --price-index $(PRICE_INDEX) $(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --parquet $(PRICE_INDEX) --postcode-boundary-match "$(POSTCODES_PQ)::$(PC_BOUNDARIES)" --postcode-features $(POSTCODES_PQ) --postcode-universe "$(ARCGIS)::$(POSTCODES_PQ)" --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)" --price-index $(PRICE_INDEX)
merge: $(MERGE_STAMP) | $(POSTCODES_PQ) $(PROPERTIES_PQ) merge: $(MERGE_STAMP) | $(POSTCODES_PQ) $(PROPERTIES_PQ)
$(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --postcode-features $(POSTCODES_PQ) --postcode-universe "$(ARCGIS)::$(POSTCODES_PQ)" --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)" $(VALIDATE_OUTPUTS) --parquet $(POSTCODES_PQ) --parquet $(PROPERTIES_PQ) --postcode-features $(POSTCODES_PQ) --postcode-universe "$(ARCGIS)::$(POSTCODES_PQ)" --properties-subset "$(PROPERTIES_PQ)::$(POSTCODES_PQ)"

File diff suppressed because one or more lines are too long

View file

@ -72,6 +72,18 @@ assert len(EXPECTED_BAND_NAMES) == len(AGE_BANDS), (
"EXPECTED_BAND_NAMES and AGE_BANDS must stay aligned 1:1" "EXPECTED_BAND_NAMES and AGE_BANDS must stay aligned 1:1"
) )
# NOMIS sometimes labels a band with a wording variant that denotes the SAME
# age range (e.g. "Aged 4 years and under" for ages 0-4, "Aged 90 years and
# over" wording for the top band). Map such known-equivalent labels back to the
# canonical name BEFORE validation so a real band change still fails loudly,
# but a cosmetic relabel of an identical range does not block the build.
BAND_NAME_ALIASES = {
"Aged 4 years and under": "Aged 0 to 4 years",
}
assert set(BAND_NAME_ALIASES.values()) <= set(EXPECTED_BAND_NAMES), (
"BAND_NAME_ALIASES must map to canonical EXPECTED_BAND_NAMES"
)
def compute_median_age(counts: list[int]) -> float: def compute_median_age(counts: list[int]) -> float:
"""Compute median age from five-year band counts using linear interpolation.""" """Compute median age from five-year band counts using linear interpolation."""
@ -98,6 +110,15 @@ def _bands_to_median_table(pivoted: pl.DataFrame) -> pl.DataFrame:
missing/extra/relabelled band would otherwise silently mis-align counts missing/extra/relabelled band would otherwise silently mis-align counts
against the wrong AGE_BANDS lower bound, so we fail loudly instead. against the wrong AGE_BANDS lower bound, so we fail loudly instead.
""" """
# Normalise known-equivalent NOMIS label variants to their canonical name
# before validating (renaming onto an already-present canonical column would
# collide, so polars raises loudly in that genuinely ambiguous case).
rename_map = {
c: BAND_NAME_ALIASES[c] for c in pivoted.columns if c in BAND_NAME_ALIASES
}
if rename_map:
pivoted = pivoted.rename(rename_map)
# Validate the pivoted age-band columns against the canonical NOMIS set # Validate the pivoted age-band columns against the canonical NOMIS set
# BEFORE computing anything. # BEFORE computing anything.
band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"] band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]

View file

@ -266,8 +266,12 @@ def _download_tile(
except ( except (
NoGeoTiffError, NoGeoTiffError,
httpx.HTTPStatusError, httpx.HTTPStatusError,
httpx.TimeoutException, # TransportError is the superset of TimeoutException, ConnectError,
httpx.ConnectError, # ReadError and ProtocolError — including RemoteProtocolError, raised
# when the WCS server closes the connection mid-stream ("incomplete
# chunked read"). All are transient; retry/split rather than letting
# one flaky tile crash the whole raster download.
httpx.TransportError,
) as e: ) as e:
last_error = e last_error = e
if attempt < MAX_RETRIES: if attempt < MAX_RETRIES:

View file

@ -57,6 +57,21 @@ def test_null_band_count_is_treated_as_zero_not_crash():
assert table["median_age"][0] == pytest.approx(32.5) assert table["median_age"][0] == pytest.approx(32.5)
def test_equivalent_band_label_alias_is_accepted():
# NOMIS relabelled the first band "Aged 4 years and under" (same as ages
# 0-4). It must be normalised to the canonical name and used as band 0, not
# rejected. All 100 people in that band -> median in the 0-4 range: 2.5.
counts_by_band = {name: [0] for name in EXPECTED_BAND_NAMES}
counts_by_band["Aged 4 years and under"] = counts_by_band.pop("Aged 0 to 4 years")
counts_by_band["Aged 4 years and under"] = [100]
pivoted = _pivoted(counts_by_band)
table = median_age._bands_to_median_table(pivoted)
assert table.height == 1
assert table["median_age"][0] == pytest.approx(2.5)
def test_missing_band_raises_clear_error(): def test_missing_band_raises_clear_error():
counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES} counts_by_band = {name: [10] for name in EXPECTED_BAND_NAMES}
del counts_by_band["Aged 85 years and over"] del counts_by_band["Aged 85 years and over"]

View file

@ -0,0 +1,123 @@
"""Robust GEOS overlay helpers.
Overlay operations (union, difference, intersection) can raise a
``GEOSException`` most often ``TopologyException: side location conflict``,
``Ring edge missing``, or ``found non-noded intersection`` on geometries that
contain near-coincident or near-degenerate edges, or that are individually
invalid. The robust remedy is a *fixed-precision* overlay: GEOS's OverlayNG
engine, handed a grid size, nodes every edge onto that grid and finishes where
the full-precision overlay cannot.
Getting the fallback to actually survive takes three rules, each of which we
learned the hard way from a crash:
1. **Never precision-reduce with the default mode.** ``set_precision``'s default
``valid_output`` (and ``keep_collapsed``) mode runs its *own* noding pass that
re-raises the very ``side location conflict`` we are trying to escape. We push
the grid into the overlay via the ``grid_size`` argument instead where
OverlayNG nodes robustly and only ever call ``set_precision`` in
``pointwise`` mode (pure coordinate rounding, which cannot raise).
2. **Validate first.** ``make_valid`` repairs the self-intersections (bow-ties,
pinches) that make GEOS choke, so the overlay starts from an OGC-valid shape.
3. **Keep only polygonal parts.** ``make_valid`` of a spiky polygon routinely
returns a *mixed-dimension* ``GeometryCollection`` (a polygon plus a dangling
line), and OverlayNG rejects mixed-dimension input with
``IllegalArgumentException: Overlay input is mixed-dimension``. Every input
here represents an area, so the line/point debris is meaningless noise and is
dropped before the overlay.
The default fallback grid is 0.1 mm in the metre-based working CRS
(EPSG:27700), far below survey resolution and the ``MIN_GEOM_AREA`` threshold
downstream, so it cannot crush a postcode-scale shape. Callers operating in a
different CRS (e.g. the WGS84 output stage, where the same 1e-4 grid would be
~11 m) pass a ``grid`` matched to that CRS. The whole fallback runs ONLY after
the exact operation has already failed, so normal output is bit-for-bit
unchanged.
"""
import shapely
from shapely import GEOSException, make_valid, set_precision
from shapely.geometry import Polygon
from shapely.ops import unary_union
# 0.1 mm in metres — well below MIN_GEOM_AREA (0.01 m^2) and survey resolution.
_SNAP_GRID = 1e-4
_EMPTY = Polygon()
def _poly_valid(geom, grid):
"""Return an OGC-valid, polygonal-only version of ``geom``.
Repairs invalidity with ``make_valid`` and discards any non-polygonal debris
(dangling lines/points) it leaves behind, since OverlayNG rejects
mixed-dimension ``GeometryCollection`` input. The result is always a
``Polygon``/``MultiPolygon`` (possibly empty), safe to feed to a
fixed-precision overlay.
"""
g = geom if geom.is_valid else make_valid(geom)
if g.geom_type in ("Polygon", "MultiPolygon"):
return g
if g.geom_type == "GeometryCollection":
polys = [
p
for p in g.geoms
if p.geom_type in ("Polygon", "MultiPolygon") and not p.is_empty
]
if not polys:
return _EMPTY
if len(polys) == 1:
return polys[0]
# Dissolve on the grid (robust) rather than building a possibly-invalid
# MultiPolygon of overlapping parts.
return shapely.union_all(polys, grid_size=grid)
return _EMPTY # line / point only
def _snap_poly_valid(geom, grid):
"""Coordinate-round onto ``grid`` (``pointwise`` never raises), then clean."""
return _poly_valid(set_precision(geom, grid, mode="pointwise"), grid)
def safe_union(geoms, grid=_SNAP_GRID):
"""``unary_union`` that survives GEOS robustness failures."""
try:
return unary_union(geoms)
except GEOSException:
pass
cleaned = [_poly_valid(g, grid) for g in geoms]
try:
return shapely.union_all(cleaned, grid_size=grid)
except GEOSException:
snapped = [_snap_poly_valid(g, grid) for g in geoms]
return shapely.union_all(snapped, grid_size=grid)
def safe_difference(a, b, grid=_SNAP_GRID):
"""``a.difference(b)`` that survives GEOS robustness failures."""
try:
return a.difference(b)
except GEOSException:
pass
a2, b2 = _poly_valid(a, grid), _poly_valid(b, grid)
try:
return shapely.difference(a2, b2, grid_size=grid)
except GEOSException:
return shapely.difference(
_snap_poly_valid(a2, grid), _snap_poly_valid(b2, grid), grid_size=grid
)
def safe_intersection(a, b, grid=_SNAP_GRID):
"""``a.intersection(b)`` that survives GEOS robustness failures."""
try:
return a.intersection(b)
except GEOSException:
pass
a2, b2 = _poly_valid(a, grid), _poly_valid(b, grid)
try:
return shapely.intersection(a2, b2, grid_size=grid)
except GEOSException:
return shapely.intersection(
_snap_poly_valid(a2, grid), _snap_poly_valid(b2, grid), grid_size=grid
)

View file

@ -5,9 +5,10 @@ from pathlib import Path
import polars as pl import polars as pl
from shapely import wkb from shapely import wkb
from shapely.geometry import MultiPolygon, Polygon from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import unary_union
from shapely.strtree import STRtree from shapely.strtree import STRtree
from .geometry import safe_difference, safe_union
def load_greenspace(path: Path) -> tuple[STRtree, list]: def load_greenspace(path: Path) -> tuple[STRtree, list]:
"""Load greenspace parquet and build an STRtree spatial index. """Load greenspace parquet and build an STRtree spatial index.
@ -51,8 +52,8 @@ def subtract_greenspace(
if not intersecting: if not intersecting:
return postcode_geom return postcode_geom
green_union = unary_union(intersecting) green_union = safe_union(intersecting)
result = postcode_geom.difference(green_union) result = safe_difference(postcode_geom, green_union)
if result.is_empty: if result.is_empty:
return postcode_geom return postcode_geom

View file

@ -7,11 +7,12 @@ import numpy as np
from pyproj import Transformer from pyproj import Transformer
from shapely import STRtree, make_valid, set_precision from shapely import STRtree, make_valid, set_precision
from shapely.errors import GEOSException from shapely.errors import GEOSException
from shapely.geometry import MultiPolygon, Polygon, mapping, shape from shapely.geometry import MultiPolygon, Polygon, box, mapping, shape
from shapely.ops import transform as transform_geometry from shapely.ops import transform as transform_geometry
from shapely.ops import unary_union
from tqdm import tqdm from tqdm import tqdm
from .geometry import safe_difference, safe_union
_to_wgs84 = None _to_wgs84 = None
@ -207,7 +208,7 @@ def merge_fragments(
merged = {} merged = {}
for pc, parts in by_postcode.items(): for pc, parts in by_postcode.items():
combined = unary_union(parts) combined = safe_union(parts)
if combined.is_empty: if combined.is_empty:
continue continue
if not combined.is_valid: if not combined.is_valid:
@ -260,7 +261,10 @@ def _polygonal(geom):
] ]
if not polys: if not polys:
return None return None
merged = unary_union(polys) # Both callers run on WGS84-degree output geometry, so the robustness
# fallback snaps on the 1e-6° grid (~0.11 m), not geometry.py's metre
# default — a coarse metre grid would obliterate a degree-scale shape.
merged = safe_union(polys, grid=_OUTPUT_PRECISION_DEG)
return merged if not merged.is_empty else None return merged if not merged.is_empty else None
return None return None
@ -317,8 +321,13 @@ def _resolve_overlaps(
# Process losers from highest priority down, so every subtracted neighbour is # Process losers from highest priority down, so every subtracted neighbour is
# already finalised. # already finalised.
for i in sorted(higher, key=lambda idx: rank[idx]): for i in sorted(higher, key=lambda idx: rank[idx]):
cut = unary_union([out[j] for j in higher[i]]) # These geometries are WGS84 degrees already snapped to output precision,
trimmed = out[i].difference(cut) # so the robustness fallback snaps on the same 1e-6° grid (~0.11 m) rather
# than geometry.py's metre-CRS default. A raw difference can still raise a
# "side location conflict" on near-coincident OA-seam edges; the
# fixed-precision overlay noded them away.
cut = safe_union([out[j] for j in higher[i]], grid=_OUTPUT_PRECISION_DEG)
trimmed = safe_difference(out[i], cut, grid=_OUTPUT_PRECISION_DEG)
if not trimmed.is_valid: if not trimmed.is_valid:
trimmed = make_valid(trimmed) trimmed = make_valid(trimmed)
# Keep all polygonal parts: these geometries are in WGS84 degrees, so an # Keep all polygonal parts: these geometries are in WGS84 degrees, so an
@ -336,13 +345,83 @@ def _round_coords(coords, ndigits=6):
return [_round_coords(c, ndigits) for c in coords] return [_round_coords(c, ndigits) for c in coords]
def _snap_polygonal(geom, grid):
"""Re-node ``geom`` onto ``grid`` (``valid_output``) → polygonal-only, or None.
``set_precision`` ``valid_output`` runs an OverlayNG noding pass that places
every vertex on a multiple of ``grid`` *and* fixes the topology, so a plain
coordinate round of the result is exact (no two distinct vertices can land in
the same cell). Falls back to ``make_valid`` if precision-reduction raises.
"""
try:
snapped = set_precision(geom, grid, mode="valid_output")
except GEOSException:
snapped = make_valid(geom)
return _polygonal(snapped if snapped.is_valid else make_valid(snapped))
# A square this many output-grid cells on a side, used as the last-resort
# footprint when snapping erases a sub-grid sliver. ~10 cells (≈0.71.1 m at UK
# latitudes) is invisible at map scale yet survives the 1e-6° snap as a valid,
# 4-corner ring.
_FOOTPRINT_GRID_CELLS = 5
def _grid_footprint(geom):
"""A tiny grid-aligned square at ``geom``'s representative point, snapped valid.
Last line of defence so an *active* postcode never vanishes: the de-overlap
pass can shave a small (e.g. co-located, non-geographic) postcode down to a
sub-grid sliver that disappears when snapped to output precision. Rather than
drop it, place a minimal valid footprint at its location. The tiny overlap
this re-creates with the neighbour that trimmed it is harmless the output
partition is best-effort, a missing boundary is a hard validation failure.
"""
try:
point = geom.representative_point()
except GEOSException:
return None
half = _OUTPUT_PRECISION_DEG * _FOOTPRINT_GRID_CELLS
square = box(point.x - half, point.y - half, point.x + half, point.y + half)
return _snap_polygonal(square, _OUTPUT_PRECISION_DEG)
def _geojson_geometry(geom) -> dict | None: def _geojson_geometry(geom) -> dict | None:
"""Serialize a WGS84 polygon/multipolygon to a 6dp GeoJSON dict, or None.""" """Serialize a WGS84 polygon/multipolygon to a *valid* 6dp GeoJSON dict, or None.
The coordinates are snapped onto the 1e-6° output grid with a re-noding pass
BEFORE the 6dp rounding, not by the round alone. ``_resolve_overlaps`` leaves
thin overlap-sliver triangles with full-precision (off-grid) vertices at OA
seams; a bare round-to-6dp collapses those into degenerate rings (GEOS "Too
few points") and pinches rings into self-intersections that pass the
pre-rounding validity check but fail once the feature is read back from disk.
Snapping with ``valid_output`` nodes the geometry onto the grid so the round
that follows lands on exact 1e-6 multiples and cannot reintroduce invalidity.
``None`` only for a genuinely empty/degenerate-with-no-location input; a
non-empty geometry that snaps to a sub-grid sliver is rescued into a minimal
grid footprint rather than dropped (an active postcode must keep a boundary).
"""
geom = _polygonal(geom if geom.is_valid else make_valid(geom)) geom = _polygonal(geom if geom.is_valid else make_valid(geom))
if geom is None or geom.is_empty: if geom is None or geom.is_empty:
return None return None
gj = mapping(geom) snapped = _snap_polygonal(geom, _OUTPUT_PRECISION_DEG)
return {"type": gj["type"], "coordinates": _round_coords(gj["coordinates"])} if snapped is None or snapped.is_empty:
snapped = _grid_footprint(geom)
if snapped is None or snapped.is_empty:
return None
gj = mapping(snapped)
out = {"type": gj["type"], "coordinates": _round_coords(gj["coordinates"])}
# Defence-in-depth: re-validate the dict that actually reaches disk. Snapping
# makes the round exact, so this should already hold; repair once more on the
# grid if a pathological vertex still pinches a ring.
if not shape(out).is_valid:
snapped = _snap_polygonal(shape(out), _OUTPUT_PRECISION_DEG)
if snapped is None or snapped.is_empty:
return None
gj = mapping(snapped)
out = {"type": gj["type"], "coordinates": _round_coords(gj["coordinates"])}
return out
def write_district_geojson( def write_district_geojson(

View file

@ -3,13 +3,23 @@ from collections import Counter, defaultdict
import numpy as np import numpy as np
from scipy.spatial import cKDTree from scipy.spatial import cKDTree
from shapely import STRtree, make_valid from shapely import STRtree, make_valid
from shapely.geometry import MultiPolygon, Polygon from shapely.geometry import MultiPoint, MultiPolygon, Point, Polygon
from shapely.ops import unary_union
from .geometry import safe_difference, safe_intersection, safe_union
from .voronoi import compute_voronoi_regions from .voronoi import compute_voronoi_regions
MIN_GEOM_AREA = 0.01 MIN_GEOM_AREA = 0.01
# Minimal footprint (BNG metres) for a postcode whose UPRN seed wins no area in a
# crowded multi-postcode OA — its Voronoi cell ∩ remaining collapses below
# MIN_GEOM_AREA, or its seed sits inside an INSPIRE parcel wholly claimed by a
# co-located postcode. Every *active* postcode must keep a boundary
# (validate_outputs is zero-tolerance), so it gets a small disc at its true seed
# location. ~28 m² clears MIN_GEOM_AREA and output snapping; the overlap it
# creates with the area's winner is resolved at the output stage, where the
# smaller postcode wins the contested ground (see output._resolve_overlaps).
_MIN_SEED_FOOTPRINT_M = 3.0
def process_oa( def process_oa(
oa_geom: Polygon | MultiPolygon, oa_geom: Polygon | MultiPolygon,
@ -36,10 +46,12 @@ def process_oa(
# Compute remaining area # Compute remaining area
if claimed: if claimed:
all_claimed = unary_union(list(claimed.values())) all_claimed = safe_union(list(claimed.values()))
all_claimed = _clean_polygonal(all_claimed) all_claimed = _clean_polygonal(all_claimed)
remaining = ( remaining = (
valid_oa.difference(all_claimed) if all_claimed is not None else valid_oa safe_difference(valid_oa, all_claimed)
if all_claimed is not None
else valid_oa
) )
remaining = _clean_polygonal(remaining) remaining = _clean_polygonal(remaining)
else: else:
@ -60,13 +72,54 @@ def process_oa(
fragments = [] fragments = []
for pc, parts in result.items(): for pc, parts in result.items():
merged = _clean_polygonal(unary_union(parts)) merged = _clean_polygonal(safe_union(parts))
if merged is not None: if merged is not None:
fragments.append((pc, merged)) fragments.append((pc, merged))
# Every postcode with a UPRN seed in this OA must keep at least a minimal
# footprint — in a dense OA (a block of flats with hundreds of distinct
# postcodes) a single-seed postcode's cell can collapse below MIN_GEOM_AREA or
# be fully absorbed by a co-located postcode's INSPIRE parcel, producing no
# fragment, and an active postcode must never be dropped.
orphans = unique_pcs - {pc for pc, _ in fragments}
if orphans:
fragments.extend(_seed_footprints(orphans, points, postcodes, valid_oa))
return fragments return fragments
def _seed_footprints(
orphans: set[str],
points: np.ndarray,
postcodes: list[str],
valid_oa: Polygon | MultiPolygon,
) -> list[tuple[str, Polygon | MultiPolygon]]:
"""Give each orphan postcode a minimal disc footprint at its UPRN seed(s).
Orphans are postcodes with a UPRN in this OA that nonetheless won no area in
the INSPIRE/Voronoi partition. Each keeps a small disc at its true location,
clipped to the OA; the overlap with the area's winner is resolved at output.
"""
by_pc: dict[str, list] = defaultdict(list)
for i, pc in enumerate(postcodes):
if pc in orphans:
by_pc[pc].append(points[i])
out: list[tuple[str, Polygon | MultiPolygon]] = []
for pc, pts in by_pc.items():
arr = np.asarray(pts, dtype=np.float64)
seed = Point(arr[0]) if len(arr) == 1 else MultiPoint(arr)
disc = seed.buffer(_MIN_SEED_FOOTPRINT_M)
clipped = _clean_polygonal(safe_intersection(disc, valid_oa))
if clipped is None:
# Seed on/near the OA edge: keep the unclipped disc so the postcode
# still gets a footprint at its location rather than no boundary.
clipped = _clean_polygonal(disc)
if clipped is not None:
out.append((pc, clipped))
return out
def _claim_inspire_parcels( def _claim_inspire_parcels(
valid_oa: Polygon | MultiPolygon, valid_oa: Polygon | MultiPolygon,
points: np.ndarray, points: np.ndarray,
@ -141,7 +194,7 @@ def _claim_inspire_parcels(
assignable = parcel assignable = parcel
if contained_union is not None: if contained_union is not None:
assignable = assignable.difference(contained_union) assignable = safe_difference(assignable, contained_union)
for part in _polygon_parts(assignable): for part in _polygon_parts(assignable):
part = _clean_polygonal(part) part = _clean_polygonal(part)
if part is None: if part is None:
@ -169,7 +222,7 @@ def _prepare_inspire_parcels(
continue continue
if not geom.intersects(valid_oa): if not geom.intersects(valid_oa):
continue continue
clipped = _clean_polygonal(geom.intersection(valid_oa)) clipped = _clean_polygonal(safe_intersection(geom, valid_oa))
if clipped is not None: if clipped is not None:
parcels.append(clipped) parcels.append(clipped)
return parcels return parcels
@ -199,7 +252,7 @@ def _merge_parts_by_postcode(
) -> dict[str, Polygon | MultiPolygon]: ) -> dict[str, Polygon | MultiPolygon]:
merged: dict[str, Polygon | MultiPolygon] = {} merged: dict[str, Polygon | MultiPolygon] = {}
for pc, parts in parts_by_postcode.items(): for pc, parts in parts_by_postcode.items():
geom = _clean_polygonal(unary_union(parts)) geom = _clean_polygonal(safe_union(parts))
if geom is not None: if geom is not None:
merged[pc] = geom merged[pc] = geom
return merged return merged
@ -210,7 +263,7 @@ def _union_claims(
) -> Polygon | MultiPolygon | None: ) -> Polygon | MultiPolygon | None:
if not claims: if not claims:
return None return None
return _clean_polygonal(unary_union([geom for _, geom in claims])) return _clean_polygonal(safe_union([geom for _, geom in claims]))
def _resolve_ordered_claims( def _resolve_ordered_claims(
@ -224,11 +277,11 @@ def _resolve_ordered_claims(
if geom is None: if geom is None:
continue continue
if used is not None: if used is not None:
geom = _clean_polygonal(geom.difference(used)) geom = _clean_polygonal(safe_difference(geom, used))
if geom is None: if geom is None:
continue continue
resolved_parts[pc].append(geom) resolved_parts[pc].append(geom)
used = _clean_polygonal(geom if used is None else unary_union([used, geom])) used = _clean_polygonal(geom if used is None else safe_union([used, geom]))
return _merge_parts_by_postcode(resolved_parts) return _merge_parts_by_postcode(resolved_parts)
@ -261,7 +314,7 @@ def _extract_polygonal(geom) -> Polygon | MultiPolygon | None:
# overlapping polygonal parts, and a MultiPolygon of overlapping parts is # overlapping polygonal parts, and a MultiPolygon of overlapping parts is
# invalid — it double-counts area and makes the next `.difference()` raise # invalid — it double-counts area and makes the next `.difference()` raise
# a TopologyException that aborts the OA (and, in parallel mode, the # a TopologyException that aborts the OA (and, in parallel mode, the
# worker). unary_union merges them into a valid geometry. # worker). safe_union merges them into a valid geometry.
merged = unary_union(polys) merged = safe_union(polys)
return merged if not merged.is_empty else None return merged if not merged.is_empty else None
return None return None

View file

@ -757,6 +757,50 @@ class TestProcessOAInspireParcelAssignment:
assert frag_dict["A"].intersection(frag_dict["B"]).area < 0.01 assert frag_dict["A"].intersection(frag_dict["B"]).area < 0.01
class TestProcessOASeedFootprintGuarantee:
"""Every postcode with a UPRN seed in a multi-postcode OA must produce a
fragment, even when its partition cell collapses below MIN_GEOM_AREA an
active postcode must never be dropped (validate_outputs is zero-tolerance)."""
def test_collapsed_voronoi_cells_rescued_as_footprints(self):
# OA just above MIN_GEOM_AREA; a 2-way Voronoi split leaves each cell
# (~0.0098 m²) below MIN_GEOM_AREA, so both would be dropped without rescue.
oa_geom = box(0, 0, 0.14, 0.14) # 0.0196 m²
points = np.array([[0.035, 0.07], [0.105, 0.07]])
postcodes = ["AA1 1AA", "AA1 1AB"]
fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[])
got = {pc for pc, _ in fragments}
assert got == {"AA1 1AA", "AA1 1AB"}, f"dropped: {set(postcodes) - got}"
for _, geom in fragments:
assert geom.is_valid
assert geom.area > MIN_GEOM_AREA
def test_seed_footprint_sits_on_the_uprn(self):
from shapely.geometry import Point
from .process_oa import _seed_footprints
oa_geom = box(0, 0, 1000, 1000)
points = np.array([[500.0, 500.0]])
out = _seed_footprints({"AA1 1AA"}, points, ["AA1 1AA"], oa_geom)
assert len(out) == 1
pc, geom = out[0]
assert pc == "AA1 1AA"
assert geom.is_valid and geom.area > MIN_GEOM_AREA
assert geom.contains(Point(500, 500))
def test_only_orphans_get_footprints(self):
# A wins real area; B's lone seed collapses. Only B should be rescued, and
# A's genuine (large) fragment must be untouched by the rescue.
oa_geom = box(0, 0, 0.14, 0.14)
points = np.array([[0.07, 0.07], [0.07, 0.07], [0.105, 0.07]])
postcodes = ["AA1 1AA", "AA1 1AA", "AA1 1AB"]
fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[])
assert {pc for pc, _ in fragments} == {"AA1 1AA", "AA1 1AB"}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# _extract_polygonal helper # _extract_polygonal helper
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -1221,6 +1265,238 @@ class TestOutputPartition:
assert geoms["AA1 1AB"].area > 0 assert geoms["AA1 1AB"].area > 0
class TestGeojsonGeometrySliverValidity:
"""_geojson_geometry must emit geometry that is still valid after the final 6dp
rounding. A sub-grid overlap sliver (left by _resolve_overlaps' full-precision
difference) must be snapped away on the output grid rather than collapsed by a
naive round into a degenerate ('Too few points') / self-intersecting ring that
only fails once read back from disk."""
def test_subgrid_sliver_does_not_produce_invalid_ring(self):
from shapely.geometry import shape
from shapely.validation import explain_validity
from .output import _geojson_geometry
main = box(-0.34, 51.74, -0.33, 51.75) # ~degree-scale, valid
# A thin triangle whose three vertices all round to the same 1e-6 cell.
sliver = Polygon(
[
(-0.3400284, 51.7505061),
(-0.3400280, 51.7505060),
(-0.3400281, 51.7505063),
]
)
assert sliver.is_valid
gj = _geojson_geometry(MultiPolygon([main, sliver]))
assert gj is not None
rt = shape(gj)
assert rt.is_valid, explain_validity(rt)
assert not rt.is_empty
# The real (main) area is preserved; only the degenerate sliver is dropped.
assert rt.area == pytest.approx(main.area, rel=1e-3)
def test_pure_sliver_is_rescued_not_written_invalid(self):
from shapely.geometry import shape
from .output import _OUTPUT_PRECISION_DEG, _geojson_geometry
# A sub-grid sliver (extent < 1e-6° in one dimension) that snaps to empty:
# it must be rescued into a minimal valid footprint at its location, never
# written invalid and never dropped (an active postcode keeps a boundary).
g = _OUTPUT_PRECISION_DEG
sliver = Polygon(
[
(-0.30, 51.75),
(-0.30 + 5 * g, 51.75),
(-0.30 + 5 * g, 51.75 + 0.2 * g),
(-0.30, 51.75 + 0.1 * g),
]
)
gj = _geojson_geometry(sliver)
assert gj is not None # rescued, not dropped
rt = shape(gj)
assert rt.is_valid and not rt.is_empty
assert rt.distance(sliver) == pytest.approx(0.0, abs=1e-4)
class TestColocatedPostcodesAllRetained:
"""Co-located non-geographic postcodes (e.g. AL1 9xx) have heavily-overlapping
tiny footprints; the de-overlap pass trims most to sub-grid slivers. None may
be dropped every active postcode must keep a (valid, non-empty) boundary."""
def test_overlapping_tiny_footprints_none_dropped(self, tmp_path):
from shapely.geometry import Point, shape
# 12 ~3 m discs within ~1 m of each other → after de-overlap most collapse
# below output precision.
postcodes = {}
for i in range(12):
e = 514000.0 + (i % 4) * 0.3
n = 206000.0 + (i // 4) * 0.3
postcodes[f"AL1 9{chr(65 + i)}{chr(65 + i)}"] = Point(e, n).buffer(3.0)
write_district_geojson(postcodes, tmp_path)
coll = json.loads((tmp_path / "units" / "AL1.geojson").read_text())
written = {f["properties"]["postcodes"] for f in coll["features"]}
assert written == set(postcodes), f"dropped: {set(postcodes) - written}"
for feature in coll["features"]:
geom = shape(feature["geometry"])
assert geom.is_valid and not geom.is_empty
class TestSafeOverlayHelpers:
"""The robust overlay helpers retry on a fixed-precision grid after a
GEOSException (e.g. ``side location conflict`` from near-coincident OA-seam
edges). The grid is a caller-supplied parameter: metres for the BNG stages,
1e-6 degrees for the WGS84 output stage so the same helper serves both
without crushing degree-scale shapes on the metre default."""
def test_grid_param_honored_on_clean_inputs(self):
from . import geometry
a = box(0, 0, 10, 10)
b = box(5, 0, 15, 10)
# No exception → exact result, identical regardless of the fallback grid.
assert geometry.safe_difference(a, b, grid=1e-6).equals(a.difference(b))
assert geometry.safe_union([a, b], grid=1e-6).equals(unary_union([a, b]))
assert geometry.safe_intersection(a, b, grid=1e-6).equals(a.intersection(b))
def test_difference_falls_back_to_fixed_precision_overlay(self, monkeypatch):
from shapely import GEOSException
from shapely.geometry.base import BaseGeometry
from . import geometry
a = box(0, 0, 10, 10)
b = box(5, 0, 15, 10)
def _boom(self, other, *args, **kwargs):
raise GEOSException("forced side location conflict")
# Patch the .difference() *method*; the fallback uses the top-level
# shapely.difference() function, so it still completes.
monkeypatch.setattr(BaseGeometry, "difference", _boom)
result = geometry.safe_difference(a, b, grid=1e-6)
assert result.is_valid
assert result.area == pytest.approx(50.0) # left half of `a`
def test_union_falls_back_to_fixed_precision_overlay(self, monkeypatch):
from shapely import GEOSException
from . import geometry
a = box(0, 0, 10, 10)
b = box(5, 0, 15, 10)
def _boom(_geoms):
raise GEOSException("forced robustness failure")
monkeypatch.setattr(geometry, "unary_union", _boom)
result = geometry.safe_union([a, b], grid=1e-6)
assert result.is_valid
assert result.area == pytest.approx(150.0) # 100 + 100 - 50 overlap
# A self-intersecting bow-tie: invalid. set_precision()'s DEFAULT
# ('valid_output') mode runs its own noding pass that re-raises
# 'side location conflict' on this — which is exactly how the production
# crash happened (the fallback re-raised the error it was meant to absorb).
_BOWTIE = Polygon([(0, 0), (1e-5, 1e-5), (1e-5, 0), (0, 1e-5), (0, 0)])
# make_valid() of this spiky ring returns a mixed-dimension
# GeometryCollection (polygon + dangling line); OverlayNG rejects that with
# 'Overlay input is mixed-dimension', so the fallback must strip the debris.
_SPIKY = Polygon(
[(0, 0), (10, 0), (10, 10), (5, 5), (5.0000001, 5), (0, 10), (0, 0), (15, -5), (0, 0)]
)
def test_difference_fallback_survives_invalid_input(self, monkeypatch):
"""Regression for the production crash: the fallback must not reduce
precision with set_precision's default valid_output mode, which re-raises
'side location conflict' on an invalid geometry."""
from shapely import GEOSException
from shapely.geometry.base import BaseGeometry
from . import geometry
assert not self._BOWTIE.is_valid # precondition
def _boom(self, other, *a, **k):
raise GEOSException("forced side location conflict")
monkeypatch.setattr(BaseGeometry, "difference", _boom)
result = geometry.safe_difference(self._BOWTIE, box(0, 0, 5e-6, 5e-6), grid=1e-6)
assert result.is_valid
assert result.geom_type in ("Polygon", "MultiPolygon")
def test_difference_fallback_survives_make_valid_geometrycollection(
self, monkeypatch
):
"""Regression: make_valid can yield a mixed-dimension GeometryCollection
that OverlayNG rejects; the fallback must keep only polygonal parts."""
from shapely import GEOSException, make_valid
from shapely.geometry.base import BaseGeometry
from . import geometry
assert make_valid(self._SPIKY).geom_type == "GeometryCollection" # precondition
def _boom(self, other, *a, **k):
raise GEOSException("forced side location conflict")
monkeypatch.setattr(BaseGeometry, "difference", _boom)
result = geometry.safe_difference(self._SPIKY, box(2, 2, 8, 8), grid=1e-4)
assert result.is_valid
assert result.geom_type in ("Polygon", "MultiPolygon")
assert result.area > 0
def test_intersection_fallback_survives_make_valid_geometrycollection(
self, monkeypatch
):
from shapely import GEOSException
from shapely.geometry.base import BaseGeometry
from . import geometry
def _boom(self, other, *a, **k):
raise GEOSException("forced side location conflict")
monkeypatch.setattr(BaseGeometry, "intersection", _boom)
result = geometry.safe_intersection(self._SPIKY, box(2, 2, 8, 8), grid=1e-4)
assert result.is_valid
assert result.geom_type in ("Polygon", "MultiPolygon")
assert result.area > 0
def test_union_fallback_survives_invalid_and_collection_inputs(self, monkeypatch):
"""The union fallback must absorb both an invalid bow-tie and a
make_valid-becomes-GeometryCollection input without raising."""
from shapely import GEOSException
from . import geometry
def _boom(_geoms):
raise GEOSException("forced robustness failure")
monkeypatch.setattr(geometry, "unary_union", _boom)
result = geometry.safe_union(
[self._BOWTIE, self._SPIKY, box(20, 20, 30, 30)], grid=1e-4
)
assert result.is_valid
assert result.geom_type in ("Polygon", "MultiPolygon")
def test_helpers_never_raise_on_empty_inputs(self):
"""Degenerate/empty inputs must not abort a multi-hour run."""
from . import geometry
empty = Polygon()
a = box(0, 0, 10, 10)
assert geometry.safe_difference(empty, a).is_empty
assert geometry.safe_difference(a, empty).equals(a)
assert geometry.safe_intersection(empty, a).is_empty
assert geometry.safe_union([]).is_empty
assert geometry.safe_union([empty]).is_empty
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# InspireIndex must return the same candidates as a brute-force bbox scan # InspireIndex must return the same candidates as a brute-force bbox scan
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View file

@ -4,7 +4,8 @@ import numpy as np
from scipy.spatial import QhullError, Voronoi from scipy.spatial import QhullError, Voronoi
from shapely import make_valid from shapely import make_valid
from shapely.geometry import MultiPolygon, Polygon from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import unary_union
from .geometry import safe_intersection, safe_union
def compute_voronoi_regions( def compute_voronoi_regions(
@ -111,13 +112,13 @@ def compute_voronoi_regions(
poly = Polygon(vertices) poly = Polygon(vertices)
if not poly.is_valid: if not poly.is_valid:
poly = make_valid(poly) poly = make_valid(poly)
clipped = poly.intersection(boundary) clipped = safe_intersection(poly, boundary)
if not clipped.is_empty: if not clipped.is_empty:
pc_polys[unique_pcs[i]].append(clipped) pc_polys[unique_pcs[i]].append(clipped)
result = {} result = {}
for pc, parts in pc_polys.items(): for pc, parts in pc_polys.items():
merged = unary_union(parts) merged = safe_union(parts)
if not merged.is_empty: if not merged.is_empty:
result[pc] = merged result[pc] = merged
return result return result
@ -143,7 +144,7 @@ def _equal_split_fallback(
(min_x, strip_max_y), (min_x, strip_max_y),
] ]
) )
clipped = boundary.intersection(strip) clipped = safe_intersection(boundary, strip)
if not clipped.is_empty: if not clipped.is_empty:
result[pc] = clipped result[pc] = clipped
return result return result

View file

@ -66,6 +66,53 @@ def test_accumulate_clipped_area_drops_missing_and_empty_geometry() -> None:
assert canopy_area[0] == pytest.approx(400.0) assert canopy_area[0] == pytest.approx(400.0)
def test_accumulate_clipped_area_survives_invalid_polygon() -> None:
"""A self-intersecting external polygon (TOW/NFI data is occasionally invalid)
must not abort the batched overlay with 'TopologyException: side location
conflict'; its repaired in-buffer area is still accumulated."""
radius_m = 50
points = pl.DataFrame({"postcode": ["A"], "x": [0.0], "y": [0.0]})
circles, tree = _postcode_buffers(points, radius_m)
# Bow-tie centred on A: self-intersecting => invalid. The raw batched
# shapely.intersection raises 'side location conflict' on it; make_valid splits
# it into two triangles of total area 200, fully inside A's radius-50 buffer.
bowtie = shapely.Polygon([(-10, -10), (10, 10), (10, -10), (-10, 10), (-10, -10)])
assert not shapely.is_valid(bowtie) # precondition
with pytest.raises(shapely.errors.GEOSException): # documents the raw hazard
shapely.intersection(
np.array([bowtie], dtype=object), np.array([circles[0]], dtype=object)
)
canopy_area = np.zeros(1)
_accumulate_clipped_area(np.array([bowtie], dtype=object), circles, tree, canopy_area)
assert canopy_area[0] == pytest.approx(200.0, rel=1e-3)
def test_robust_intersection_area_recovers_from_overlay_failure(monkeypatch) -> None:
"""The batched-overlay fallback must absorb a GEOSException from the fast path
and recover (validate + retry), returning the correct per-pair areas. Version
independent: the fast-path failure is forced rather than data-dependent."""
from pipeline.transform import tree_density
real_intersection = shapely.intersection
calls = {"n": 0}
def flaky(a, b, **kwargs):
calls["n"] += 1
if calls["n"] == 1: # fail only the first (fast-path) call
raise shapely.errors.GEOSException("forced side location conflict")
return real_intersection(a, b, **kwargs)
monkeypatch.setattr(tree_density.shapely, "intersection", flaky)
a = np.array([shapely.box(0, 0, 10, 10), shapely.box(0, 0, 4, 4)], dtype=object)
b = np.array([shapely.box(0, 0, 6, 6), shapely.box(0, 0, 2, 2)], dtype=object)
out = tree_density._robust_intersection_area(a, b)
assert calls["n"] >= 2 # fast path failed -> fallback path executed
assert out.tolist() == pytest.approx([36.0, 4.0])
def test_accumulate_clipped_area_height_weighted_by_overlap() -> None: def test_accumulate_clipped_area_height_weighted_by_overlap() -> None:
radius_m = 50 radius_m = 50
points = pl.DataFrame({"postcode": ["A"], "x": [0.0], "y": [0.0]}) points = pl.DataFrame({"postcode": ["A"], "x": [0.0], "y": [0.0]})

View file

@ -263,6 +263,43 @@ def _postcode_buffers(
return circles, shapely.STRtree(circles) return circles, shapely.STRtree(circles)
# 0.1 mm in the BNG working CRS (EPSG:27700) — far below survey resolution; the
# same grid the postcode_boundaries overlay uses.
_OVERLAY_GRID_M = 1e-4
def _robust_intersection_area(a: np.ndarray, b: np.ndarray) -> np.ndarray:
"""Vectorized ``area(a[i] ∩ b[i])`` that survives GEOS robustness failures.
External Forest Research TOW/NFI polygons are occasionally invalid
(self-intersections), and a single bad polygon makes the batched
``shapely.intersection`` raise ``TopologyException: side location conflict``,
aborting the whole run. The fast path is the raw batched overlay unchanged,
full-speed, when the data is clean and only a failure triggers repair.
The repair deliberately uses a *plain* overlay rather than the fixed-precision
(``grid_size``) one: ``make_valid`` can emit a mixed-dimension
``GeometryCollection`` (a polygon plus a dangling line), which OverlayNG
rejects with ``Overlay input is mixed-dimension`` whereas a plain overlay
accepts it, and its non-polygonal debris has zero area and is dropped by the
``clipped_area > 0`` filter downstream anyway. A final pointwise coordinate
snap (which never raises) collapses the near-coincident edges behind any
residual full-precision robustness failure.
"""
try:
return shapely.area(shapely.intersection(a, b))
except shapely.errors.GEOSException:
pass
a = shapely.make_valid(a)
b = shapely.make_valid(b)
try:
return shapely.area(shapely.intersection(a, b))
except shapely.errors.GEOSException:
a = shapely.make_valid(shapely.set_precision(a, _OVERLAY_GRID_M, mode="pointwise"))
b = shapely.make_valid(shapely.set_precision(b, _OVERLAY_GRID_M, mode="pointwise"))
return shapely.area(shapely.intersection(a, b))
def _accumulate_clipped_area( def _accumulate_clipped_area(
geoms: np.ndarray, geoms: np.ndarray,
circles: np.ndarray, circles: np.ndarray,
@ -294,8 +331,8 @@ def _accumulate_clipped_area(
if geom_index.size == 0: if geom_index.size == 0:
return return
clipped_area = shapely.area( clipped_area = _robust_intersection_area(
shapely.intersection(geoms[geom_index], circles[postcode_index]) geoms[geom_index], circles[postcode_index]
) )
positive = clipped_area > 0 positive = clipped_area > 0
geom_index = geom_index[positive] geom_index = geom_index[positive]