This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -11,12 +11,20 @@ import pytest
from shapely.geometry import MultiPolygon, Polygon, box
from shapely.ops import unary_union
from .fragments_cache import (
fragments_cache_is_fresh,
load_fragments,
save_fragments,
)
from .__main__ import _oa_fragments, _process_oas
from .inspire import build_inspire_index
from .oa_boundaries import parse_gpkg_geometry
from .greenspace import subtract_greenspace
from .output import (
_fill_holes,
merge_fragments,
to_wgs84_geojson,
to_wgs84_geojson_multi,
write_district_geojson,
)
from .process_oa import _extract_polygonal, process_oa
@ -173,6 +181,52 @@ class TestWhitespacePostcodes:
assert loaded_df["PCDS"].to_list() == ["AA1 1AB"]
def test_remapped_terminated_postcode_adopts_successor_oa(self, tmp_path):
    """A terminated postcode remapped to its successor takes the successor's OA.

    Terminated AA1 1AA sits in OA E00000001, while its only active successor,
    AA1 1AB, is recorded against a different OA (E00000002) and different
    coordinates. The remapped seed point must carry the successor's OA and
    coordinates; keeping the terminated postcode's OA21CD would seed the
    successor into an OA it doesn't belong to and split its boundary across
    OAs.
    """
    uprn_path = tmp_path / "uprn.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # One UPRN, still labelled with the terminated postcode and its OA.
    pl.DataFrame(
        {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["AA1 1AA"],
            "OA21CD": ["E00000001"],
        }
    ).write_parquet(uprn_path)

    # Postcode lookup: AA1 1AA carries a termination date, AA1 1AB does not,
    # so AA1 1AB is the only active successor — and it belongs to another OA.
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "east1m": [500010, 500030],
            "north1m": [180010, 180020],
            "oa21cd": ["E00000001", "E00000002"],
            "doterm": ["2020-01-01", None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    ).write_parquet(arcgis_path)

    loaded_df, offsets = load_uprns(uprn_path, arcgis_path)

    # The remapped point must be grouped under the successor's OA only.
    assert "E00000002" in offsets, "Successor OA missing — remap kept old OA"
    assert "E00000001" not in offsets, (
        "Remapped point still lives in the terminated postcode's OA"
    )

    # It must also report the successor's postcode and authoritative coords.
    points, postcodes = get_oa_uprns(loaded_df, offsets, "E00000002")
    assert postcodes == ["AA1 1AB"]
    assert points.tolist() == [[500030.0, 180020.0]]
def test_arcgis_filters_to_active_english_postcodes(self, tmp_path):
uprns = pl.DataFrame(
{
@ -617,6 +671,32 @@ class TestProcessOAInspireParcelAssignment:
for _, geom in fragments:
assert geom.difference(oa_geom).area < 0.01
def test_shared_parcel_keeps_every_contained_postcode(self):
    """Every postcode with a UPRN inside a shared parcel keeps a fragment.

    One parcel holds UPRNs for [A, A, B]. Both A and B must come out with a
    fragment. Previously the majority winner (A) claimed the whole parcel,
    excluding it from `remaining`, so B's UPRNs were trapped inside claimed
    land and B produced no fragment at all.
    """
    oa_geom = box(0, 0, 100, 100)
    # A single parcel that covers the entire OA.
    shared_parcel = box(0, 0, 100, 100)
    # Two seeds for A (the majority) on the left, one for B on the right.
    seed_points = np.array([[20, 50], [30, 50], [80, 50]])
    seed_postcodes = ["A", "A", "B"]

    fragments = process_oa(
        oa_geom,
        seed_points,
        seed_postcodes,
        inspire_candidates=[shared_parcel],
    )
    by_postcode = dict(fragments)

    assert "A" in by_postcode, "Majority postcode A must keep a fragment"
    assert "B" in by_postcode, "Minority postcode B must not be dropped"
    geom_a = by_postcode["A"]
    geom_b = by_postcode["B"]
    assert geom_a.area > 0
    assert geom_b.area > 0
    # The two fragments must partition the parcel without overlapping.
    assert geom_a.intersection(geom_b).area < 0.01
# ---------------------------------------------------------------------------
# _extract_polygonal helper
@ -656,6 +736,21 @@ class TestExtractPolygonal:
assert _extract_polygonal(LineString([(0, 0), (1, 1)])) is None
def test_overlapping_collection_unioned_to_valid(self):
    """Overlapping polygons in a GeometryCollection come back as one valid union.

    Wrapping the members in a raw MultiPolygon would be invalid and crash the
    next .difference(); the result must be valid and must not double-count
    the overlapping area.
    """
    from shapely.geometry import GeometryCollection

    first = box(0, 0, 100, 100)
    second = box(50, 50, 150, 150)  # shares a 50x50 corner with `first`
    collection = GeometryCollection([first, second])

    cleaned = _extract_polygonal(collection)

    assert cleaned is not None
    assert cleaned.is_valid
    union_area = unary_union([first, second]).area
    assert cleaned.area == pytest.approx(union_area)
    # The operation that used to crash must now succeed on the result.
    trimmed = cleaned.difference(box(0, 0, 10, 10))
    assert trimmed.is_valid
# ---------------------------------------------------------------------------
# Edge case: merge_fragments handles single-OA postcodes
@ -763,12 +858,12 @@ class TestParseGpkgGeometry:
class TestFillHoles:
"""_fill_holes must remove all interior holes from polygons."""
"""_fill_holes fills small artifact holes but keeps large (real-enclosed) ones."""
def test_polygon_with_hole(self):
"""A polygon with an interior ring should become a solid polygon."""
def test_small_artifact_hole_filled(self):
"""A small (<1000 m²) interior ring is an artifact and gets filled."""
outer = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
hole = [(30, 30), (70, 30), (70, 70), (30, 70), (30, 30)]
hole = [(40, 40), (60, 40), (60, 60), (40, 60), (40, 40)] # 20x20 = 400 m²
poly_with_hole = Polygon(outer, [hole])
assert len(list(poly_with_hole.interiors)) == 1
result = _fill_holes(poly_with_hole)
@ -776,6 +871,15 @@ class TestFillHoles:
assert len(list(result.interiors)) == 0
assert result.area == pytest.approx(Polygon(outer).area)
def test_large_hole_kept(self):
    """A hole of at least 1000 m² is treated as a real enclosure and survives."""
    shell = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
    # 60x60 = 3600 m², well above the artifact cut-off.
    big_hole = [(20, 20), (80, 20), (80, 80), (20, 80), (20, 20)]

    result = _fill_holes(Polygon(shell, [big_hole]))

    remaining_rings = list(result.interiors)
    assert len(remaining_rings) == 1
    assert result.area == pytest.approx(10000 - 3600)
def test_multipolygon_with_holes(self):
"""A MultiPolygon where each part has holes should have all holes removed."""
outer1 = [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)]
@ -944,3 +1048,356 @@ class TestGreenspaceHolePreserved:
merged = result["TEST1"]
assert len(list(merged.interiors)) == 1
assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
# ---------------------------------------------------------------------------
# merge_fragments keeps substantial detached parts (no OA-seam coverage gaps)
# ---------------------------------------------------------------------------
class TestKeepDetachedParts:
    """merge_fragments keeps every substantial part of a split postcode.

    A postcode split across an OA seam (railway/river) must keep both parts
    instead of dropping all but the largest, which left ~1.8% uncovered gaps.
    """

    def test_far_apart_parts_both_kept(self):
        """Two equal blocks separated by more than the merge buffer both survive."""
        # Two 50x50 blocks (2500 m² each) with a 30m gap between them — wider
        # than the 10m merge buffer, so they cannot be bridged into one part.
        west = box(0, 0, 50, 50)
        east = box(80, 0, 130, 50)

        merged = merge_fragments([("AA1 1AA", west), ("AA1 1AA", east)])
        geom = merged["AA1 1AA"]

        assert geom.geom_type == "MultiPolygon"
        assert len(geom.geoms) == 2
        assert geom.area == pytest.approx(5000, rel=0.01)

    def test_tiny_noise_part_dropped(self):
        """A detached speck below the size threshold is discarded."""
        body = box(0, 0, 100, 100)  # 10000 m²
        speck = box(200, 200, 205, 205)  # 25 m², under the 100 m² threshold

        merged = merge_fragments([("AA1 1AA", body), ("AA1 1AA", speck)])
        geom = merged["AA1 1AA"]

        assert geom.geom_type == "Polygon"
        assert geom.area == pytest.approx(10000, rel=0.01)
class TestMultiPolygonOutput:
    """Split postcodes must be emitted as GeoJSON MultiPolygon.

    Covers both to_wgs84_geojson_multi and the district writer (the Rust
    server + loader already parse MultiPolygon).
    """

    @staticmethod
    def _two_part_postcode():
        """Build a MultiPolygon made of two detached 100x100 boxes."""
        parts = [
            box(530000, 180000, 530100, 180100),
            box(531000, 180000, 531100, 180100),
        ]
        return MultiPolygon(parts)

    def test_multipolygon_preserves_all_parts(self):
        """Both parts survive conversion and round-trip through shapely."""
        from shapely.geometry import shape

        gj = to_wgs84_geojson_multi(self._two_part_postcode())

        assert gj["type"] == "MultiPolygon"
        assert len(gj["coordinates"]) == 2
        round_tripped = shape(gj)
        assert round_tripped.is_valid and not round_tripped.is_empty
        assert len(round_tripped.geoms) == 2

    def test_single_part_stays_polygon(self):
        """A plain polygon is not promoted to a MultiPolygon."""
        single = box(530000, 180000, 530100, 180100)
        gj = to_wgs84_geojson_multi(single)
        assert gj["type"] == "Polygon"

    def test_writer_emits_multipolygon_feature(self, tmp_path):
        """The district writer stores a split postcode as one MultiPolygon feature."""
        written = write_district_geojson(
            {"AA1 1AA": self._two_part_postcode()}, tmp_path
        )
        assert written == 1

        district_file = tmp_path / "units" / "AA1.geojson"
        coll = json.loads(district_file.read_text())
        first_feature = coll["features"][0]
        assert first_feature["geometry"]["type"] == "MultiPolygon"
class TestOutputPartition:
    """The writer must emit a partition of the ground.

    Overlapping postcodes are made disjoint (no two cover the same ground)
    without dropping an active postcode.
    """

    @staticmethod
    def _write_and_reload(postcodes, out_dir):
        """Write `postcodes` and read the AA1 district back as {postcode: geometry}."""
        from shapely.geometry import shape

        write_district_geojson(postcodes, out_dir)
        district_file = out_dir / "units" / "AA1.geojson"
        coll = json.loads(district_file.read_text())
        return {
            feature["properties"]["postcodes"]: shape(feature["geometry"])
            for feature in coll["features"]
        }

    def test_overlapping_postcodes_made_disjoint(self, tmp_path):
        """Two postcodes sharing a strip end up with disjoint interiors."""
        a = box(530000, 180000, 530100, 180100)
        b = box(530090, 180000, 530200, 180100)  # overlaps `a` in a 10m strip
        assert a.intersection(b).area > 0  # precondition: they overlap

        geoms = self._write_and_reload({"AA1 1AA": a, "AA1 1AB": b}, tmp_path)

        assert set(geoms) == {"AA1 1AA", "AA1 1AB"}  # neither dropped
        # Interiors are disjoint (the two may share at most an edge).
        shared = geoms["AA1 1AA"].intersection(geoms["AA1 1AB"])
        assert shared.area == pytest.approx(0.0, abs=1e-12)
        assert all(g.area > 0 for g in geoms.values())

    def test_enclosed_postcode_makes_container_a_donut(self, tmp_path):
        """A postcode fully INSIDE another stays disjoint from its container.

        The smaller (inner) postcode keeps its area and the container gets a
        hole. A plain `overlaps` query misses containment, so this is the
        regression guard for that fix.
        """
        outer = box(530000, 180000, 530300, 180300)  # 90,000 m²
        inner = box(530100, 180100, 530200, 180200)  # 10,000 m², inside outer
        assert outer.contains(inner)  # precondition

        geoms = self._write_and_reload(
            {"AA1 1AA": outer, "AA1 1AB": inner}, tmp_path
        )

        assert set(geoms) == {"AA1 1AA", "AA1 1AB"}  # neither dropped
        container = geoms["AA1 1AA"]
        enclosed = geoms["AA1 1AB"]
        assert container.intersection(enclosed).area == pytest.approx(
            0.0, abs=1e-12
        )
        # The container is now a donut around the enclosed postcode.
        assert container.geom_type == "Polygon"
        assert len(list(container.interiors)) == 1
        assert enclosed.area > 0
# ---------------------------------------------------------------------------
# InspireIndex must return the same candidates as a brute-force bbox scan
# ---------------------------------------------------------------------------
class TestInspireIndex:
    """InspireIndex must agree with a brute-force bbox scan.

    The grid index replaces a per-OA linear scan of all parcel bboxes; it must
    return an identical candidate set (and order) so Phase 3 output is
    unchanged.

    The query rectangle is called `query` throughout rather than `box`, so it
    never shadows the `box` constructor this module imports from
    shapely.geometry.
    """

    @staticmethod
    def _brute(bboxes, query):
        """Return the row indices of `bboxes` that intersect `query`.

        `bboxes` is an array with columns (min_e, min_n, max_e, max_n) and
        `query` is a 4-tuple in the same order. The comparisons are inclusive,
        so rectangles that merely touch count as intersecting.
        """
        e0, n0, e1, n1 = query
        mask = (
            (bboxes[:, 2] >= e0)
            & (bboxes[:, 0] <= e1)
            & (bboxes[:, 3] >= n0)
            & (bboxes[:, 1] <= n1)
        )
        return np.where(mask)[0]

    def test_matches_brute_force_over_random_queries(self):
        """Random queries of several sizes return exactly the brute-force set."""
        rng = np.random.default_rng(0)
        x = rng.uniform(0, 10000, 5000)
        y = rng.uniform(0, 10000, 5000)
        w = rng.uniform(1, 60, 5000)  # all <= 500m cell → CSR path
        h = rng.uniform(1, 60, 5000)
        bboxes = np.column_stack([x, y, x + w, y + h]).astype(np.float64)
        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)

        for _ in range(400):
            cx, cy = rng.uniform(0, 10000), rng.uniform(0, 10000)
            sz = float(rng.choice([30.0, 200.0, 1000.0, 3000.0]))
            query = (cx, cy, cx + sz, cy + sz)
            got = idx.candidate_indices(query)
            # Comparing against the sorted brute-force result also pins the
            # order of the returned indices.
            expected = np.sort(self._brute(bboxes, query))
            assert np.array_equal(got, expected)

    def test_oversized_parcel_is_found(self):
        """A parcel larger than a grid cell is still returned for interior queries."""
        # A parcel larger than a cell goes to the overflow list, not the grid;
        # a query deep inside it (away from the small parcels) must still find it.
        bboxes = np.array(
            [
                [0.0, 0.0, 5000.0, 5000.0],  # 5km parcel >> 500m cell
                [100.0, 100.0, 120.0, 120.0],
                [4000.0, 4000.0, 4020.0, 4020.0],
            ]
        )
        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)
        query = (2000.0, 2000.0, 2050.0, 2050.0)
        got = idx.candidate_indices(query)
        assert 0 in got
        assert np.array_equal(got, np.sort(self._brute(bboxes, query)))

    def test_no_overlap_returns_empty(self):
        """A query far from every parcel yields no candidates."""
        bboxes = np.array([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]])
        idx = build_inspire_index(bboxes, None, None, cell_size=500.0)
        assert len(idx.candidate_indices((100.0, 100.0, 110.0, 110.0))) == 0
# ---------------------------------------------------------------------------
# Parallel OA processing must match the sequential result exactly
# ---------------------------------------------------------------------------
class TestParallelProcessing:
    """_process_oas must give the same fragments whatever the worker count.

    Uses single-postcode OAs (fast path), so it exercises the chunking + WKB
    round-trip + fork machinery without needing INSPIRE data.
    """

    @staticmethod
    def _inputs(n_oas=60):
        """Build synthetic _process_oas inputs: `n_oas` OAs with two UPRNs each.

        Returns (codes, oa_geoms, east, north, postcodes, offsets), where
        `offsets` maps each OA code to its (start, end) slice in the point
        arrays.
        """
        import pyarrow as pa

        oa_geoms = {}
        for i in range(n_oas):
            # 50x50 squares laid out on a 100-unit pitch along the x axis.
            oa_geoms[f"E{i:08d}"] = box(i * 100.0, 0.0, i * 100.0 + 50.0, 50.0)
        codes = sorted(oa_geoms)
        count = len(codes)

        # Two points per OA, both inside that OA's square.
        east = [i * 100.0 + dx for i in range(count) for dx in (10.0, 20.0)]
        north = [10.0, 20.0] * count
        # Both points of an OA share a postcode: one postcode per OA → fast path.
        pcs = [f"AA{i % 5} {i % 9}AA" for i in range(count) for _ in range(2)]
        offsets = {code: (2 * i, 2 * i + 2) for i, code in enumerate(codes)}

        return (
            codes,
            oa_geoms,
            np.array(east),
            np.array(north),
            pa.array(pcs, type=pa.large_string()),
            offsets,
        )

    @staticmethod
    def _norm(frags):
        """Reduce fragments to a sorted, comparable list of (postcode, WKB hex)."""
        keyed = [(pc, geom.wkb_hex) for pc, geom in frags]
        keyed.sort()
        return keyed

    def test_parallel_matches_sequential(self):
        """Three workers produce exactly what a single worker produces."""
        codes, oa, east, north, pcs, offs = self._inputs()

        seq, s1 = _process_oas(codes, oa, east, north, pcs, offs, None, workers=1)
        par, s2 = _process_oas(codes, oa, east, north, pcs, offs, None, workers=3)

        expected = len(codes)
        assert len(seq) == expected  # one fragment per single-postcode OA
        assert s1 == s2 == expected
        assert self._norm(seq) == self._norm(par)

    def test_oa_failure_is_tagged_with_oa_code(self):
        """A per-OA failure re-raises with the OA code in its message.

        That makes a single bad OA attributable instead of an anonymous
        worker abort.
        """
        # The OA is absent from the geoms dict → KeyError, wrapped with the code.
        bad_code = "E00099999"
        with pytest.raises(RuntimeError, match="E00099999"):
            _oa_fragments(bad_code, {}, None, None, None, {}, None)
class TestDegenerateGeometryHandling:
    """Degenerate geometries are rescued where possible and never abort a write.

    Every active postcode must keep a boundary (validate_outputs is strict),
    so a sub-grid sliver is fattened rather than dropped. A genuinely empty
    geometry is skipped without aborting the whole write (the 10h regression).
    """

    # Three near-collinear vertices in BNG: bbox ~28m x 7m but area ~0.04 m²,
    # i.e. AL10 0TU. Without the rescue it snaps to empty at output precision.
    SLIVER = Polygon(
        [(523045.34, 209625.56), (523040.47, 209624.33), (523017.0, 209618.42)]
    )

    @staticmethod
    def _written_postcodes(out_dir):
        """Return the set of postcodes present in the written AA1 district file."""
        district_file = out_dir / "units" / "AA1.geojson"
        collection = json.loads(district_file.read_text())
        return {f["properties"]["postcodes"] for f in collection["features"]}

    def test_sliver_is_rescued_to_valid_geometry(self):
        """The sliver converts to a non-empty, valid GeoJSON geometry."""
        from shapely.geometry import shape

        result = to_wgs84_geojson(self.SLIVER)

        assert result is not None, "sliver must be rescued, not dropped"
        round_tripped = shape(result)
        assert not round_tripped.is_empty
        assert round_tripped.is_valid

    def test_collinear_zero_area_input_is_rescued(self):
        """A zero-area collinear 'polygon' is rescued rather than dropped.

        It can't be cleaned to a polygon, so it must be rescued via the
        representative-point fallback.
        """
        from shapely.geometry import shape

        collinear_ring = [
            (523000, 209600),
            (523010, 209600),
            (523020, 209600),
            (523000, 209600),
        ]
        degenerate = Polygon(collinear_ring)
        assert degenerate.area == 0.0

        result = to_wgs84_geojson(degenerate)

        assert result is not None, "degenerate input must be rescued, not dropped"
        round_tripped = shape(result)
        assert not round_tripped.is_empty
        assert round_tripped.is_valid

    def test_sliver_postcode_present_in_output(self, tmp_path):
        """A sliver postcode is written alongside a normal one."""
        postcodes = {
            "AA1 1AA": box(530000, 180000, 530100, 180100),
            "AA1 1AB": self.SLIVER,  # must survive
        }

        assert write_district_geojson(postcodes, tmp_path) == 1
        assert self._written_postcodes(tmp_path) == {"AA1 1AA", "AA1 1AB"}

    def test_empty_geometry_skipped_not_raised(self, tmp_path):
        """An empty geometry is left out of the output instead of raising."""
        # The last-resort safety net: an unrescuable (empty) geometry is skipped
        # so one bad postcode can never abort a multi-hour run.
        postcodes = {
            "AA1 1AA": box(530000, 180000, 530100, 180100),
            "AA1 1AB": Polygon(),  # genuinely empty
        }

        assert write_district_geojson(postcodes, tmp_path) == 1
        assert self._written_postcodes(tmp_path) == {"AA1 1AA"}
# ---------------------------------------------------------------------------
# fragments_cache round-trips Phase 3 output and validates freshness
# ---------------------------------------------------------------------------
class TestFragmentsCache:
    """Persisting Phase 3 lets a crashed run resume without the ~10h OA loop."""

    @staticmethod
    def _file_with_mtime(path, content, mtime):
        """Create `path` holding `content` and pin its access/modify time to `mtime`."""
        import os

        path.write_text(content)
        os.utime(path, (mtime, mtime))
        return path

    def test_round_trip_preserves_postcodes_and_geometry(self, tmp_path):
        """Saving then loading returns the same postcodes, in order, with equal geometry."""
        fragments = [
            ("AA1 1AA", box(0, 0, 100, 100)),
            ("AA1 1AB", box(200, 200, 250, 260)),
            # A postcode spanning multiple OAs appears as repeated entries.
            ("AA1 1AA", box(100, 0, 150, 100)),
            ("AA1 1AC", MultiPolygon([box(0, 0, 10, 10), box(20, 20, 30, 30)])),
        ]
        cache = tmp_path / "fragments_cache.parquet"

        save_fragments(cache, fragments)
        loaded = load_fragments(cache)

        saved_codes = [pc for pc, _ in fragments]
        loaded_codes = [pc for pc, _ in loaded]
        assert loaded_codes == saved_codes
        for (_, original), (_, restored) in zip(fragments, loaded):
            assert restored.equals(original)

    def test_save_is_atomic_no_tmp_left_behind(self, tmp_path):
        """A successful save leaves the cache file and no `.tmp` sibling."""
        cache = tmp_path / "fragments_cache.parquet"
        save_fragments(cache, [("AA1 1AA", box(0, 0, 1, 1))])

        leftover = tmp_path / "fragments_cache.parquet.tmp"
        assert cache.exists()
        assert not leftover.exists()

    def test_missing_cache_is_not_fresh(self, tmp_path):
        """A cache file that does not exist is never fresh."""
        inp = tmp_path / "uprn.parquet"
        inp.write_text("x")
        cache = tmp_path / "fragments_cache.parquet"
        assert fragments_cache_is_fresh(cache, [inp]) is False

    def test_cache_newer_than_inputs_is_fresh(self, tmp_path):
        """A cache modified after its input is fresh; a `None` input is tolerated."""
        inp = self._file_with_mtime(tmp_path / "uprn.parquet", "x", 1_000)
        cache = self._file_with_mtime(
            tmp_path / "fragments_cache.parquet", "c", 2_000
        )
        assert fragments_cache_is_fresh(cache, [inp, None]) is True

    def test_cache_older_than_any_input_is_stale(self, tmp_path):
        """An input touched after the cache was written makes the cache stale."""
        cache = self._file_with_mtime(
            tmp_path / "fragments_cache.parquet", "c", 1_000
        )
        inp = self._file_with_mtime(tmp_path / "oa.gpkg", "x", 2_000)
        assert fragments_cache_is_fresh(cache, [inp]) is False

    def test_missing_input_is_ignored(self, tmp_path):
        """An input path that does not exist cannot invalidate the cache."""
        cache = tmp_path / "fragments_cache.parquet"
        cache.write_text("c")
        # arcgis is optional/absent — it cannot have invalidated the cache.
        absent_input = tmp_path / "absent.parquet"
        assert fragments_cache_is_fresh(cache, [absent_input]) is True