This commit is contained in:
Andras Schmelczer 2026-02-10 22:21:15 +00:00
parent 1f68ca0512
commit 3599803589
43 changed files with 3578 additions and 262 deletions

View file

@ -32,6 +32,12 @@ def main() -> None:
parser.add_argument(
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
)
parser.add_argument(
"--greenspace",
type=Path,
default=None,
help="Greenspace/water parquet for boundary trimming (optional)",
)
args = parser.parse_args()
# Phase 1: Load all data
@ -115,7 +121,20 @@ def main() -> None:
print("Phase 4: Merging fragments and writing GeoJSON")
print("=" * 60)
merged = merge_fragments(all_fragments)
greenspace_tree = None
greenspace_geoms = None
if args.greenspace and args.greenspace.exists():
from .greenspace import load_greenspace
print(f" Loading greenspace/water from {args.greenspace}...")
greenspace_tree, greenspace_geoms = load_greenspace(args.greenspace)
print(f" Loaded {len(greenspace_geoms)} greenspace/water polygons")
merged = merge_fragments(
all_fragments,
greenspace_tree=greenspace_tree,
greenspace_geoms=greenspace_geoms,
)
print(f" Merged into {len(merged)} unique postcodes")
file_count = write_district_geojson(merged, args.output)

View file

@ -0,0 +1,65 @@
"""Load greenspace/water polygons and subtract them from postcode boundaries."""
from pathlib import Path
import polars as pl
from shapely import wkb
from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import unary_union
from shapely.strtree import STRtree
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Read greenspace/water polygons from parquet and index them spatially.

    Every value of the ``geometry`` column is decoded with ``wkb.loads`` and
    the decoded geometries are used to build a Shapely ``STRtree``.

    Args:
        path: Parquet file to read; it must provide a ``geometry`` column.

    Returns:
        ``(tree, geoms)`` where ``tree`` is the STRtree and ``geoms`` is the
        list of geometries it was built from, in column order.
    """
    encoded = pl.read_parquet(path)["geometry"].to_list()
    geoms = list(map(wkb.loads, encoded))
    return STRtree(geoms), geoms
MAX_REMOVAL_FRACTION = 0.9  # Keep original if >90% would be removed


def subtract_greenspace(
    postcode_geom: Polygon | MultiPolygon,
    tree: STRtree,
    geoms: list,
    *,
    max_removal_fraction: float = MAX_REMOVAL_FRACTION,
) -> Polygon | MultiPolygon:
    """Subtract park/water polygons that overlap the postcode geometry.

    The STRtree is queried with the ``intersects`` predicate, so only
    geometries that truly intersect the postcode (not merely its bounding
    box) are returned. Their union is then subtracted from the postcode
    polygon.

    Args:
        postcode_geom: Postcode boundary to trim.
        tree: STRtree built over ``geoms``; the indices it returns are used
            to look up entries of ``geoms``.
        geoms: Greenspace/water geometries, in the order indexed by ``tree``.
        max_removal_fraction: Largest share of the original area that may be
            removed. If the subtraction would remove more than this, the
            original geometry is kept (the postcode genuinely covers that
            land, e.g. churchyards, riverside addresses).

    Returns:
        The trimmed geometry, or ``postcode_geom`` itself when nothing
        intersects, when the subtraction would leave nothing, or when it
        would remove more than ``max_removal_fraction`` of the area.
    """
    # The predicate makes the tree run the exact intersection test itself,
    # replacing a Python-level loop over bounding-box candidates.
    hit_idxs = tree.query(postcode_geom, predicate="intersects")
    # query() returns an index array, so test its length rather than its
    # truthiness.
    if len(hit_idxs) == 0:
        return postcode_geom

    green_union = unary_union([geoms[idx] for idx in hit_idxs])
    result = postcode_geom.difference(green_union)
    if result.is_empty:
        return postcode_geom

    # Don't over-trim postcodes that genuinely cover green/water areas
    original_area = postcode_geom.area
    if original_area > 0 and result.area / original_area < (1 - max_removal_fraction):
        return postcode_geom
    return result

View file

@ -63,10 +63,34 @@ def to_wgs84_geojson(
}
def _fill_holes(geom):
    """Return *geom* with every interior ring (hole) dropped.

    A Polygon is rebuilt from its exterior ring only, and a MultiPolygon is
    rebuilt part by part in the same way. Any other geometry type is
    returned untouched.
    """
    kind = geom.geom_type
    if kind == "MultiPolygon":
        return MultiPolygon([Polygon(part.exterior) for part in geom.geoms])
    if kind == "Polygon":
        return Polygon(geom.exterior)
    return geom
def _largest_polygon(geom):
    """Return the largest-area part of a MultiPolygon.

    Geometries of any other type are returned unchanged. An empty
    MultiPolygon (no parts) is also returned unchanged instead of raising.
    """
    if geom.geom_type != "MultiPolygon":
        return geom
    # default= covers an empty MultiPolygon, for which max() would otherwise
    # raise ValueError ("max() arg is an empty sequence").
    return max(geom.geoms, key=lambda part: part.area, default=geom)
def merge_fragments(
all_fragments: list[tuple[str, Polygon | MultiPolygon]],
greenspace_tree=None,
greenspace_geoms=None,
) -> dict[str, Polygon | MultiPolygon]:
"""Merge cross-OA fragments for postcodes spanning multiple OAs."""
"""Merge cross-OA fragments for postcodes spanning multiple OAs.
Args:
all_fragments: List of (postcode, geometry) pairs.
greenspace_tree: Optional STRtree of park/water polygons.
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
"""
by_postcode: dict[str, list] = defaultdict(list)
for pc, geom in all_fragments:
by_postcode[pc].append(geom)
@ -80,13 +104,25 @@ def merge_fragments(
combined = make_valid(combined)
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
if combined.geom_type == "MultiPolygon":
combined = combined.buffer(1.0).buffer(-1.0)
combined = combined.buffer(5.0).buffer(-5.0)
if not combined.is_valid:
combined = make_valid(combined)
# Postcodes are contiguous delivery routes — keep only the largest
# polygon; small detached fragments are algorithm artifacts
if combined.geom_type == "MultiPolygon":
combined = max(combined.geoms, key=lambda g: g.area)
combined = _largest_polygon(combined)
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
combined = _fill_holes(combined)
# Subtract parks/water if provided
if greenspace_tree is not None and greenspace_geoms is not None:
from .greenspace import subtract_greenspace
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _largest_polygon(combined)
combined = _fill_holes(combined)
# Revert if subtraction + fragment selection lost >90% of area
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
combined = pre_green
merged[pc] = combined
return merged

View file

@ -9,7 +9,8 @@ import pytest
from shapely.geometry import MultiPolygon, Polygon, box
from .oa_boundaries import parse_gpkg_geometry
from .output import merge_fragments, to_wgs84_geojson
from .greenspace import subtract_greenspace
from .output import _fill_holes, merge_fragments, to_wgs84_geojson
from .process_oa import _extract_polygonal, process_oa
from .uprn import get_oa_uprns, load_uprns
from .voronoi import _equal_split_fallback, compute_voronoi_regions
@ -426,3 +427,143 @@ class TestParseGpkgGeometry:
blob = bytes([0x47, 0x50, 0x00, 0b00001010]) + b"\x00" * 100
with pytest.raises(ValueError, match="Unknown GeoPackage envelope type 5"):
parse_gpkg_geometry(blob)
# ---------------------------------------------------------------------------
# _fill_holes removes interior rings
# ---------------------------------------------------------------------------
class TestFillHoles:
    """_fill_holes must strip every interior ring from polygonal input."""

    def test_polygon_with_hole(self):
        """A holed polygon becomes solid, with the area of its exterior."""
        shell = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
        cutout = [(30, 30), (70, 30), (70, 70), (30, 70), (30, 30)]
        holed = Polygon(shell, [cutout])
        assert len(list(holed.interiors)) == 1

        filled = _fill_holes(holed)

        assert filled.geom_type == "Polygon"
        assert len(list(filled.interiors)) == 0
        assert filled.area == pytest.approx(Polygon(shell).area)

    def test_multipolygon_with_holes(self):
        """Every part of a MultiPolygon loses its holes."""
        first = Polygon(
            [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)],
            [[(10, 10), (20, 10), (20, 20), (10, 20), (10, 10)]],
        )
        second = Polygon(
            [(60, 60), (110, 60), (110, 110), (60, 110), (60, 60)],
            [[(70, 70), (80, 70), (80, 80), (70, 80), (70, 70)]],
        )

        filled = _fill_holes(MultiPolygon([first, second]))

        assert filled.geom_type == "MultiPolygon"
        for part in filled.geoms:
            assert len(list(part.interiors)) == 0

    def test_polygon_without_hole_unchanged(self):
        """A polygon that has no holes keeps its area."""
        solid = box(0, 0, 100, 100)
        assert _fill_holes(solid).area == pytest.approx(solid.area)
# ---------------------------------------------------------------------------
# Improved merge with 5m buffer closes 3m gaps
# ---------------------------------------------------------------------------
class TestMergeImprovedBuffer:
    """merge_fragments should bridge a 3m gap and leave no interior holes."""

    def test_3m_gap_merged(self):
        """Fragments separated by a 3m gap come back as one Polygon."""
        postcode = "AA1 1AA"
        fragments = [
            (postcode, box(0, 0, 50, 100)),
            (postcode, box(53, 0, 100, 100)),  # 3m gap at x=50..53
        ]

        merged = merge_fragments(fragments)

        assert "AA1 1AA" in merged
        geom = merged["AA1 1AA"]
        assert geom.geom_type == "Polygon", (
            f"Expected single Polygon after merging 3m gap, got {geom.geom_type}"
        )

    def test_holes_removed_after_merge(self):
        """A ring fragment plus its missing centre merges without holes."""
        centre = box(30, 30, 70, 70)
        # Donut-shaped fragment: the outer square with the centre cut out
        donut = box(0, 0, 100, 100).difference(centre)

        # The centre is supplied as a separate fragment of the same postcode
        merged = merge_fragments([("AA1 1AA", donut), ("AA1 1AA", centre)])

        assert "AA1 1AA" in merged
        geom = merged["AA1 1AA"]
        assert len(list(geom.interiors)) == 0, "Merged polygon should have no holes"
# ---------------------------------------------------------------------------
# subtract_greenspace
# ---------------------------------------------------------------------------
class TestSubtractGreenspace:
    """subtract_greenspace must remove park/water area from postcode polygons."""

    @staticmethod
    def _trim(postcode, park):
        """Index *park* alone in an STRtree and subtract it from *postcode*."""
        from shapely.strtree import STRtree

        parks = [park]
        return subtract_greenspace(postcode, STRtree(parks), parks)

    def test_park_subtracted(self):
        """A park overlapping a postcode should reduce its area."""
        # 10000 sqm postcode, 4000 sqm of it covered by the park on the right
        trimmed = self._trim(box(0, 0, 100, 100), box(60, 0, 100, 100))
        # Should have lost ~4000 sqm
        assert trimmed.area == pytest.approx(6000, rel=0.01)

    def test_no_greenspace_unchanged(self):
        """With no overlapping greenspace, the geometry should be unchanged."""
        postcode = box(0, 0, 100, 100)
        far_park = box(200, 200, 300, 300)  # far away
        trimmed = self._trim(postcode, far_park)
        assert trimmed.area == pytest.approx(postcode.area)

    def test_full_overlap_preserves_postcode(self):
        """If greenspace covers the entire postcode, keep the original."""
        postcode = box(0, 0, 100, 100)
        covering_park = box(-10, -10, 110, 110)  # completely covers postcode
        trimmed = self._trim(postcode, covering_park)
        # Should keep original since subtraction would erase entirely
        assert trimmed.area == pytest.approx(postcode.area)

    def test_over_90pct_removal_preserves_postcode(self):
        """If greenspace would remove >90% of area, keep the original."""
        postcode = box(0, 0, 100, 100)  # 10000 sqm
        big_park = box(5, 0, 100, 100)  # 9500 sqm overlap = 95% removal
        trimmed = self._trim(postcode, big_park)
        # Should keep original since >90% would be removed
        assert trimmed.area == pytest.approx(postcode.area)

    def test_under_90pct_removal_subtracts(self):
        """If greenspace removes <90%, subtraction should proceed."""
        postcode = box(0, 0, 100, 100)  # 10000 sqm
        park = box(20, 0, 100, 100)  # 8000 sqm overlap = 80% removal
        trimmed = self._trim(postcode, park)
        # 80% < 90% cap, so subtraction should happen
        assert trimmed.area == pytest.approx(2000, rel=0.01)

View file

@ -36,9 +36,10 @@ def main():
df = pl.read_parquet(args.input)
print(f" {len(df):,} rows, {len(df.columns)} columns")
# Drop existing estimated price column if re-running
if "Estimated current price" in df.columns:
df = df.drop("Estimated current price")
# Drop existing estimated columns if re-running
for col in ["Estimated current price", "Est. price per sqm"]:
if col in df.columns:
df = df.drop(col)
# Derive helper columns for the join
has_price = (
@ -126,6 +127,14 @@ def main():
.alias("Estimated current price"),
)
# Derive estimated price per sqm where both estimated price and floor area exist
df = df.with_columns(
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
.round(0)
.cast(pl.Int32)
.alias("Est. price per sqm"),
)
n_adjusted = df.filter(
has_price & pl.col("_log_index_sale").is_not_null()
).height