More
This commit is contained in:
parent
1f68ca0512
commit
3599803589
43 changed files with 3578 additions and 262 deletions
|
|
@ -32,6 +32,12 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--greenspace",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Greenspace/water parquet for boundary trimming (optional)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Phase 1: Load all data
|
||||
|
|
@ -115,7 +121,20 @@ def main() -> None:
|
|||
print("Phase 4: Merging fragments and writing GeoJSON")
|
||||
print("=" * 60)
|
||||
|
||||
merged = merge_fragments(all_fragments)
|
||||
greenspace_tree = None
|
||||
greenspace_geoms = None
|
||||
if args.greenspace and args.greenspace.exists():
|
||||
from .greenspace import load_greenspace
|
||||
|
||||
print(f" Loading greenspace/water from {args.greenspace}...")
|
||||
greenspace_tree, greenspace_geoms = load_greenspace(args.greenspace)
|
||||
print(f" Loaded {len(greenspace_geoms)} greenspace/water polygons")
|
||||
|
||||
merged = merge_fragments(
|
||||
all_fragments,
|
||||
greenspace_tree=greenspace_tree,
|
||||
greenspace_geoms=greenspace_geoms,
|
||||
)
|
||||
print(f" Merged into {len(merged)} unique postcodes")
|
||||
|
||||
file_count = write_district_geojson(merged, args.output)
|
||||
|
|
|
|||
65
pipeline/transform/postcode_boundaries/greenspace.py
Normal file
65
pipeline/transform/postcode_boundaries/greenspace.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
"""Load greenspace/water polygons and subtract them from postcode boundaries."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from shapely import wkb
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from shapely.ops import unary_union
|
||||
from shapely.strtree import STRtree
|
||||
|
||||
|
||||
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Read the greenspace parquet at *path* and index its geometries.

    Every value of the ``"geometry"`` column is decoded with
    ``shapely.wkb.loads`` and the decoded geometries are fed to an STRtree.

    Returns:
        A ``(tree, geoms)`` pair: the STRtree, and the list of decoded
        geometries in the same order they were handed to the tree.
    """
    encoded = pl.read_parquet(path)["geometry"].to_list()
    shapes = list(map(wkb.loads, encoded))
    return STRtree(shapes), shapes
|
||||
|
||||
|
||||
# Largest share of a postcode's area that greenspace subtraction may remove.
# subtract_greenspace keeps the original geometry when more than this
# fraction (>90%) would be removed.
MAX_REMOVAL_FRACTION = 0.9
|
||||
|
||||
|
||||
def subtract_greenspace(
    postcode_geom: Polygon | MultiPolygon,
    tree: STRtree,
    geoms: list,
) -> Polygon | MultiPolygon:
    """Subtract park/water polygons that overlap the postcode geometry.

    The STRtree is queried with ``predicate="intersects"`` so that both the
    bounding-box candidate lookup and the exact intersection test happen
    inside the tree query, instead of filtering candidates one by one in a
    Python loop. The union of the matching greenspace is then subtracted
    from the postcode polygon.

    Args:
        postcode_geom: Postcode geometry to trim.
        tree: STRtree built over ``geoms``.
        geoms: Geometries indexed by ``tree``; the integer indices returned
            by the tree query are used to look up entries in this list.

    Returns:
        The trimmed geometry. ``postcode_geom`` is returned unchanged when
        no greenspace intersects it, when the difference is empty, or when
        subtraction would remove more than ``MAX_REMOVAL_FRACTION`` of its
        area (the postcode genuinely covers that land, e.g. churchyards,
        riverside addresses).
    """
    hit_idxs = tree.query(postcode_geom, predicate="intersects")
    # The query returns an array of indices, so test its length rather than
    # its truthiness.
    if len(hit_idxs) == 0:
        return postcode_geom

    green_union = unary_union([geoms[idx] for idx in hit_idxs])
    result = postcode_geom.difference(green_union)

    if result.is_empty:
        return postcode_geom

    # Don't over-trim postcodes that genuinely cover green/water areas.
    original_area = postcode_geom.area
    if original_area > 0 and result.area / original_area < (1 - MAX_REMOVAL_FRACTION):
        return postcode_geom

    return result
|
||||
|
|
@ -63,10 +63,34 @@ def to_wgs84_geojson(
|
|||
}
|
||||
|
||||
|
||||
def _fill_holes(geom):
    """Return *geom* with every interior ring dropped.

    A Polygon is rebuilt from its exterior ring alone; a MultiPolygon is
    rebuilt part by part in the same way. Any other geometry type is
    returned as-is.
    """
    kind = geom.geom_type
    if kind == "MultiPolygon":
        shells = [Polygon(part.exterior) for part in geom.geoms]
        return MultiPolygon(shells)
    if kind == "Polygon":
        return Polygon(geom.exterior)
    return geom
|
||||
|
||||
|
||||
def _largest_polygon(geom):
    """Return the largest-area member of a MultiPolygon.

    Any geometry whose ``geom_type`` is not ``"MultiPolygon"`` is returned
    unchanged. A MultiPolygon with no members is also returned unchanged,
    because ``max`` over an empty sequence would otherwise raise
    ``ValueError``.
    """
    if geom.geom_type != "MultiPolygon":
        return geom
    # default=geom covers the empty-MultiPolygon case without a separate check.
    return max(geom.geoms, key=lambda part: part.area, default=geom)
|
||||
|
||||
|
||||
def merge_fragments(
|
||||
all_fragments: list[tuple[str, Polygon | MultiPolygon]],
|
||||
greenspace_tree=None,
|
||||
greenspace_geoms=None,
|
||||
) -> dict[str, Polygon | MultiPolygon]:
|
||||
"""Merge cross-OA fragments for postcodes spanning multiple OAs."""
|
||||
"""Merge cross-OA fragments for postcodes spanning multiple OAs.
|
||||
|
||||
Args:
|
||||
all_fragments: List of (postcode, geometry) pairs.
|
||||
greenspace_tree: Optional STRtree of park/water polygons.
|
||||
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
|
||||
"""
|
||||
by_postcode: dict[str, list] = defaultdict(list)
|
||||
for pc, geom in all_fragments:
|
||||
by_postcode[pc].append(geom)
|
||||
|
|
@ -80,13 +104,25 @@ def merge_fragments(
|
|||
combined = make_valid(combined)
|
||||
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
|
||||
if combined.geom_type == "MultiPolygon":
|
||||
combined = combined.buffer(1.0).buffer(-1.0)
|
||||
combined = combined.buffer(5.0).buffer(-5.0)
|
||||
if not combined.is_valid:
|
||||
combined = make_valid(combined)
|
||||
# Postcodes are contiguous delivery routes — keep only the largest
|
||||
# polygon; small detached fragments are algorithm artifacts
|
||||
if combined.geom_type == "MultiPolygon":
|
||||
combined = max(combined.geoms, key=lambda g: g.area)
|
||||
combined = _largest_polygon(combined)
|
||||
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
||||
combined = _fill_holes(combined)
|
||||
# Subtract parks/water if provided
|
||||
if greenspace_tree is not None and greenspace_geoms is not None:
|
||||
from .greenspace import subtract_greenspace
|
||||
|
||||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _largest_polygon(combined)
|
||||
combined = _fill_holes(combined)
|
||||
# Revert if subtraction + fragment selection lost >90% of area
|
||||
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
|
||||
combined = pre_green
|
||||
merged[pc] = combined
|
||||
return merged
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,8 @@ import pytest
|
|||
from shapely.geometry import MultiPolygon, Polygon, box
|
||||
|
||||
from .oa_boundaries import parse_gpkg_geometry
|
||||
from .output import merge_fragments, to_wgs84_geojson
|
||||
from .greenspace import subtract_greenspace
|
||||
from .output import _fill_holes, merge_fragments, to_wgs84_geojson
|
||||
from .process_oa import _extract_polygonal, process_oa
|
||||
from .uprn import get_oa_uprns, load_uprns
|
||||
from .voronoi import _equal_split_fallback, compute_voronoi_regions
|
||||
|
|
@ -426,3 +427,143 @@ class TestParseGpkgGeometry:
|
|||
blob = bytes([0x47, 0x50, 0x00, 0b00001010]) + b"\x00" * 100
|
||||
with pytest.raises(ValueError, match="Unknown GeoPackage envelope type 5"):
|
||||
parse_gpkg_geometry(blob)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _fill_holes removes interior rings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFillHoles:
    """Checks that _fill_holes returns geometries without interior rings."""

    def test_polygon_with_hole(self):
        """A square with one square hole comes back solid, with the full outer area."""
        shell = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
        cutout = [(30, 30), (70, 30), (70, 70), (30, 70), (30, 30)]
        holed = Polygon(shell, [cutout])
        # Sanity check: the fixture really does start with one hole.
        assert len(list(holed.interiors)) == 1

        filled = _fill_holes(holed)

        assert filled.geom_type == "Polygon"
        assert len(list(filled.interiors)) == 0
        assert filled.area == pytest.approx(Polygon(shell).area)

    def test_multipolygon_with_holes(self):
        """Every part of a two-part MultiPolygon loses its hole."""
        first = Polygon(
            [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)],
            [[(10, 10), (20, 10), (20, 20), (10, 20), (10, 10)]],
        )
        second = Polygon(
            [(60, 60), (110, 60), (110, 110), (60, 110), (60, 60)],
            [[(70, 70), (80, 70), (80, 80), (70, 80), (70, 70)]],
        )

        filled = _fill_holes(MultiPolygon([first, second]))

        assert filled.geom_type == "MultiPolygon"
        for part in filled.geoms:
            assert len(list(part.interiors)) == 0

    def test_polygon_without_hole_unchanged(self):
        """A hole-free box keeps its area."""
        solid = box(0, 0, 100, 100)
        assert _fill_holes(solid).area == pytest.approx(solid.area)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Improved merge with 5m buffer closes 3m gaps
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMergeImprovedBuffer:
    """merge_fragments should bridge a 3m gap and return hole-free polygons."""

    def test_3m_gap_merged(self):
        """Two same-postcode boxes 3m apart come back as a single Polygon."""
        fragments = [
            ("AA1 1AA", box(0, 0, 50, 100)),
            ("AA1 1AA", box(53, 0, 100, 100)),  # leaves a 3m gap at x=50..53
        ]

        result = merge_fragments(fragments)

        assert "AA1 1AA" in result
        geom = result["AA1 1AA"]
        assert geom.geom_type == "Polygon", (
            f"Expected single Polygon after merging 3m gap, got {geom.geom_type}"
        )

    def test_holes_removed_after_merge(self):
        """A ring plus its matching centre piece merges to a polygon with no holes."""
        square = box(0, 0, 100, 100)
        core = box(30, 30, 70, 70)
        # The ring and the core it was cut from are passed as separate fragments.
        donut = square.difference(core)

        result = merge_fragments([("AA1 1AA", donut), ("AA1 1AA", core)])

        assert "AA1 1AA" in result
        geom = result["AA1 1AA"]
        assert len(list(geom.interiors)) == 0, "Merged polygon should have no holes"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# subtract_greenspace
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSubtractGreenspace:
    """Checks how subtract_greenspace trims a 100x100 postcode square against one park."""

    @staticmethod
    def _trim(park):
        """Run subtract_greenspace on the 100x100 square against a single *park*.

        Returns the ``(postcode, result)`` pair so tests can compare areas.
        """
        from shapely.strtree import STRtree

        square = box(0, 0, 100, 100)  # 10000 sqm
        parks = [park]
        return square, subtract_greenspace(square, STRtree(parks), parks)

    def test_park_subtracted(self):
        """A park over the right-hand 40% leaves roughly 6000 sqm."""
        _, trimmed = self._trim(box(60, 0, 100, 100))  # 4000 sqm overlap
        assert trimmed.area == pytest.approx(6000, rel=0.01)

    def test_no_greenspace_unchanged(self):
        """A park far from the postcode leaves the area untouched."""
        square, trimmed = self._trim(box(200, 200, 300, 300))
        assert trimmed.area == pytest.approx(square.area)

    def test_full_overlap_preserves_postcode(self):
        """A park covering the whole postcode must not erase it."""
        square, trimmed = self._trim(box(-10, -10, 110, 110))
        # The difference would be empty, so the original is expected back.
        assert trimmed.area == pytest.approx(square.area)

    def test_over_90pct_removal_preserves_postcode(self):
        """A 95% overlap exceeds the cap, so the original area is kept."""
        square, trimmed = self._trim(box(5, 0, 100, 100))  # 9500 sqm overlap
        assert trimmed.area == pytest.approx(square.area)

    def test_under_90pct_removal_subtracts(self):
        """An 80% overlap is under the cap, so about 2000 sqm remains."""
        _, trimmed = self._trim(box(20, 0, 100, 100))  # 8000 sqm overlap
        assert trimmed.area == pytest.approx(2000, rel=0.01)
|
||||
|
|
|
|||
|
|
@ -36,9 +36,10 @@ def main():
|
|||
df = pl.read_parquet(args.input)
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns")
|
||||
|
||||
# Drop existing estimated price column if re-running
|
||||
if "Estimated current price" in df.columns:
|
||||
df = df.drop("Estimated current price")
|
||||
# Drop existing estimated columns if re-running
|
||||
for col in ["Estimated current price", "Est. price per sqm"]:
|
||||
if col in df.columns:
|
||||
df = df.drop(col)
|
||||
|
||||
# Derive helper columns for the join
|
||||
has_price = (
|
||||
|
|
@ -126,6 +127,14 @@ def main():
|
|||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area exist
|
||||
df = df.with_columns(
|
||||
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
.alias("Est. price per sqm"),
|
||||
)
|
||||
|
||||
n_adjusted = df.filter(
|
||||
has_price & pl.col("_log_index_sale").is_not_null()
|
||||
).height
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue