More
This commit is contained in:
parent
1f68ca0512
commit
3599803589
43 changed files with 3578 additions and 262 deletions
0
pipeline/download/__init__.py
Normal file
0
pipeline/download/__init__.py
Normal file
|
|
@ -129,15 +129,21 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
with tempfile.TemporaryDirectory() as cache_dir:
|
||||
pbf_file = Path(cache_dir) / "great-britain-latest.osm.pbf"
|
||||
|
||||
if not pbf_file.exists():
|
||||
download_pbf(pbf_file)
|
||||
if args.pbf and args.pbf.exists():
|
||||
pbf_file = args.pbf
|
||||
print(f"Using provided PBF file at {pbf_file}")
|
||||
else:
|
||||
print(f"Using cached PBF file at {pbf_file}")
|
||||
pbf_file = Path(cache_dir) / "great-britain-latest.osm.pbf"
|
||||
if not pbf_file.exists():
|
||||
download_pbf(pbf_file)
|
||||
else:
|
||||
print(f"Using cached PBF file at {pbf_file}")
|
||||
|
||||
print(f"Tag keys: {POI_TAG_KEYS}")
|
||||
|
||||
|
|
|
|||
354
pipeline/download/transit_network.py
Normal file
354
pipeline/download/transit_network.py
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
"""Download and prepare transit network data for R5 routing.
|
||||
|
||||
Downloads:
|
||||
- England OSM PBF from Geofabrik (~1.5GB)
|
||||
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
|
||||
|
||||
Then processes for R5 compatibility:
|
||||
- Cleans GTFS (fixes stop_times >72h, feed_info year >2100)
|
||||
- Crops OSM PBF to London bounding box via osmium
|
||||
- Crops GTFS to London bounding box (keeps only London-touching trips)
|
||||
|
||||
Requires: osmium-tool (apt install osmium-tool)
|
||||
|
||||
Output directory: property-data/transit/
|
||||
Final files: london.osm.pbf + bods_gtfs.zip (London-only, R5-ready)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import io
|
||||
import os
|
||||
import subprocess
|
||||
import urllib.request
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
# Geofabrik extract for England; cropped to LONDON_BBOX later by
# crop_osm_to_london().
ENGLAND_PBF_URL = (
    "https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf"
)

# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"

# Sent as the User-Agent header on every request made by _download_http().
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"

# London + Home Counties bounding box (~50km buffer around Greater London)
# Values are WGS84 degrees; crop_osm_to_london() passes them to osmium as
# min_lon,min_lat,max_lon,max_lat.
# NOTE(review): the box spans 0.65 deg of latitude and 1.0 deg of longitude,
# i.e. roughly 70 km in each direction, so the "~50km buffer" wording above
# looks overstated — confirm the intended extent.
LONDON_BBOX = {"min_lat": 51.2, "max_lat": 51.85, "min_lon": -0.65, "max_lon": 0.35}
|
||||
|
||||
|
||||
def _download_http(url: str, dest: Path, *, desc: str) -> None:
    """Stream-download a URL to a file with a progress bar.

    The payload is written to ``<dest>.tmp`` and only moved to ``dest`` once
    the transfer has completed, because callers treat an existing ``dest`` as
    "already downloaded" and must never see a truncated file there.

    Args:
        url: Address to fetch; the request carries the module's USER_AGENT.
        dest: Final path of the downloaded file (parent dirs are created).
        desc: Label shown on the tqdm progress bar.

    Raises:
        OSError: If the server announced a Content-Length and fewer/more
            bytes were received (e.g. the connection dropped mid-transfer).
        urllib.error.URLError: Propagated from ``urlopen`` on request failure.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".tmp")

    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})

    try:
        with (
            tqdm(unit="B", unit_scale=True, desc=desc) as bar,
            urllib.request.urlopen(req) as resp,
            open(tmp, "wb") as f,
        ):
            length = resp.headers.get("Content-Length")
            expected = int(length) if length else None
            if expected is not None:
                bar.total = expected

            received = 0
            while chunk := resp.read(1 << 20):
                f.write(chunk)
                received += len(chunk)
                bar.update(len(chunk))

        # A chunked read loop simply ends when the peer closes the socket, so
        # compare against the announced size to catch short transfers.
        if expected is not None and received != expected:
            raise OSError(
                f"Incomplete download of {url}: got {received} of {expected} bytes"
            )

        # replace() also succeeds when dest already exists on every platform.
        tmp.replace(dest)
    except BaseException:
        # Do not leave a partial .tmp behind (also on Ctrl-C).
        tmp.unlink(missing_ok=True)
        raise

    print(f" Saved to {dest}")
|
||||
|
||||
|
||||
def download_osm_pbf(output_dir: Path) -> Path:
    """Ensure the England OSM PBF extract is present in ``output_dir``.

    Returns the path ``output_dir / "england.osm.pbf"``. The file is fetched
    from ENGLAND_PBF_URL only when it does not exist yet.
    """
    target = output_dir / "england.osm.pbf"

    if not target.exists():
        print("Downloading England OSM PBF (~1.5 GB)...")
        _download_http(ENGLAND_PBF_URL, target, desc="england.osm.pbf")
    else:
        # Re-runs reuse whatever is already on disk.
        print(f"OSM PBF already exists: {target}")

    return target
|
||||
|
||||
|
||||
def download_bods_gtfs(output_dir: Path) -> Path:
    """Ensure the raw BODS GTFS archive is present in ``output_dir``.

    Returns the path ``output_dir / "bods_gtfs_raw.zip"``. The archive is
    fetched from BODS_GTFS_URL only when it does not exist yet.
    """
    target = output_dir / "bods_gtfs_raw.zip"

    if not target.exists():
        print("Downloading BODS GTFS (~1.5 GB)...")
        _download_http(BODS_GTFS_URL, target, desc="bods_gtfs_raw.zip")
    else:
        # Re-runs reuse whatever is already on disk.
        print(f"BODS GTFS already exists: {target}")

    return target
|
||||
|
||||
|
||||
def _split_csv_row(text: str) -> list[str]:
    """Split one CSV record into fields, honouring quoted commas."""
    # Plain split is much cheaper and is correct whenever no quoting is used;
    # stop_times.txt can have a very large number of rows.
    if '"' not in text:
        return text.split(",")
    return next(csv.reader([text]), [])


def _hour_exceeds(field: str, limit: int) -> bool:
    """Return True when an ``H:MM:SS`` style field has an hour above ``limit``."""
    hour, sep, _rest = field.strip().strip('"').partition(":")
    if not sep:
        return False
    try:
        return int(hour) > limit
    except ValueError:
        # Unparseable times are left for downstream validation, not dropped.
        return False


def _filter_stop_times(src, out, max_hour: int = 72) -> int:
    """Copy stop_times rows from ``src`` to ``out`` (binary streams).

    Rows whose arrival or departure hour is above ``max_hour`` and blank
    lines are not copied; kept rows are written byte-for-byte.

    Returns:
        Number of rows dropped because of an out-of-range hour.
    """
    header = src.readline()
    out.write(header)
    # utf-8-sig: a leading BOM must not become part of the first column name.
    cols = [c.strip() for c in _split_csv_row(header.decode("utf-8-sig").strip())]
    time_idxs = [
        cols.index(name) for name in ("arrival_time", "departure_time") if name in cols
    ]

    dropped = 0
    for line in src:
        text = line.decode("utf-8", errors="replace").strip()
        if not text:
            continue
        parts = _split_csv_row(text)
        if any(i < len(parts) and _hour_exceeds(parts[i], max_hour) for i in time_idxs):
            dropped += 1
        else:
            out.write(line)
    return dropped


def _cap_feed_info_dates(data: str) -> str:
    """Return feed_info.txt text with far-future ``*end_date*`` values capped.

    Any 8-digit value in a column whose name contains "end_date" and whose
    year is above 2100 is replaced by ``20991231``. The table is re-emitted
    with "\\n" line endings.
    """
    # newline="" lets the csv module handle \r\n / \n itself, so a CRLF file
    # does not leave a stray "\r" glued to the last column.
    rows = [r for r in csv.reader(io.StringIO(data, newline="")) if r]
    if not rows:
        return data

    header = rows[0]
    date_cols = [i for i, name in enumerate(header) if "end_date" in name.lower()]
    for row in rows[1:]:
        for i in date_cols:
            if i >= len(row):
                continue
            value = row[i].strip()
            if len(value) != 8:
                continue
            try:
                year = int(value[:4])
            except ValueError:
                continue  # not a YYYYMMDD date; leave untouched
            # NOTE(review): the threshold is "> 2100" while the replacement is
            # 2099, so year 2100 itself is left as-is — confirm R5's limit.
            if year > 2100:
                row[i] = "20991231"
                print(f" feed_info: capped end_date {value} → 20991231")

    out = io.StringIO()
    csv.writer(out, lineterminator="\n").writerows(rows)
    return out.getvalue()


def clean_gtfs(src: Path, dst: Path) -> None:
    """Fix R5-incompatible entries in GTFS.

    - Removes stop_times with arrival/departure hour > 72
    - Caps feed_info end_date (year > 2100) to 20991231

    All other archive members are copied unchanged. Nothing is done when
    ``dst`` already exists. The archive is built under ``<dst>.tmp`` and
    moved into place at the end, so an interrupted run cannot leave a
    truncated ``dst`` that a later run would mistake for a finished one.

    Args:
        src: Raw GTFS zip to read.
        dst: Cleaned GTFS zip to create.
    """
    import tempfile

    if dst.exists():
        print(f"Cleaned GTFS already exists: {dst}")
        return

    print("Cleaning GTFS for R5 compatibility...")
    partial = dst.with_suffix(dst.suffix + ".tmp")
    try:
        with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
            partial, "w", zipfile.ZIP_DEFLATED
        ) as zout:
            for info in zin.infolist():
                if info.filename == "stop_times.txt":
                    # Filter through a scratch file on disk instead of holding
                    # the table in memory.
                    scratch = tempfile.NamedTemporaryFile(
                        mode="wb", delete=False, suffix=".txt"
                    )
                    try:
                        with scratch, zin.open(info) as f:
                            dropped = _filter_stop_times(f, scratch)
                        print(f" stop_times: dropped {dropped} rows with hours > 72")
                        zout.write(scratch.name, "stop_times.txt")
                    finally:
                        os.unlink(scratch.name)

                elif info.filename == "feed_info.txt":
                    data = zin.read(info).decode("utf-8-sig")
                    zout.writestr("feed_info.txt", _cap_feed_info_dates(data))

                else:
                    zout.writestr(info, zin.read(info))

        partial.replace(dst)
    except BaseException:
        partial.unlink(missing_ok=True)
        raise

    print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def crop_osm_to_london(src: Path, dst: Path) -> None:
    """Extract London bounding box from England OSM PBF using osmium.

    Runs ``osmium extract --bbox=... src -o dst --overwrite`` with the
    LONDON_BBOX extent. Nothing is done when ``dst`` already exists.

    Args:
        src: Source OSM PBF file.
        dst: Output path for the cropped PBF.

    Raises:
        subprocess.CalledProcessError: If osmium exits with a non-zero status.
        FileNotFoundError: If the ``osmium`` executable is not installed.
    """
    if dst.exists():
        print(f"London OSM PBF already exists: {dst}")
        return

    bbox = LONDON_BBOX
    bbox_str = f"{bbox['min_lon']},{bbox['min_lat']},{bbox['max_lon']},{bbox['max_lat']}"

    print(f"Cropping OSM PBF to London bbox ({bbox_str})...")
    try:
        subprocess.run(
            ["osmium", "extract", f"--bbox={bbox_str}", str(src), "-o", str(dst), "--overwrite"],
            check=True,
        )
    except BaseException:
        # A failed or interrupted run may leave a partial file; remove it so
        # the dst.exists() shortcut above does not accept it next time.
        dst.unlink(missing_ok=True)
        raise

    size_mb = dst.stat().st_size / (1024 * 1024)
    print(f" Saved to {dst} ({size_mb:.0f} MB)")
|
||||
|
||||
|
||||
def _open_gtfs_table(zin: zipfile.ZipFile, name: str) -> io.TextIOWrapper:
    """Open a zip member as text for the csv module.

    Decodes as UTF-8 (dropping a BOM if present) instead of the locale
    default, and disables newline translation as the csv module requires.
    Raises KeyError when the member does not exist.
    """
    return io.TextIOWrapper(zin.open(name), encoding="utf-8-sig", newline="")


def _has_member(zin: zipfile.ZipFile, name: str) -> bool:
    """Return True when the archive contains a member called ``name``."""
    try:
        zin.getinfo(name)
    except KeyError:
        return False
    return True


def _copy_filtered_table(
    zin: zipfile.ZipFile,
    zout: zipfile.ZipFile,
    name: str,
    key: str,
    wanted: set,
    *,
    required: bool = False,
) -> None:
    """Copy the rows of table ``name`` whose ``key`` column is in ``wanted``.

    A missing table is skipped with a message unless ``required`` is set, in
    which case the KeyError from the archive lookup propagates.
    """
    if not required and not _has_member(zin, name):
        print(f" {name}: not present in source feed, skipping")
        return
    with _open_gtfs_table(zin, name) as f:
        reader = csv.DictReader(f)
        kept = [row for row in reader if row[key] in wanted]
        fieldnames = reader.fieldnames or []
    _write_csv(zout, name, fieldnames, kept)


def crop_gtfs_to_london(src: Path, dst: Path) -> None:
    """Crop GTFS to trips touching the London bounding box.

    A trip is kept when at least one of its stop_times references a stop
    inside LONDON_BBOX; all stop_times of kept trips are retained, together
    with the stops, routes, calendar entries, shapes and frequencies they
    reference. agency.txt (and feed_info.txt when present) are copied as-is.
    Nothing is done when ``dst`` already exists.

    The archive is built under ``<dst>.tmp`` and moved into place at the end
    so an interrupted run cannot leave a truncated ``dst`` behind.

    Args:
        src: Source GTFS zip.
        dst: Output path for the cropped GTFS zip.

    Raises:
        KeyError: If stops.txt, stop_times.txt, trips.txt, routes.txt or
            agency.txt is missing from ``src``.
    """
    if dst.exists():
        print(f"London GTFS already exists: {dst}")
        return

    bbox = LONDON_BBOX

    print("Cropping GTFS to London area...")

    partial = dst.with_suffix(dst.suffix + ".tmp")
    try:
        with zipfile.ZipFile(src, "r") as zin:
            # Step 1: Find stops in bbox
            print(" Finding stops in bbox...")
            with _open_gtfs_table(zin, "stops.txt") as f:
                reader = csv.DictReader(f)
                all_stops = list(reader)
                stops_fieldnames = reader.fieldnames or []
            stops_in_bbox = set()
            for row in all_stops:
                try:
                    lat = float(row["stop_lat"])
                    lon = float(row["stop_lon"])
                except (TypeError, ValueError):
                    # Blank/unparseable coordinates cannot be placed in the
                    # bbox; skip the stop rather than abort.
                    continue
                if (
                    bbox["min_lat"] <= lat <= bbox["max_lat"]
                    and bbox["min_lon"] <= lon <= bbox["max_lon"]
                ):
                    stops_in_bbox.add(row["stop_id"])
            print(f" {len(stops_in_bbox):,} / {len(all_stops):,} stops in bbox")

            # Step 2: Find trips touching these stops
            print(" Finding trips touching London stops...")
            with _open_gtfs_table(zin, "stop_times.txt") as f:
                reader = csv.DictReader(f)
                st_fieldnames = reader.fieldnames or []
                trips_in_bbox = {
                    row["trip_id"] for row in reader if row["stop_id"] in stops_in_bbox
                }
            print(f" {len(trips_in_bbox):,} trips touch London")

            # Step 3: Collect all stop_times for those trips (second pass,
            # because a trip's membership is only known after the first one)
            print(" Collecting stop_times for London trips...")
            with _open_gtfs_table(zin, "stop_times.txt") as f:
                stop_times_kept = [
                    row for row in csv.DictReader(f) if row["trip_id"] in trips_in_bbox
                ]
            stops_needed = {row["stop_id"] for row in stop_times_kept}
            # Keep parent stations referenced by retained stops so that their
            # parent_station values still point at an existing row.
            stops_needed |= {
                s["parent_station"]
                for s in all_stops
                if s["stop_id"] in stops_needed and s.get("parent_station")
            }
            print(f" {len(stop_times_kept):,} stop_times kept")

            # Step 4: Read trips and find needed routes/services/shapes
            print(" Reading trips...")
            with _open_gtfs_table(zin, "trips.txt") as f:
                reader = csv.DictReader(f)
                trips_fieldnames = reader.fieldnames or []
                trips_kept = [t for t in reader if t["trip_id"] in trips_in_bbox]
            routes_needed = {t["route_id"] for t in trips_kept}
            services_needed = {t["service_id"] for t in trips_kept}
            shapes_needed = {t["shape_id"] for t in trips_kept if t.get("shape_id")}

            # Step 5: Write cropped GTFS
            print(" Writing cropped GTFS...")
            with zipfile.ZipFile(partial, "w", zipfile.ZIP_DEFLATED) as zout:
                stops_kept = [s for s in all_stops if s["stop_id"] in stops_needed]
                _write_csv(zout, "stops.txt", stops_fieldnames, stops_kept)
                _write_csv(zout, "stop_times.txt", st_fieldnames, stop_times_kept)
                _write_csv(zout, "trips.txt", trips_fieldnames, trips_kept)

                _copy_filtered_table(
                    zin, zout, "routes.txt", "route_id", routes_needed, required=True
                )

                # agency (copy all)
                zout.writestr("agency.txt", zin.read("agency.txt"))

                _copy_filtered_table(
                    zin, zout, "calendar.txt", "service_id", services_needed
                )
                _copy_filtered_table(
                    zin, zout, "calendar_dates.txt", "service_id", services_needed
                )

                # shapes: can be very large; the matching rows are held in
                # memory while the member is written.
                print(" Streaming shapes.txt...")
                _copy_filtered_table(zin, zout, "shapes.txt", "shape_id", shapes_needed)

                # frequencies: keep only rows for trips that survived the crop
                _copy_filtered_table(
                    zin, zout, "frequencies.txt", "trip_id", trips_in_bbox
                )

                # feed_info (copy when present)
                if _has_member(zin, "feed_info.txt"):
                    zout.writestr("feed_info.txt", zin.read("feed_info.txt"))

        partial.replace(dst)
    except BaseException:
        partial.unlink(missing_ok=True)
        raise

    size_mb = dst.stat().st_size / (1024 * 1024)
    print(f" Saved to {dst} ({size_mb:.0f} MB)")
|
||||
|
||||
|
||||
def _write_csv(
|
||||
zout: zipfile.ZipFile, name: str, fieldnames: list[str], rows: list[dict]
|
||||
) -> None:
|
||||
buf = io.StringIO()
|
||||
w = csv.DictWriter(buf, fieldnames=fieldnames)
|
||||
w.writeheader()
|
||||
w.writerows(rows)
|
||||
zout.writestr(name, buf.getvalue())
|
||||
print(f" {name}: {len(rows):,} rows")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point.

    Parses ``--output``, downloads the raw OSM and GTFS inputs into
    ``<output>/raw``, cleans the GTFS, crops both datasets to London and
    finally lists the files left in the output directory with their sizes.
    """
    cli = argparse.ArgumentParser(
        description="Download and prepare transit network data for R5 routing engine"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for transit data",
    )
    out_root: Path = cli.parse_args().output

    raw_root = out_root / "raw"
    raw_root.mkdir(parents=True, exist_ok=True)

    # Raw inputs live under raw/ so the summary below only lists final files.
    osm_england = download_osm_pbf(raw_root)
    gtfs_raw = download_bods_gtfs(raw_root)

    # Clean GTFS (fix R5 incompatibilities)
    gtfs_clean = raw_root / "bods_gtfs_clean.zip"
    clean_gtfs(gtfs_raw, gtfs_clean)

    # Crop to London area for R5 (full England requires >30GB RAM)
    crop_osm_to_london(osm_england, out_root / "london.osm.pbf")
    crop_gtfs_to_london(gtfs_clean, out_root / "bods_gtfs.zip")

    # Summary: regular, non-hidden files directly inside the output directory
    print()
    print("Transit data ready for R5:")
    for entry in sorted(out_root.iterdir()):
        if not (entry.is_dir() or entry.name.startswith(".")):
            megabytes = entry.stat().st_size / (1024 * 1024)
            print(f" {entry.name}: {megabytes:.1f} MB")
|
||||
|
||||
|
||||
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -32,6 +32,12 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--greenspace",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Greenspace/water parquet for boundary trimming (optional)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Phase 1: Load all data
|
||||
|
|
@ -115,7 +121,20 @@ def main() -> None:
|
|||
print("Phase 4: Merging fragments and writing GeoJSON")
|
||||
print("=" * 60)
|
||||
|
||||
merged = merge_fragments(all_fragments)
|
||||
greenspace_tree = None
|
||||
greenspace_geoms = None
|
||||
if args.greenspace and args.greenspace.exists():
|
||||
from .greenspace import load_greenspace
|
||||
|
||||
print(f" Loading greenspace/water from {args.greenspace}...")
|
||||
greenspace_tree, greenspace_geoms = load_greenspace(args.greenspace)
|
||||
print(f" Loaded {len(greenspace_geoms)} greenspace/water polygons")
|
||||
|
||||
merged = merge_fragments(
|
||||
all_fragments,
|
||||
greenspace_tree=greenspace_tree,
|
||||
greenspace_geoms=greenspace_geoms,
|
||||
)
|
||||
print(f" Merged into {len(merged)} unique postcodes")
|
||||
|
||||
file_count = write_district_geojson(merged, args.output)
|
||||
|
|
|
|||
65
pipeline/transform/postcode_boundaries/greenspace.py
Normal file
65
pipeline/transform/postcode_boundaries/greenspace.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
"""Load greenspace/water polygons and subtract them from postcode boundaries."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from shapely import wkb
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from shapely.ops import unary_union
|
||||
from shapely.strtree import STRtree
|
||||
|
||||
|
||||
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Read greenspace geometries from a parquet file and index them.

    Every value of the ``geometry`` column is decoded with
    ``shapely.wkb.loads`` and the resulting geometries are placed in an
    STRtree.

    Returns:
        (tree, geoms): the spatial index and the list of geometries it was
        built from, in the same order.
    """
    frame = pl.read_parquet(path)

    shapes = []
    for blob in frame["geometry"].to_list():
        shapes.append(wkb.loads(blob))

    return STRtree(shapes), shapes
|
||||
|
||||
|
||||
MAX_REMOVAL_FRACTION = 0.9 # Keep original if >90% would be removed
|
||||
|
||||
|
||||
def subtract_greenspace(
    postcode_geom: Polygon | MultiPolygon,
    tree: STRtree,
    geoms: list,
) -> Polygon | MultiPolygon:
    """Cut overlapping park/water polygons out of a postcode geometry.

    Candidates come from the STRtree (``tree.query`` results are used as
    indices into ``geoms``) and are narrowed to geometries that really
    intersect the postcode. Their union is then subtracted.

    The original geometry is returned untouched when nothing intersects,
    when the difference is empty, or when less than
    ``1 - MAX_REMOVAL_FRACTION`` of the original area would remain (the
    postcode genuinely covers that land, e.g. churchyards, riverside
    addresses).
    """
    hits = tree.query(postcode_geom)
    if len(hits) == 0:
        return postcode_geom

    # The tree query alone is not treated as proof of overlap; confirm with
    # an exact intersection test.
    overlapping = [geoms[i] for i in hits if geoms[i].intersects(postcode_geom)]
    if not overlapping:
        return postcode_geom

    trimmed = postcode_geom.difference(unary_union(overlapping))
    if trimmed.is_empty:
        return postcode_geom

    # Guard against over-trimming postcodes that mostly are green/water.
    area_before = postcode_geom.area
    if area_before > 0:
        remaining_share = trimmed.area / area_before
        if remaining_share < (1 - MAX_REMOVAL_FRACTION):
            return postcode_geom

    return trimmed
|
||||
|
|
@ -63,10 +63,34 @@ def to_wgs84_geojson(
|
|||
}
|
||||
|
||||
|
||||
def _fill_holes(geom):
|
||||
"""Remove all interior rings (holes) from a polygon or multipolygon."""
|
||||
if geom.geom_type == "Polygon":
|
||||
return Polygon(geom.exterior)
|
||||
elif geom.geom_type == "MultiPolygon":
|
||||
return MultiPolygon([Polygon(p.exterior) for p in geom.geoms])
|
||||
return geom
|
||||
|
||||
|
||||
def _largest_polygon(geom):
|
||||
"""Extract the largest polygon from a MultiPolygon."""
|
||||
if geom.geom_type == "MultiPolygon":
|
||||
return max(geom.geoms, key=lambda g: g.area)
|
||||
return geom
|
||||
|
||||
|
||||
def merge_fragments(
|
||||
all_fragments: list[tuple[str, Polygon | MultiPolygon]],
|
||||
greenspace_tree=None,
|
||||
greenspace_geoms=None,
|
||||
) -> dict[str, Polygon | MultiPolygon]:
|
||||
"""Merge cross-OA fragments for postcodes spanning multiple OAs."""
|
||||
"""Merge cross-OA fragments for postcodes spanning multiple OAs.
|
||||
|
||||
Args:
|
||||
all_fragments: List of (postcode, geometry) pairs.
|
||||
greenspace_tree: Optional STRtree of park/water polygons.
|
||||
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
|
||||
"""
|
||||
by_postcode: dict[str, list] = defaultdict(list)
|
||||
for pc, geom in all_fragments:
|
||||
by_postcode[pc].append(geom)
|
||||
|
|
@ -80,13 +104,25 @@ def merge_fragments(
|
|||
combined = make_valid(combined)
|
||||
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
|
||||
if combined.geom_type == "MultiPolygon":
|
||||
combined = combined.buffer(1.0).buffer(-1.0)
|
||||
combined = combined.buffer(5.0).buffer(-5.0)
|
||||
if not combined.is_valid:
|
||||
combined = make_valid(combined)
|
||||
# Postcodes are contiguous delivery routes — keep only the largest
|
||||
# polygon; small detached fragments are algorithm artifacts
|
||||
if combined.geom_type == "MultiPolygon":
|
||||
combined = max(combined.geoms, key=lambda g: g.area)
|
||||
combined = _largest_polygon(combined)
|
||||
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
||||
combined = _fill_holes(combined)
|
||||
# Subtract parks/water if provided
|
||||
if greenspace_tree is not None and greenspace_geoms is not None:
|
||||
from .greenspace import subtract_greenspace
|
||||
|
||||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _largest_polygon(combined)
|
||||
combined = _fill_holes(combined)
|
||||
# Revert if subtraction + fragment selection lost >90% of area
|
||||
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
|
||||
combined = pre_green
|
||||
merged[pc] = combined
|
||||
return merged
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,8 @@ import pytest
|
|||
from shapely.geometry import MultiPolygon, Polygon, box
|
||||
|
||||
from .oa_boundaries import parse_gpkg_geometry
|
||||
from .output import merge_fragments, to_wgs84_geojson
|
||||
from .greenspace import subtract_greenspace
|
||||
from .output import _fill_holes, merge_fragments, to_wgs84_geojson
|
||||
from .process_oa import _extract_polygonal, process_oa
|
||||
from .uprn import get_oa_uprns, load_uprns
|
||||
from .voronoi import _equal_split_fallback, compute_voronoi_regions
|
||||
|
|
@ -426,3 +427,143 @@ class TestParseGpkgGeometry:
|
|||
blob = bytes([0x47, 0x50, 0x00, 0b00001010]) + b"\x00" * 100
|
||||
with pytest.raises(ValueError, match="Unknown GeoPackage envelope type 5"):
|
||||
parse_gpkg_geometry(blob)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _fill_holes removes interior rings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFillHoles:
    """_fill_holes must strip every interior ring from (multi)polygons."""

    def test_polygon_with_hole(self):
        """A square with one square hole becomes a solid square."""
        shell = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
        void = [(30, 30), (70, 30), (70, 70), (30, 70), (30, 30)]
        donut = Polygon(shell, [void])
        assert len(list(donut.interiors)) == 1

        filled = _fill_holes(donut)

        assert filled.geom_type == "Polygon"
        assert len(list(filled.interiors)) == 0
        assert filled.area == pytest.approx(Polygon(shell).area)

    def test_multipolygon_with_holes(self):
        """Every part of a MultiPolygon loses its holes."""
        shell_a = [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)]
        void_a = [(10, 10), (20, 10), (20, 20), (10, 20), (10, 10)]
        shell_b = [(60, 60), (110, 60), (110, 110), (60, 110), (60, 60)]
        void_b = [(70, 70), (80, 70), (80, 80), (70, 80), (70, 70)]
        parts = [Polygon(shell_a, [void_a]), Polygon(shell_b, [void_b])]

        filled = _fill_holes(MultiPolygon(parts))

        assert filled.geom_type == "MultiPolygon"
        for part in filled.geoms:
            assert len(list(part.interiors)) == 0

    def test_polygon_without_hole_unchanged(self):
        """A hole-free polygon keeps its area."""
        solid = box(0, 0, 100, 100)
        assert _fill_holes(solid).area == pytest.approx(solid.area)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Improved merge with 5m buffer closes 3m gaps
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMergeImprovedBuffer:
    """The 5m buffer should close gaps that the old 1m buffer could not."""

    def test_3m_gap_merged(self):
        """Two same-postcode boxes 3m apart come back as one Polygon."""
        fragments = [
            ("AA1 1AA", box(0, 0, 50, 100)),
            ("AA1 1AA", box(53, 0, 100, 100)),  # 3m gap at x=50..53
        ]

        merged = merge_fragments(fragments)

        assert "AA1 1AA" in merged
        shape = merged["AA1 1AA"]
        assert shape.geom_type == "Polygon", (
            f"Expected single Polygon after merging 3m gap, got {shape.geom_type}"
        )

    def test_holes_removed_after_merge(self):
        """A ring fragment plus its inner piece merges without holes."""
        core = box(30, 30, 70, 70)
        # Donut-like fragment: the full square minus the core
        donut = box(0, 0, 100, 100).difference(core)

        merged = merge_fragments([("AA1 1AA", donut), ("AA1 1AA", core)])

        assert "AA1 1AA" in merged
        shape = merged["AA1 1AA"]
        assert len(list(shape.interiors)) == 0, "Merged polygon should have no holes"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# subtract_greenspace
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSubtractGreenspace:
    """subtract_greenspace must remove park/water area from postcode polygons."""

    @staticmethod
    def _trim(postcode, park):
        """Run subtract_greenspace against a single-park index."""
        from shapely.strtree import STRtree

        parks = [park]
        return subtract_greenspace(postcode, STRtree(parks), parks)

    def test_park_subtracted(self):
        """An overlapping park shrinks the postcode."""
        postcode = box(0, 0, 100, 100)  # 10000 sqm
        park = box(60, 0, 100, 100)  # 4000 sqm overlap on the right

        trimmed = self._trim(postcode, park)

        # ~4000 sqm should be gone
        assert trimmed.area == pytest.approx(6000, rel=0.01)

    def test_no_greenspace_unchanged(self):
        """A far-away park leaves the postcode area untouched."""
        postcode = box(0, 0, 100, 100)
        park = box(200, 200, 300, 300)  # far away

        trimmed = self._trim(postcode, park)

        assert trimmed.area == pytest.approx(postcode.area)

    def test_full_overlap_preserves_postcode(self):
        """A park covering the whole postcode must not erase it."""
        postcode = box(0, 0, 100, 100)
        park = box(-10, -10, 110, 110)  # completely covers postcode

        trimmed = self._trim(postcode, park)

        # Subtraction would leave nothing, so the original is kept
        assert trimmed.area == pytest.approx(postcode.area)

    def test_over_90pct_removal_preserves_postcode(self):
        """Removing more than 90% of the area falls back to the original."""
        postcode = box(0, 0, 100, 100)  # 10000 sqm
        park = box(5, 0, 100, 100)  # 9500 sqm overlap = 95% removal

        trimmed = self._trim(postcode, park)

        # >90% would be removed, so the original is kept
        assert trimmed.area == pytest.approx(postcode.area)

    def test_under_90pct_removal_subtracts(self):
        """Removing less than 90% of the area goes ahead."""
        postcode = box(0, 0, 100, 100)  # 10000 sqm
        park = box(20, 0, 100, 100)  # 8000 sqm overlap = 80% removal

        trimmed = self._trim(postcode, park)

        # 80% is below the 90% cap, so the park is subtracted
        assert trimmed.area == pytest.approx(2000, rel=0.01)
|
||||
|
|
|
|||
|
|
@ -36,9 +36,10 @@ def main():
|
|||
df = pl.read_parquet(args.input)
|
||||
print(f" {len(df):,} rows, {len(df.columns)} columns")
|
||||
|
||||
# Drop existing estimated price column if re-running
|
||||
if "Estimated current price" in df.columns:
|
||||
df = df.drop("Estimated current price")
|
||||
# Drop existing estimated columns if re-running
|
||||
for col in ["Estimated current price", "Est. price per sqm"]:
|
||||
if col in df.columns:
|
||||
df = df.drop(col)
|
||||
|
||||
# Derive helper columns for the join
|
||||
has_price = (
|
||||
|
|
@ -126,6 +127,14 @@ def main():
|
|||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area exist
|
||||
df = df.with_columns(
|
||||
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
.alias("Est. price per sqm"),
|
||||
)
|
||||
|
||||
n_adjusted = df.filter(
|
||||
has_price & pl.col("_log_index_sale").is_not_null()
|
||||
).height
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue