Good changes

This commit is contained in:
Andras Schmelczer 2026-02-10 22:09:46 +00:00
parent d39d1b15fd
commit 1f68ca0512
23 changed files with 670 additions and 289 deletions

View file

@ -0,0 +1,135 @@
"""Extract park and water body polygons from OSM PBF for postcode boundary trimming.
Uses pyosmium's FileProcessor with area assembly to convert OSM ways/relations
into Shapely polygons, reprojects to BNG (EPSG:27700), and saves as parquet.
Reuses the same great-britain-latest.osm.pbf as pois.py.
"""
import argparse
from pathlib import Path
import osmium
import polars as pl
from pyproj import Transformer
from shapely import wkb
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm
from .pois import download_pbf
MIN_AREA_SQM = 5_000 # ~70m x 70m — skip pocket parks and small ponds
# OSM tags that indicate non-residential green/water areas
GREENSPACE_TAGS = {
"leisure": {"park", "nature_reserve", "garden", "common"},
"landuse": {"forest"},
"natural": {"wood", "water", "wetland"},
"waterway": {"riverbank"},
}
_to_bng = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
def _to_bng_polygon(geom):
"""Reproject a WGS84 Shapely geometry to BNG (EPSG:27700)."""
def transform_ring(coords):
xs, ys = zip(*coords)
bng_x, bng_y = _to_bng.transform(list(xs), list(ys))
return list(zip(bng_x, bng_y))
if geom.geom_type == "Polygon":
exterior = transform_ring(geom.exterior.coords)
holes = [transform_ring(h.coords) for h in geom.interiors]
return Polygon(exterior, holes)
elif geom.geom_type == "MultiPolygon":
parts = []
for p in geom.geoms:
exterior = transform_ring(p.exterior.coords)
holes = [transform_ring(h.coords) for h in p.interiors]
parts.append(Polygon(exterior, holes))
return MultiPolygon(parts)
return geom
def _matches_tags(tags):
"""Check if an OSM element's tags match our greenspace/water criteria."""
for key, values in GREENSPACE_TAGS.items():
if key in tags and tags[key] in values:
return True
return False
class GreenspaceHandler(osmium.SimpleHandler):
"""Collects area geometries matching greenspace/water tags."""
def __init__(self, progress):
super().__init__()
self._wkb_factory = osmium.geom.WKBFactory()
self._progress = progress
self.geometries = []
def area(self, a):
self._progress.update(1)
if not _matches_tags(a.tags):
return
try:
wkb_data = self._wkb_factory.create_multipolygon(a)
geom = wkb.loads(wkb_data, hex=True)
except Exception:
return
if geom.is_empty or not geom.is_valid:
return
# Reproject to BNG for area calculation
bng_geom = _to_bng_polygon(geom)
if bng_geom.area < MIN_AREA_SQM:
return
self.geometries.append(bng_geom.wkb)
def main():
parser = argparse.ArgumentParser(
description="Extract park/water polygons from OSM PBF"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
parser.add_argument(
"--pbf", type=Path, default=None, help="Path to existing PBF file"
)
args = parser.parse_args()
if args.pbf and args.pbf.exists():
pbf_file = args.pbf
print(f"Using existing PBF: {pbf_file}")
else:
pbf_file = Path("data/great-britain-latest.osm.pbf")
if not pbf_file.exists():
download_pbf(pbf_file)
else:
print(f"Using cached PBF: {pbf_file}")
print("Extracting greenspace/water areas from PBF (two-pass area assembly)...")
with tqdm(unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05) as progress:
handler = GreenspaceHandler(progress)
handler.apply_file(str(pbf_file), locations=True)
print(f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm")
# Merge overlapping geometries per 10km grid cell for efficiency
if handler.geometries:
# Save WKB geometries directly
df = pl.DataFrame({"geometry": handler.geometries})
args.output.parent.mkdir(parents=True, exist_ok=True)
df.write_parquet(args.output)
print(f"Saved to {args.output}")
else:
print("No geometries found — skipping output")
if __name__ == "__main__":
main()

View file

@ -74,6 +74,18 @@ def _build_wide(
iod = pl.scan_parquet(iod_path)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
# Invert deprivation scores so that higher values = less deprived (better)
iod_score_cols = [
"Education, Skills and Training Score",
"Income Score (rate)",
"Employment Score (rate)",
"Health Deprivation and Disability Score",
"Living Environment Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
]
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,
@ -158,10 +170,20 @@ def _build_wide(
geosure = pl.scan_parquet(geosure_path)
wide = wide.join(geosure, on="postcode", how="left")
# Use built_form (Terraced, Semi-detached) when available, otherwise epc_property_type
# Derive property_type: prefer EPC data, fall back to price-paid.
# For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
["NO DATA!", "Not Recorded"]
)
has_epc = pl.col("epc_property_type").is_not_null()
is_house = pl.col("epc_property_type") == "House"
wide = wide.with_columns(
pl.when(pl.col("pp_property_type").is_in(["Terraced", "Semi-Detached"]))
pl.when(has_epc & is_house & ~bad_built_form)
.then(pl.col("built_form"))
.when(has_epc & is_house)
.then(pl.col("pp_property_type"))
.when(has_epc)
.then(pl.col("epc_property_type"))
.otherwise(pl.col("pp_property_type"))
.alias("property_type")
)