Good changes
This commit is contained in:
parent
d39d1b15fd
commit
1f68ca0512
23 changed files with 670 additions and 289 deletions
135
pipeline/download/greenspace_water.py
Normal file
135
pipeline/download/greenspace_water.py
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
"""Extract park and water body polygons from OSM PBF for postcode boundary trimming.
|
||||
|
||||
Uses pyosmium's FileProcessor with area assembly to convert OSM ways/relations
|
||||
into Shapely polygons, reprojects to BNG (EPSG:27700), and saves as parquet.
|
||||
|
||||
Reuses the same great-britain-latest.osm.pbf as pois.py.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from pyproj import Transformer
|
||||
from shapely import wkb
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from tqdm import tqdm
|
||||
|
||||
from .pois import download_pbf
|
||||
|
||||
MIN_AREA_SQM = 5_000 # ~70m x 70m — skip pocket parks and small ponds
|
||||
|
||||
# OSM tags that indicate non-residential green/water areas
|
||||
GREENSPACE_TAGS = {
|
||||
"leisure": {"park", "nature_reserve", "garden", "common"},
|
||||
"landuse": {"forest"},
|
||||
"natural": {"wood", "water", "wetland"},
|
||||
"waterway": {"riverbank"},
|
||||
}
|
||||
|
||||
_to_bng = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
||||
|
||||
|
||||
def _to_bng_polygon(geom):
|
||||
"""Reproject a WGS84 Shapely geometry to BNG (EPSG:27700)."""
|
||||
|
||||
def transform_ring(coords):
|
||||
xs, ys = zip(*coords)
|
||||
bng_x, bng_y = _to_bng.transform(list(xs), list(ys))
|
||||
return list(zip(bng_x, bng_y))
|
||||
|
||||
if geom.geom_type == "Polygon":
|
||||
exterior = transform_ring(geom.exterior.coords)
|
||||
holes = [transform_ring(h.coords) for h in geom.interiors]
|
||||
return Polygon(exterior, holes)
|
||||
elif geom.geom_type == "MultiPolygon":
|
||||
parts = []
|
||||
for p in geom.geoms:
|
||||
exterior = transform_ring(p.exterior.coords)
|
||||
holes = [transform_ring(h.coords) for h in p.interiors]
|
||||
parts.append(Polygon(exterior, holes))
|
||||
return MultiPolygon(parts)
|
||||
return geom
|
||||
|
||||
|
||||
def _matches_tags(tags):
|
||||
"""Check if an OSM element's tags match our greenspace/water criteria."""
|
||||
for key, values in GREENSPACE_TAGS.items():
|
||||
if key in tags and tags[key] in values:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class GreenspaceHandler(osmium.SimpleHandler):
|
||||
"""Collects area geometries matching greenspace/water tags."""
|
||||
|
||||
def __init__(self, progress):
|
||||
super().__init__()
|
||||
self._wkb_factory = osmium.geom.WKBFactory()
|
||||
self._progress = progress
|
||||
self.geometries = []
|
||||
|
||||
def area(self, a):
|
||||
self._progress.update(1)
|
||||
if not _matches_tags(a.tags):
|
||||
return
|
||||
try:
|
||||
wkb_data = self._wkb_factory.create_multipolygon(a)
|
||||
geom = wkb.loads(wkb_data, hex=True)
|
||||
except Exception:
|
||||
return
|
||||
|
||||
if geom.is_empty or not geom.is_valid:
|
||||
return
|
||||
|
||||
# Reproject to BNG for area calculation
|
||||
bng_geom = _to_bng_polygon(geom)
|
||||
if bng_geom.area < MIN_AREA_SQM:
|
||||
return
|
||||
|
||||
self.geometries.append(bng_geom.wkb)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract park/water polygons from OSM PBF"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pbf", type=Path, default=None, help="Path to existing PBF file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.pbf and args.pbf.exists():
|
||||
pbf_file = args.pbf
|
||||
print(f"Using existing PBF: {pbf_file}")
|
||||
else:
|
||||
pbf_file = Path("data/great-britain-latest.osm.pbf")
|
||||
if not pbf_file.exists():
|
||||
download_pbf(pbf_file)
|
||||
else:
|
||||
print(f"Using cached PBF: {pbf_file}")
|
||||
|
||||
print("Extracting greenspace/water areas from PBF (two-pass area assembly)...")
|
||||
with tqdm(unit=" areas", unit_scale=True, desc="Processing", smoothing=0.05) as progress:
|
||||
handler = GreenspaceHandler(progress)
|
||||
handler.apply_file(str(pbf_file), locations=True)
|
||||
|
||||
print(f"Found {len(handler.geometries)} greenspace/water polygons >= {MIN_AREA_SQM} sqm")
|
||||
|
||||
# Merge overlapping geometries per 10km grid cell for efficiency
|
||||
if handler.geometries:
|
||||
# Save WKB geometries directly
|
||||
df = pl.DataFrame({"geometry": handler.geometries})
|
||||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
df.write_parquet(args.output)
|
||||
print(f"Saved to {args.output}")
|
||||
else:
|
||||
print("No geometries found — skipping output")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -74,6 +74,18 @@ def _build_wide(
|
|||
iod = pl.scan_parquet(iod_path)
|
||||
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
|
||||
# Invert deprivation scores so that higher values = less deprived (better)
|
||||
iod_score_cols = [
|
||||
"Education, Skills and Training Score",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Living Environment Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
]
|
||||
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
|
||||
|
||||
ethnicity = pl.scan_parquet(ethnicity_path)
|
||||
wide = wide.join(
|
||||
ethnicity,
|
||||
|
|
@ -158,10 +170,20 @@ def _build_wide(
|
|||
geosure = pl.scan_parquet(geosure_path)
|
||||
wide = wide.join(geosure, on="postcode", how="left")
|
||||
|
||||
# Use built_form (Terraced, Semi-detached) when available, otherwise epc_property_type
|
||||
# Derive property_type: prefer EPC data, fall back to price-paid.
|
||||
# For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
|
||||
bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
|
||||
["NO DATA!", "Not Recorded"]
|
||||
)
|
||||
has_epc = pl.col("epc_property_type").is_not_null()
|
||||
is_house = pl.col("epc_property_type") == "House"
|
||||
wide = wide.with_columns(
|
||||
pl.when(pl.col("pp_property_type").is_in(["Terraced", "Semi-Detached"]))
|
||||
pl.when(has_epc & is_house & ~bad_built_form)
|
||||
.then(pl.col("built_form"))
|
||||
.when(has_epc & is_house)
|
||||
.then(pl.col("pp_property_type"))
|
||||
.when(has_epc)
|
||||
.then(pl.col("epc_property_type"))
|
||||
.otherwise(pl.col("pp_property_type"))
|
||||
.alias("property_type")
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue