England only
This commit is contained in:
parent
4d08f5d08d
commit
02712f41e8
8 changed files with 294 additions and 60 deletions
45
pipeline/download/england_boundary.py
Normal file
45
pipeline/download/england_boundary.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
"""Download England country boundary GeoJSON from ONS Open Geography Portal.
|
||||
|
||||
Source: ONS Countries (December 2024) Boundaries UK BGC (Generalised Clipped)
|
||||
Licence: OGL v3
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
# ArcGIS REST API — query for England only, generalised (BGC) resolution
|
||||
URL = (
|
||||
"https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
|
||||
"Countries_December_2024_Boundaries_UK_BGC/FeatureServer/0/query"
|
||||
"?where=CTRY24NM%3D%27England%27&outFields=CTRY24NM&f=geojson"
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
    """Download the England boundary from ONS and save it as GeoJSON.

    Parses ``--output`` from the command line, fetches ``URL``, checks that
    the response contains exactly one feature, and writes the payload to the
    output path (parent directories are created if missing).

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        ValueError: If the response does not contain exactly one feature.
    """
    parser = argparse.ArgumentParser(
        description="Download England country boundary GeoJSON"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoJSON file path"
    )
    args = parser.parse_args()
    args.output.parent.mkdir(parents=True, exist_ok=True)

    print("Downloading England boundary from ONS...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    # Sanity check: the query filters on England, so anything other than a
    # single feature means the service returned something unexpected.
    data = response.json()
    features = data.get("features", [])
    if len(features) != 1:
        raise ValueError(f"Expected 1 feature for England, got {len(features)}")

    # Write the raw bytes rather than response.text: write_text() would
    # re-encode with the platform's default encoding, which is not
    # necessarily UTF-8 and could corrupt or reject non-ASCII content.
    args.output.write_bytes(response.content)
    size_kb = args.output.stat().st_size / 1024
    print(f"Saved to {args.output} ({size_kb:.0f} KB)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -7,6 +7,7 @@ from pathlib import Path
|
|||
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
|
||||
|
||||
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
|
||||
SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
|
||||
TWEMOJI_BASE = "https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72"
|
||||
|
||||
# Font stacks used by @protomaps/basemaps with lang='en'
|
||||
|
|
@ -77,6 +78,15 @@ def main():
|
|||
url = f"{GLYPHS_BASE}/{font_encoded}/{name}"
|
||||
tasks.append((url, font_dir / name))
|
||||
|
||||
# Sprite sheets (light/dark, 1x and 2x)
|
||||
sprites_dir = out / "sprites"
|
||||
for theme in ("light", "dark"):
|
||||
for suffix in ("json", "png"):
|
||||
url = f"{SPRITES_BASE}/{theme}.{suffix}"
|
||||
tasks.append((url, sprites_dir / f"{theme}.{suffix}"))
|
||||
url_2x = f"{SPRITES_BASE}/{theme}@2x.{suffix}"
|
||||
tasks.append((url_2x, sprites_dir / f"{theme}@2x.{suffix}"))
|
||||
|
||||
# Twemoji PNGs
|
||||
twemoji_dir = out / "twemoji"
|
||||
for code in twemoji_codes:
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations
|
||||
(tube, national rail, DLR, etc.) for typeahead search.
|
||||
Reuses the same great-britain-latest.osm.pbf as pois.py.
|
||||
Reuses the same england-latest.osm.pbf as pois.py.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -10,9 +10,16 @@ from pathlib import Path
|
|||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from shapely.geometry import Point
|
||||
from tqdm import tqdm
|
||||
|
||||
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
|
||||
from pipeline.download.pois import (
|
||||
ENGLAND_BBOX_EAST,
|
||||
ENGLAND_BBOX_NORTH,
|
||||
ENGLAND_BBOX_SOUTH,
|
||||
ENGLAND_BBOX_WEST,
|
||||
)
|
||||
from pipeline.utils.england_geometry import load_england_polygon
|
||||
|
||||
PLACE_TYPES = {"city"}
|
||||
|
||||
|
|
@ -57,10 +64,11 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
|
|||
|
||||
|
||||
class PlaceHandler(osmium.SimpleHandler):
|
||||
def __init__(self, progress: tqdm) -> None:
|
||||
def __init__(self, progress: tqdm, england_polygon) -> None:
|
||||
super().__init__()
|
||||
self._progress = progress
|
||||
self.places: list[dict] = []
|
||||
self._england = england_polygon
|
||||
|
||||
def _add(
|
||||
self, name: str, place_type: str, lat: float, lon: float, population: int
|
||||
|
|
@ -82,10 +90,12 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
return
|
||||
lat, lon = n.location.lat, n.location.lon
|
||||
if not (
|
||||
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
|
||||
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
|
||||
ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
|
||||
and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
|
||||
):
|
||||
return
|
||||
if not self._england.contains(Point(lon, lat)):
|
||||
return
|
||||
|
||||
name = n.tags.get("name:en", n.tags.get("name", ""))
|
||||
if not name:
|
||||
|
|
@ -124,9 +134,17 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
pbf_file = args.pbf
|
||||
england_polygon = load_england_polygon(args.boundary)
|
||||
|
||||
print("Extracting place nodes: cities + railway stations")
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
|
|
@ -135,7 +153,7 @@ def main() -> None:
|
|||
smoothing=0.05,
|
||||
mininterval=1.0,
|
||||
) as progress:
|
||||
handler = PlaceHandler(progress)
|
||||
handler = PlaceHandler(progress, england_polygon)
|
||||
handler.apply_file(str(pbf_file), locations=True)
|
||||
|
||||
print(f"Extracted {len(handler.places):,} place nodes")
|
||||
|
|
|
|||
|
|
@ -4,17 +4,20 @@ from tempfile import mkdtemp
|
|||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from shapely.geometry import Point
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.utils.england_geometry import load_england_polygon
|
||||
|
||||
BATCH_SIZE = 50_000
|
||||
|
||||
MIN_OCCURENCE_COUNT = 20
|
||||
|
||||
UK_BBOX_WEST = -7.57
|
||||
UK_BBOX_SOUTH = 49.96
|
||||
UK_BBOX_EAST = 1.68
|
||||
UK_BBOX_NORTH = 58.64
|
||||
# Bounding box for fast pre-filtering before the precise polygon check
|
||||
ENGLAND_BBOX_WEST = -6.45
|
||||
ENGLAND_BBOX_SOUTH = 49.85
|
||||
ENGLAND_BBOX_EAST = 1.77
|
||||
ENGLAND_BBOX_NORTH = 55.82
|
||||
|
||||
POI_TAG_KEYS: list[str] = [
|
||||
"amenity",
|
||||
|
|
@ -31,19 +34,23 @@ POI_TAG_KEYS: list[str] = [
|
|||
|
||||
|
||||
class POIHandler(osmium.SimpleHandler):
|
||||
def __init__(self, progress: tqdm, tmp_dir: Path) -> None:
|
||||
def __init__(self, progress: tqdm, tmp_dir: Path, england_polygon) -> None:
|
||||
super().__init__()
|
||||
self._batch: list[dict] = []
|
||||
self._tmp_dir = tmp_dir
|
||||
self._batch_num = 0
|
||||
self.poi_count = 0
|
||||
self._progress = progress
|
||||
self._england = england_polygon
|
||||
|
||||
def _in_uk(self, lat: float, lon: float) -> bool:
|
||||
return (
|
||||
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
|
||||
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
|
||||
)
|
||||
def _in_england(self, lat: float, lon: float) -> bool:
    """Return whether the point (lat, lon) lies inside the England polygon.

    The cheap bounding-box comparison runs first so the polygon test is
    only evaluated for points that fall within the box.
    """
    inside_bbox = (
        ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
        and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
    )
    if inside_bbox:
        # Point takes (x, y), i.e. longitude first.
        return self._england.contains(Point(lon, lat))
    return False
|
||||
|
||||
def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
    """Return a ``"key/value"`` string for each POI tag key present in *tags*.

    Keys are checked in the order they appear in ``POI_TAG_KEYS``.
    """
    matched: list[str] = []
    for tag_key in POI_TAG_KEYS:
        if tag_key in tags:
            matched.append(f"{tag_key}/{tags[tag_key]}")
    return matched
|
||||
|
|
@ -90,7 +97,7 @@ class POIHandler(osmium.SimpleHandler):
|
|||
if not n.location.valid:
|
||||
return
|
||||
lat, lon = n.location.lat, n.location.lon
|
||||
if not self._in_uk(lat, lon):
|
||||
if not self._in_england(lat, lon):
|
||||
return
|
||||
categories = self._match_tags(n.tags)
|
||||
for category in categories:
|
||||
|
|
@ -107,11 +114,19 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
pbf_file = args.pbf
|
||||
print(f"Tag keys: {POI_TAG_KEYS}")
|
||||
|
||||
england_polygon = load_england_polygon(args.boundary)
|
||||
|
||||
tmp_dir = Path(mkdtemp(prefix="pois_"))
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
|
|
@ -120,7 +135,7 @@ def main() -> None:
|
|||
smoothing=0.05,
|
||||
mininterval=1.0,
|
||||
) as progress:
|
||||
handler = POIHandler(progress, tmp_dir)
|
||||
handler = POIHandler(progress, tmp_dir, england_polygon)
|
||||
handler.apply_file(str(pbf_file), locations=True)
|
||||
handler._flush_batch() # write any remaining POIs
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.england_geometry import in_england_mask
|
||||
|
||||
|
||||
DROP_CATEGORIES = {
|
||||
# Street furniture & infrastructure
|
||||
|
|
@ -1056,7 +1058,11 @@ NAPTAN_EMOJIS: dict[str, str] = {
|
|||
}
|
||||
|
||||
|
||||
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
|
||||
def transform(
|
||||
input_path: Path,
|
||||
naptan_path: Path | None = None,
|
||||
boundary_path: Path | None = None,
|
||||
) -> pl.LazyFrame:
|
||||
lf = pl.scan_parquet(input_path)
|
||||
|
||||
# Get all unique categories present in the data
|
||||
|
|
@ -1072,16 +1078,14 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
|
|||
if unmapped:
|
||||
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
|
||||
|
||||
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
|
||||
# Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
|
||||
mapped_but_absent = []
|
||||
all_set = set(all_categories)
|
||||
for cat in CATEGORY_MAP:
|
||||
if cat not in all_set:
|
||||
mapped_but_absent.append(cat)
|
||||
if mapped_but_absent:
|
||||
raise ValueError(
|
||||
f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
|
||||
)
|
||||
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
|
||||
|
||||
# Drop unwanted categories
|
||||
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
||||
|
|
@ -1105,7 +1109,15 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
|
|||
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
|
||||
)
|
||||
|
||||
naptan = pl.scan_parquet(naptan_path).with_columns(
|
||||
naptan_df = pl.scan_parquet(naptan_path).collect()
|
||||
if boundary_path is not None:
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
naptan_df["lat"].to_numpy(),
|
||||
naptan_df["lng"].to_numpy(),
|
||||
)
|
||||
naptan_df = naptan_df.filter(pl.Series(mask))
|
||||
naptan = naptan_df.lazy().with_columns(
|
||||
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
|
||||
pl.lit("Public Transport").alias("group"),
|
||||
)
|
||||
|
|
@ -1122,12 +1134,18 @@ def main():
|
|||
parser.add_argument(
|
||||
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--boundary",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="England boundary GeoJSON file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
df = transform(args.input, args.naptan).collect(engine="streaming")
|
||||
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
|
||||
|
||||
df.write_parquet(args.output)
|
||||
|
||||
|
|
|
|||
33
pipeline/utils/england_geometry.py
Normal file
33
pipeline/utils/england_geometry.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
"""England boundary polygon for accurate point-in-country filtering.
|
||||
|
||||
Uses shapely prepared geometry for fast single-point checks (osmium handlers)
|
||||
and vectorized shapely.contains for batch checks (Polars DataFrames).
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import shapely
|
||||
from shapely.geometry import shape
|
||||
from shapely.prepared import PreparedGeometry, prep
|
||||
|
||||
|
||||
def load_england_polygon(geojson_path: Path) -> PreparedGeometry:
    """Load England boundary as a prepared shapely polygon for fast contains checks.

    Args:
        geojson_path: Path to a GeoJSON file; the geometry of its first
            feature is used.

    Returns:
        A prepared geometry built from that first feature.
    """
    # GeoJSON is UTF-8 encoded; read it explicitly as such instead of
    # relying on the platform's default text encoding.
    with open(geojson_path, encoding="utf-8") as f:
        data = json.load(f)
    geometry = shape(data["features"][0]["geometry"])
    return prep(geometry)
|
||||
|
||||
|
||||
def in_england_mask(geojson_path: Path, lats: np.ndarray, lngs: np.ndarray) -> np.ndarray:
    """Vectorized check: which (lat, lng) points are within England.

    Args:
        geojson_path: Path to a GeoJSON file; the geometry of its first
            feature is used as the boundary.
        lats: Latitudes of the points to test.
        lngs: Longitudes of the points to test, paired with *lats*.

    Returns:
        A boolean numpy array, one entry per input point.
    """
    # GeoJSON is UTF-8 encoded; read it explicitly as such instead of
    # relying on the platform's default text encoding.
    with open(geojson_path, encoding="utf-8") as f:
        data = json.load(f)
    polygon = shape(data["features"][0]["geometry"])
    # Prepare the polygon in place so the vectorized contains() below can
    # reuse its prepared form for every point instead of starting cold.
    shapely.prepare(polygon)
    # shapely points are (x, y), i.e. longitude first.
    pts = shapely.points(lngs, lats)
    return shapely.contains(polygon, pts)
|
||||
Loading…
Add table
Add a link
Reference in a new issue