England only

This commit is contained in:
Andras Schmelczer 2026-03-15 14:03:38 +00:00
parent 4d08f5d08d
commit 02712f41e8
8 changed files with 294 additions and 60 deletions

View file

@ -0,0 +1,45 @@
"""Download England country boundary GeoJSON from ONS Open Geography Portal.
Source: ONS Countries (December 2024) Boundaries UK BGC (Generalised Clipped)
Licence: OGL v3
"""
import argparse
from pathlib import Path
import httpx
# ArcGIS REST API — query for England only, generalised (BGC) resolution.
# The query string is already percent-encoded: the `where` clause decodes to
# CTRY24NM='England', `outFields` asks for the CTRY24NM attribute only, and
# `f=geojson` requests the response as GeoJSON.
URL = (
    "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/"
    "Countries_December_2024_Boundaries_UK_BGC/FeatureServer/0/query"
    "?where=CTRY24NM%3D%27England%27&outFields=CTRY24NM&f=geojson"
)
def main() -> None:
    """Download the England boundary GeoJSON and save it to ``--output``.

    Parses the required ``--output`` argument, creates the parent directory
    of that path, fetches ``URL`` and checks that the JSON response contains
    exactly one feature before writing the response body to disk.

    Raises:
        httpx.HTTPStatusError: Propagated from ``raise_for_status()`` when
            the server answers with an error status.
        ValueError: If the response does not contain exactly one feature.
    """
    parser = argparse.ArgumentParser(
        description="Download England country boundary GeoJSON"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output GeoJSON file path"
    )
    args = parser.parse_args()

    args.output.parent.mkdir(parents=True, exist_ok=True)

    print("Downloading England boundary from ONS...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    # Validate before writing so an unexpected response never replaces a
    # previously downloaded boundary file.
    features = response.json().get("features", [])
    if len(features) != 1:
        raise ValueError(f"Expected 1 feature for England, got {len(features)}")

    # Write UTF-8 explicitly: without an encoding, write_text() falls back to
    # the platform locale, which can yield non-UTF-8 GeoJSON or fail outright.
    args.output.write_text(response.text, encoding="utf-8")

    size_kb = args.output.stat().st_size / 1024
    print(f"Saved to {args.output} ({size_kb:.0f} KB)")
if __name__ == "__main__":
main()

View file

@ -7,6 +7,7 @@ from pathlib import Path
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
TWEMOJI_BASE = "https://cdn.jsdelivr.net/gh/twitter/twemoji@14.0.2/assets/72x72"
# Font stacks used by @protomaps/basemaps with lang='en'
@ -77,6 +78,15 @@ def main():
url = f"{GLYPHS_BASE}/{font_encoded}/{name}"
tasks.append((url, font_dir / name))
# Sprite sheets (light/dark, 1x and 2x)
sprites_dir = out / "sprites"
for theme in ("light", "dark"):
for suffix in ("json", "png"):
url = f"{SPRITES_BASE}/{theme}.{suffix}"
tasks.append((url, sprites_dir / f"{theme}.{suffix}"))
url_2x = f"{SPRITES_BASE}/{theme}@2x.{suffix}"
tasks.append((url_2x, sprites_dir / f"{theme}@2x.{suffix}"))
# Twemoji PNGs
twemoji_dir = out / "twemoji"
for code in twemoji_codes:

View file

@ -2,7 +2,7 @@
Extracts named place nodes (cities, towns, suburbs, etc.) and railway stations
(tube, national rail, DLR, etc.) for typeahead search.
Reuses the same great-britain-latest.osm.pbf as pois.py.
Reuses the same england-latest.osm.pbf as pois.py.
"""
import argparse
@ -10,9 +10,16 @@ from pathlib import Path
import osmium
import polars as pl
from shapely.geometry import Point
from tqdm import tqdm
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
from pipeline.download.pois import (
ENGLAND_BBOX_EAST,
ENGLAND_BBOX_NORTH,
ENGLAND_BBOX_SOUTH,
ENGLAND_BBOX_WEST,
)
from pipeline.utils.england_geometry import load_england_polygon
PLACE_TYPES = {"city"}
@ -57,10 +64,11 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
class PlaceHandler(osmium.SimpleHandler):
def __init__(self, progress: tqdm) -> None:
def __init__(self, progress: tqdm, england_polygon) -> None:
super().__init__()
self._progress = progress
self.places: list[dict] = []
self._england = england_polygon
def _add(
self, name: str, place_type: str, lat: float, lon: float, population: int
@ -82,10 +90,12 @@ class PlaceHandler(osmium.SimpleHandler):
return
lat, lon = n.location.lat, n.location.lon
if not (
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
):
return
if not self._england.contains(Point(lon, lat)):
return
name = n.tags.get("name:en", n.tags.get("name", ""))
if not name:
@ -124,9 +134,17 @@ def main() -> None:
parser.add_argument(
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
)
parser.add_argument(
"--boundary",
type=Path,
required=True,
help="England boundary GeoJSON file",
)
args = parser.parse_args()
pbf_file = args.pbf
england_polygon = load_england_polygon(args.boundary)
print("Extracting place nodes: cities + railway stations")
with tqdm(
unit=" elements",
@ -135,7 +153,7 @@ def main() -> None:
smoothing=0.05,
mininterval=1.0,
) as progress:
handler = PlaceHandler(progress)
handler = PlaceHandler(progress, england_polygon)
handler.apply_file(str(pbf_file), locations=True)
print(f"Extracted {len(handler.places):,} place nodes")

View file

@ -4,17 +4,20 @@ from tempfile import mkdtemp
import osmium
import polars as pl
from shapely.geometry import Point
from tqdm import tqdm
from pipeline.utils.england_geometry import load_england_polygon
BATCH_SIZE = 50_000
MIN_OCCURENCE_COUNT = 20
UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96
UK_BBOX_EAST = 1.68
UK_BBOX_NORTH = 58.64
# Bounding box for fast pre-filtering before the precise polygon check
ENGLAND_BBOX_WEST = -6.45
ENGLAND_BBOX_SOUTH = 49.85
ENGLAND_BBOX_EAST = 1.77
ENGLAND_BBOX_NORTH = 55.82
POI_TAG_KEYS: list[str] = [
"amenity",
@ -31,19 +34,23 @@ POI_TAG_KEYS: list[str] = [
class POIHandler(osmium.SimpleHandler):
def __init__(self, progress: tqdm, tmp_dir: Path) -> None:
def __init__(self, progress: tqdm, tmp_dir: Path, england_polygon) -> None:
super().__init__()
self._batch: list[dict] = []
self._tmp_dir = tmp_dir
self._batch_num = 0
self.poi_count = 0
self._progress = progress
self._england = england_polygon
def _in_uk(self, lat: float, lon: float) -> bool:
return (
UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
)
def _in_england(self, lat: float, lon: float) -> bool:
# Fast bbox pre-filter, then precise polygon check
if not (
ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
):
return False
return self._england.contains(Point(lon, lat))
def _match_tags(self, tags: osmium.osm.TagList) -> list[str]:
return [f"{key}/{tags[key]}" for key in POI_TAG_KEYS if key in tags]
@ -90,7 +97,7 @@ class POIHandler(osmium.SimpleHandler):
if not n.location.valid:
return
lat, lon = n.location.lat, n.location.lon
if not self._in_uk(lat, lon):
if not self._in_england(lat, lon):
return
categories = self._match_tags(n.tags)
for category in categories:
@ -107,11 +114,19 @@ def main() -> None:
parser.add_argument(
"--pbf", type=Path, required=True, help="Path to OSM PBF file"
)
parser.add_argument(
"--boundary",
type=Path,
required=True,
help="England boundary GeoJSON file",
)
args = parser.parse_args()
pbf_file = args.pbf
print(f"Tag keys: {POI_TAG_KEYS}")
england_polygon = load_england_polygon(args.boundary)
tmp_dir = Path(mkdtemp(prefix="pois_"))
with tqdm(
unit=" elements",
@ -120,7 +135,7 @@ def main() -> None:
smoothing=0.05,
mininterval=1.0,
) as progress:
handler = POIHandler(progress, tmp_dir)
handler = POIHandler(progress, tmp_dir, england_polygon)
handler.apply_file(str(pbf_file), locations=True)
handler._flush_batch() # write any remaining POIs

View file

@ -3,6 +3,8 @@ from pathlib import Path
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
DROP_CATEGORIES = {
# Street furniture & infrastructure
@ -1056,7 +1058,11 @@ NAPTAN_EMOJIS: dict[str, str] = {
}
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
def transform(
input_path: Path,
naptan_path: Path | None = None,
boundary_path: Path | None = None,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
# Get all unique categories present in the data
@ -1072,16 +1078,14 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
if unmapped:
raise ValueError(f"Categories missing from CATEGORY_MAP: {sorted(unmapped)}")
# Verify every CATEGORY_MAP key actually exists in the data (catch typos)
# Warn about CATEGORY_MAP keys not in data (may be absent in regional extracts)
mapped_but_absent = []
all_set = set(all_categories)
for cat in CATEGORY_MAP:
if cat not in all_set:
mapped_but_absent.append(cat)
if mapped_but_absent:
raise ValueError(
f"CATEGORY_MAP contains categories not in data: {sorted(mapped_but_absent)}"
)
print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
# Drop unwanted categories
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
@ -1105,7 +1109,15 @@ def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
)
naptan = pl.scan_parquet(naptan_path).with_columns(
naptan_df = pl.scan_parquet(naptan_path).collect()
if boundary_path is not None:
mask = in_england_mask(
boundary_path,
naptan_df["lat"].to_numpy(),
naptan_df["lng"].to_numpy(),
)
naptan_df = naptan_df.filter(pl.Series(mask))
naptan = naptan_df.lazy().with_columns(
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
pl.lit("Public Transport").alias("group"),
)
@ -1122,12 +1134,18 @@ def main():
parser.add_argument(
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
)
parser.add_argument(
"--boundary",
type=Path,
required=True,
help="England boundary GeoJSON file",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
args = parser.parse_args()
df = transform(args.input, args.naptan).collect(engine="streaming")
df = transform(args.input, args.naptan, args.boundary).collect(engine="streaming")
df.write_parquet(args.output)

View file

@ -0,0 +1,33 @@
"""England boundary polygon for accurate point-in-country filtering.
Uses shapely prepared geometry for fast single-point checks (osmium handlers)
and vectorized shapely.contains for batch checks (Polars DataFrames).
"""
import json
from pathlib import Path
import numpy as np
import shapely
from shapely.geometry import shape
from shapely.prepared import PreparedGeometry, prep
def load_england_polygon(geojson_path: Path) -> PreparedGeometry:
    """Load England boundary as a prepared shapely polygon for fast contains checks.

    Only the first feature of the GeoJSON document is used; its geometry is
    converted with ``shape`` and wrapped with ``prep``.

    Args:
        geojson_path: Path to a GeoJSON file with a top-level ``features``
            list whose first entry carries the boundary geometry.

    Returns:
        The prepared geometry built from the first feature.

    Raises:
        KeyError: If ``features`` or the feature's ``geometry`` key is missing.
        IndexError: If the ``features`` list is empty.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform locale,
    # so the file parses the same way on every machine.
    with open(geojson_path, encoding="utf-8") as f:
        data = json.load(f)
    geometry = shape(data["features"][0]["geometry"])
    return prep(geometry)
def in_england_mask(geojson_path: Path, lats: np.ndarray, lngs: np.ndarray) -> np.ndarray:
    """Vectorized check: which (lat, lng) points are within England.

    Only the first feature of the GeoJSON document is used as the boundary.

    Args:
        geojson_path: Path to a GeoJSON file with a top-level ``features``
            list whose first entry carries the boundary geometry.
        lats: Latitudes of the points to test.
        lngs: Longitudes of the points to test, aligned element-wise with
            ``lats``.

    Returns:
        A boolean numpy array, one entry per input point, that is True where
        the boundary geometry contains the point.

    Raises:
        KeyError: If ``features`` or the feature's ``geometry`` key is missing.
        IndexError: If the ``features`` list is empty.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform locale.
    with open(geojson_path, encoding="utf-8") as f:
        data = json.load(f)
    polygon = shape(data["features"][0]["geometry"])

    # Prepare the boundary in place so the vectorized contains() below reuses
    # its spatial index for every point; the results are unchanged.
    shapely.prepare(polygon)

    # Shapely points are (x, y), i.e. (longitude, latitude).
    pts = shapely.points(lngs, lats)
    return shapely.contains(polygon, pts)