932 lines
30 KiB
Python
932 lines
30 KiB
Python
"""Extract places, stations, and universities → data/places.parquet.
|
|
|
|
Extracts named place nodes and railway stations (tube, national rail, DLR,
etc.) for typeahead search. Official English university providers from the
Office for Students register can also be added as travel-time destinations,
and named streets and high-value POIs can optionally be added as search-only
entries.
|
|
Reuses the same england-latest.osm.pbf as pois.py.
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import osmium
|
|
import polars as pl
|
|
from scipy.spatial import cKDTree
|
|
from shapely.geometry import Point
|
|
from pyproj import Transformer
|
|
from tqdm import tqdm
|
|
|
|
from pipeline.utils.england_geometry import (
|
|
ENGLAND_BBOX_EAST,
|
|
ENGLAND_BBOX_NORTH,
|
|
ENGLAND_BBOX_SOUTH,
|
|
ENGLAND_BBOX_WEST,
|
|
load_england_polygon,
|
|
)
|
|
|
|
# Search can use a wider set of OSM place nodes, but travel-time destinations
# must remain restricted to the historical city/station origin set.
SEARCH_PLACE_TYPES = {
    # Settlements, largest to smallest.
    "city", "town", "village", "hamlet", "isolated_dwelling",
    # Subdivisions of a settlement.
    "suburb", "neighbourhood", "quarter", "borough",
    # Other named places.
    "locality", "island",
}
# Only these place=* values are flagged as travel-time destinations.
TRAVEL_DESTINATION_PLACE_TYPES = {"city"}
|
|
|
|
# Named OSM highways worth surfacing as searchable streets (N). Service roads, footways,
# cycleways and motorways are deliberately excluded.
SEARCHABLE_HIGHWAY_TYPES = {
    # Classified roads and their link roads.
    "trunk",
    "primary", "primary_link",
    "secondary", "secondary_link",
    "tertiary", "tertiary_link",
    # Minor and local streets.
    "unclassified", "residential", "living_street", "pedestrian",
}
|
|
|
|
# High-value named POIs (M) lifted from uk_pois.parquet into the gazetteer, mapped from the
# OSM "key/value" category onto a search place_type. Everyday shops/amenities are excluded.
HIGH_VALUE_POI_CATEGORIES = {
    category: place_type
    for place_type, categories in (
        (
            "park",
            (
                "leisure/park",
                "leisure/garden",
                "leisure/nature_reserve",
                "leisure/common",
            ),
        ),
        (
            "attraction",
            (
                "tourism/attraction",
                "tourism/theme_park",
                "tourism/zoo",
                "tourism/museum",
                "tourism/gallery",
            ),
        ),
        ("hospital", ("amenity/hospital", "healthcare/hospital")),
        ("retail", ("shop/mall", "shop/department_store")),
    )
    for category in categories
}
|
|
|
|
# Administrative codes compared against the postcode file's ctry25cd / rgn25cd /
# lad25cd / cty25cd columns.
ENGLAND_COUNTRY_CODE = "E92000001"
LONDON_REGION_CODE = "E12000007"
LONDON_LAD_PREFIX = "E09"
LONDON_COUNTY_CODES = {"E13000001", "E13000002"}

# Default cut-off (metres) for trusting the nearest postcode when assigning a
# London display label to a place.
DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000

# Lon/lat (always_xy) WGS84 -> British National Grid easting/northing.
WGS84_TO_BNG = Transformer.from_crs(
    "EPSG:4326",
    "EPSG:27700",
    always_xy=True,
)
|
|
|
|
# England British National Grid (EPSG:27700) bounding box, with margin. ONS NSPL stores
# postcodes that have no grid reference at the sentinel lat=99.999999, long=0.000000,
# whose paired easting/northing collapse to the grid origin (0, 0) (or inf). Requiring
# coordinates inside this box drops the sentinel from every index, so an active postcode
# lacking a grid ref can never become a false nearest neighbour.
ENGLAND_BNG_MIN_EAST, ENGLAND_BNG_MAX_EAST = 50_000.0, 660_000.0
ENGLAND_BNG_MIN_NORTH, ENGLAND_BNG_MAX_NORTH = 0.0, 660_000.0
|
|
|
|
|
|
def _valid_wgs84_expr() -> pl.Expr:
    """Polars predicate selecting rows with a usable lat/long inside England.

    A row passes only when both ``lat`` and ``long`` are non-null and lie
    inside the England bounding box. That rejects the ONS no-grid-reference
    sentinel (lat=99.999999, long=0.0) as well as nulls, so neither can enter
    a coordinate index built from the filtered rows.
    """
    latitude = pl.col("lat")
    longitude = pl.col("long")
    has_coordinates = latitude.is_not_null() & longitude.is_not_null()
    lat_in_range = latitude.is_between(ENGLAND_BBOX_SOUTH, ENGLAND_BBOX_NORTH)
    lon_in_range = longitude.is_between(ENGLAND_BBOX_WEST, ENGLAND_BBOX_EAST)
    return has_coordinates & lat_in_range & lon_in_range
|
|
|
|
|
|
def _valid_bng_expr() -> pl.Expr:
    """Polars predicate selecting rows with a usable easting/northing inside England.

    Both ``east1m`` and ``north1m`` must be non-null and inside the England BNG
    box. That rejects the (0, 0) grid-origin / inf values paired with the ONS
    no-grid-reference sentinel, and any nulls.
    """
    easting = pl.col("east1m")
    northing = pl.col("north1m")
    has_coordinates = easting.is_not_null() & northing.is_not_null()
    east_in_range = easting.is_between(ENGLAND_BNG_MIN_EAST, ENGLAND_BNG_MAX_EAST)
    north_in_range = northing.is_between(ENGLAND_BNG_MIN_NORTH, ENGLAND_BNG_MAX_NORTH)
    return has_coordinates & east_in_range & north_in_range
|
|
|
|
# Suffixes to strip from raw station names before appending the typed suffix.
|
|
_STATION_STRIP = (
|
|
" tube station",
|
|
" underground station",
|
|
" railway station",
|
|
" dlr station",
|
|
" station dlr",
|
|
" dlr",
|
|
" overground station",
|
|
" tram stop",
|
|
" station",
|
|
)
|
|
|
|
_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
|
|
_POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.I)
|
|
_LONDON_TOKEN_RE = re.compile(r"(^|[^a-z])london([^a-z]|$)", re.I)
|
|
|
|
_NOISY_PROVIDER_SUFFIXES = (
|
|
" higher education corporation",
|
|
" limited",
|
|
" ltd",
|
|
)
|
|
|
|
_LEGAL_NAME_FALLBACK_MARKERS = (
|
|
"the chancellor",
|
|
"chancellor, masters",
|
|
"chancellor masters",
|
|
)
|
|
|
|
|
|
def _is_dlr_station(tags: dict[str, str]) -> bool:
|
|
name = tags.get("name", "").lower()
|
|
network = tags.get("network", "").lower()
|
|
operator = tags.get("operator", "").lower()
|
|
return (
|
|
"docklands" in network
|
|
or "dlr" in network
|
|
or "docklands" in operator
|
|
or "dlr" in operator
|
|
or name.endswith(" dlr")
|
|
or " dlr " in name
|
|
)
|
|
|
|
|
|
def _is_tram_station(tags: dict[str, str]) -> bool:
    """Return True for light-rail / tram stations that are not DLR stations.

    DLR stations are ruled out first so they are never classified as trams.
    """
    if _is_dlr_station(tags):
        return False
    if tags.get("station", "") == "light_rail":
        return True
    # "tramlink" contains "tram", so a single substring test covers both.
    return "tram" in tags.get("network", "").lower()
|
|
|
|
|
|
def _station_display_name(name: str, tags: dict[str, str]) -> str:
    """Build a descriptive station name like 'Bank tube station'.

    Args:
        name: Raw station name, possibly already ending in a generic suffix
            such as " Station" or " DLR".
        tags: OSM-style tags; ``station``, ``network``, ``operator`` and
            ``name`` are consulted to pick the typed suffix.

    Returns:
        The name with the first matching ``_STATION_STRIP`` suffix removed,
        followed by a space and the typed suffix.
    """
    station_tag = tags.get("station", "")
    network = tags.get("network", "").lower()

    if station_tag == "subway" or "underground" in network:
        suffix = "tube station"
    elif _is_dlr_station(tags):
        # Use the same DLR test as _is_tram_station. Checking only `network`
        # here let a DLR station identified by operator/name (and tagged
        # station=light_rail) survive the tram filter yet be labelled
        # "tram stop".
        suffix = "DLR station"
    elif "overground" in network:
        suffix = "overground station"
    elif "elizabeth" in network:
        suffix = "Elizabeth line station"
    elif station_tag == "light_rail" or "tram" in network:
        suffix = "tram stop"
    else:
        suffix = "railway station"

    # Strip any existing station suffix from the raw name (first match only).
    lower = name.lower()
    for existing in _STATION_STRIP:
        if lower.endswith(existing):
            name = name[: len(name) - len(existing)].rstrip()
            break

    return f"{name} {suffix}"
|
|
|
|
|
|
def _station_name_score(name: str) -> tuple[int, int]:
|
|
lower = name.lower()
|
|
suffix_penalty = int(
|
|
lower.endswith(
|
|
(
|
|
" underground station",
|
|
" tube station",
|
|
" dlr station",
|
|
" railway station",
|
|
" rail station",
|
|
" station dlr",
|
|
" station",
|
|
)
|
|
)
|
|
or lower.endswith(" dlr")
|
|
)
|
|
return (suffix_penalty, len(name))
|
|
|
|
|
|
def _cell_text(value: object) -> str:
|
|
if value is None:
|
|
return ""
|
|
return str(value).strip()
|
|
|
|
|
|
def _header_key(value: object) -> str:
    """Normalise a header cell to lowercase a-z/0-9 words separated by single spaces."""
    lowered = _cell_text(value).lower()
    words_only = re.sub(r"[^a-z0-9]+", " ", lowered)
    return words_only.strip()
|
|
|
|
|
|
def _find_header_row(rows: list[tuple]) -> int:
    """Locate the OfS register header row.

    The header is the first row that has one cell containing the words
    "provider", "legal" and "name", and one cell containing "right", "use"
    and "university" (after header normalisation).

    Raises:
        ValueError: if no row satisfies both conditions.
    """
    legal_name_tokens = ("provider", "legal", "name")
    university_title_tokens = ("right", "use", "university")

    def _some_key_has(keys: list[str], tokens: tuple[str, ...]) -> bool:
        return any(all(token in key for token in tokens) for key in keys)

    for position, cells in enumerate(rows):
        normalised = [_header_key(cell) for cell in cells]
        if not _some_key_has(normalised, legal_name_tokens):
            continue
        if _some_key_has(normalised, university_title_tokens):
            return position
    raise ValueError("Could not find the OfS register header row")
|
|
|
|
|
|
def _find_column(headers: list[object], *tokens: str) -> int:
    """Return the index of the first header whose normalised text contains every token.

    Raises:
        ValueError: if no header contains all of ``tokens``.
    """
    for position, key in enumerate(map(_header_key, headers)):
        if not all(token in key for token in tokens):
            continue
        return position
    raise ValueError(f"Could not find OfS register column containing {tokens}")
|
|
|
|
|
|
def _normalize_postcode(postcode: str) -> str:
|
|
return re.sub(r"[^A-Z0-9]", "", postcode.upper())
|
|
|
|
|
|
def _extract_postcode(address: str) -> str | None:
    """Return the first postcode found in ``address`` in normalised form, or None."""
    found = _POSTCODE_RE.search(address)
    return _normalize_postcode(found.group(1)) if found is not None else None
|
|
|
|
|
|
def _clean_provider_name(name: str) -> str:
    """Tidy a provider name for display.

    Steps, in order: collapse whitespace and trim spaces/commas; turn a
    trailing ", the" into a leading "The "; drop the first matching noisy
    corporate suffix; finally drop a leading "The " unless the whole name is
    "The Open University".
    """
    cleaned = re.sub(r"\s+", " ", name).strip(" ,")

    if cleaned.lower().endswith(", the"):
        # ", the" is five characters long.
        cleaned = f"The {cleaned[:-5].strip(' ,')}"

    lowered = cleaned.lower()
    noisy_suffix = next(
        (suffix for suffix in _NOISY_PROVIDER_SUFFIXES if lowered.endswith(suffix)),
        None,
    )
    if noisy_suffix is not None:
        cleaned = cleaned[: -len(noisy_suffix)].strip(" ,")

    if cleaned != "The Open University" and cleaned.startswith("The "):
        cleaned = cleaned[4:].strip()
    return cleaned
|
|
|
|
|
|
def _split_trading_names(trading_names: str) -> list[str]:
    """Split a multi-line trading-names cell into cleaned, non-empty names.

    An empty cell or the placeholder "not applicable" (any case) yields [].
    """
    if not trading_names or trading_names.casefold() == "not applicable":
        return []
    cleaned_lines = (
        _clean_provider_name(line) for line in trading_names.splitlines()
    )
    return [cleaned for cleaned in cleaned_lines if cleaned]
|
|
|
|
|
|
def _needs_trading_name(legal_name: str) -> bool:
    """Return True when the legal name reads as a formal corporate title.

    That is the case when it contains one of the fallback markers or ends in
    one of the noisy provider suffixes (both compared in lowercase).
    """
    lowered = legal_name.lower()
    if any(marker in lowered for marker in _LEGAL_NAME_FALLBACK_MARKERS):
        return True
    return lowered.endswith(_NOISY_PROVIDER_SUFFIXES)
|
|
|
|
|
|
def _select_university_name(legal_name: str, trading_names: str) -> str:
    """Choose the display name for a provider.

    When the legal name needs a trading-name fallback, prefer the first
    trading name mentioning "university" or "imperial college", then the
    first trading name of any kind. Otherwise, or when there are no trading
    names, return the cleaned legal name.
    """
    if _needs_trading_name(legal_name):
        candidates = _split_trading_names(trading_names)
        preferred = next(
            (
                candidate
                for candidate in candidates
                if "university" in candidate.lower()
                or "imperial college" in candidate.lower()
            ),
            None,
        )
        if preferred is not None:
            return preferred
        if candidates:
            return candidates[0]
    return _clean_provider_name(legal_name)
|
|
|
|
|
|
def _slugify_name(name: str) -> str:
|
|
slug = name.lower()
|
|
slug = re.sub(r"[^a-z0-9 -]", "", slug)
|
|
return re.sub(r"\s+", "-", slug).strip("-")
|
|
|
|
|
|
def _street_centroid(coords: list[tuple[float, float]]) -> tuple[float, float] | None:
|
|
"""Average (lat, lon) of a way's vertices."""
|
|
if not coords:
|
|
return None
|
|
count = len(coords)
|
|
lat = sum(lat for lat, _ in coords) / count
|
|
lon = sum(lon for _, lon in coords) / count
|
|
return lat, lon
|
|
|
|
|
|
def _normalize_street_name(name: str) -> str:
|
|
"""Grouping key for a street name: collapse whitespace, lowercase."""
|
|
return re.sub(r"\s+", " ", name).strip().lower()
|
|
|
|
|
|
def _outcode_of_postcode(postcode: str) -> str:
|
|
"""Outward code (everything before the space) of a postcode, e.g. 'NW1' from 'NW1 6XE'."""
|
|
return postcode.split(" ", 1)[0] if postcode else ""
|
|
|
|
|
|
def _outcode_tree(postcodes_path: Path) -> tuple[cKDTree, list[str]]:
    """Build a nearest-neighbour index from postcode coordinates to their outcode.

    Only active (``doterm`` null) England postcodes with a valid BNG grid
    reference are indexed, so each street can be matched to the outcode it
    sits in.

    The tree lives in BNG metres (like `_london_postcode_tree`): in raw degrees
    1° of longitude is only ~0.6° of latitude at UK latitudes, which biases
    nearest-postcode picks E-W near outcode boundaries.

    Args:
        postcodes_path: Postcode parquet with ``pcds``, ``east1m``,
            ``north1m``, ``ctry25cd`` and ``doterm`` columns.

    Returns:
        ``(tree, outcodes)`` where ``outcodes[i]`` is the outward code of the
        postcode stored at tree index ``i``.

    Raises:
        ValueError: if no usable postcode remains after filtering.
    """
    df = (
        pl.read_parquet(
            postcodes_path,
            columns=["pcds", "east1m", "north1m", "ctry25cd", "doterm"],
        )
        .filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
        .filter(_valid_bng_expr())
    )
    # Same guard as _london_postcode_tree: querying an empty tree reports the
    # out-of-range index n for "no neighbour", which would surface later as an
    # opaque IndexError when the caller indexes `outcodes`.
    if df.is_empty():
        raise ValueError(f"No active England postcodes in {postcodes_path}")

    coords = np.column_stack(
        [
            df["east1m"].to_numpy().astype(np.float64),
            df["north1m"].to_numpy().astype(np.float64),
        ]
    )
    outcodes = [_outcode_of_postcode(pc) for pc in df["pcds"].to_list()]
    return cKDTree(coords), outcodes
|
|
|
|
|
|
def _build_street_places(
    streets: list[dict],
    tree: cKDTree,
    outcodes: list[str],
) -> list[dict]:
    """Merge street segments into one gazetteer row per (normalised name, outcode).

    Each segment is assigned the outcode of its nearest indexed postcode.
    Segments sharing a normalised name and outcode are merged: the row keeps
    the first segment's spelling and the mean of the segment centroids. Rows
    are returned sorted case-insensitively by name.
    """
    if not streets:
        return []

    segment_lons = np.array([segment["lon"] for segment in streets], dtype=np.float64)
    segment_lats = np.array([segment["lat"] for segment in streets], dtype=np.float64)
    # The tree is in BNG metres, so project the segment centroids before querying.
    eastings, northings = WGS84_TO_BNG.transform(segment_lons, segment_lats)
    _, nearest_postcodes = tree.query(np.column_stack([eastings, northings]))

    # group key -> [display name, lat total, lon total, segment count]
    totals: dict[tuple[str, str], list] = {}
    for segment, nearest in zip(streets, nearest_postcodes):
        group_key = (_normalize_street_name(segment["name"]), outcodes[nearest])
        if group_key in totals:
            bucket = totals[group_key]
            bucket[1] += segment["lat"]
            bucket[2] += segment["lon"]
            bucket[3] += 1
        else:
            totals[group_key] = [segment["name"], segment["lat"], segment["lon"], 1]

    places = [
        {
            "name": display_name,
            "place_type": "street",
            "lat": lat_total / segment_count,
            "lon": lon_total / segment_count,
            "population": 0,
            "travel_destination": False,
            "display_city": None,
        }
        for display_name, lat_total, lon_total, segment_count in totals.values()
    ]
    places.sort(key=lambda place: place["name"].lower())
    return places
|
|
|
|
|
|
def _poi_dedup_key(name: str, place_type: str, lat: float, lon: float) -> tuple:
|
|
"""Geographic de-dup key: round(.,2) is ~1.1km lat / ~0.7km UK lon.
|
|
|
|
Coarse enough to collapse the SAME physical POI mapped twice a few metres
|
|
apart, fine enough to keep genuinely distinct same-named POIs in different
|
|
towns (e.g. "Victoria Park" in London vs Bristol).
|
|
"""
|
|
return (name.lower(), place_type, round(lat, 2), round(lon, 2))
|
|
|
|
|
|
def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
    """Map high-value named POIs onto gazetteer place rows (M).

    Rows whose category is not in ``HIGH_VALUE_POI_CATEGORIES``, or whose
    stripped name is shorter than three characters, are dropped. The rest are
    de-duplicated by (name, type, rounded coords), keeping the first occurrence.
    """
    if pois.is_empty():
        return []

    emitted_keys: set[tuple] = set()
    gazetteer_rows: list[dict] = []
    for poi in pois.iter_rows(named=True):
        mapped_type = HIGH_VALUE_POI_CATEGORIES.get(str(poi.get("category", "")))
        if mapped_type is None:
            continue
        poi_name = str(poi.get("name") or "").strip()
        if len(poi_name) < 3:
            continue
        latitude, longitude = float(poi["lat"]), float(poi["lng"])
        dedup_key = _poi_dedup_key(poi_name, mapped_type, latitude, longitude)
        if dedup_key not in emitted_keys:
            emitted_keys.add(dedup_key)
            gazetteer_rows.append(
                {
                    "name": poi_name,
                    "place_type": mapped_type,
                    "lat": latitude,
                    "lon": longitude,
                    "population": 0,
                    "travel_destination": False,
                    "display_city": None,
                }
            )
    return gazetteer_rows
|
|
|
|
|
|
def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
    """Append high-value POIs from ``pois_path`` to ``places`` in place.

    A POI is skipped when a row with the same de-dup key (lowercase name, type,
    rounded coordinates) is already present. Returns the number of rows added.
    """
    pois = pl.read_parquet(pois_path, columns=["name", "category", "lat", "lng"])
    candidates = _pois_to_places(pois)

    known_keys = {
        _poi_dedup_key(
            str(existing["name"]),
            existing["place_type"],
            existing["lat"],
            existing["lon"],
        )
        for existing in places
    }
    size_before = len(places)
    for candidate in candidates:
        candidate_key = _poi_dedup_key(
            candidate["name"],
            candidate["place_type"],
            candidate["lat"],
            candidate["lon"],
        )
        if candidate_key not in known_keys:
            known_keys.add(candidate_key)
            places.append(candidate)
    return len(places) - size_before
|
|
|
|
|
|
def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
    """Map normalised postcode -> (lat, lon) for active England postcodes.

    Only rows with a null ``doterm``, the England country code and a valid
    WGS84 coordinate are included.
    """
    active_in_england = (pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col(
        "doterm"
    ).is_null()
    frame = (
        pl.read_parquet(
            postcodes_path,
            columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
        )
        .filter(active_in_england)
        .filter(_valid_wgs84_expr())
    )

    lookup: dict[str, tuple[float, float]] = {}
    for postcode, lat, lon in frame.select(["pcds", "lat", "long"]).iter_rows():
        lookup[_normalize_postcode(postcode)] = (float(lat), float(lon))
    return lookup
|
|
|
|
|
|
def _display_city_from_tags(tags: dict[str, str]) -> str | None:
    """Return "London" when an explicit OSM context tag mentions London, else None.

    Explicit tags are consulted here so they take effect before any fallback
    to admin data.
    """
    context_keys = ("is_in", "is_in:city", "is_in:town", "is_in:county", "addr:city")
    context_values = (tags.get(key) for key in context_keys)
    if any(value and _LONDON_TOKEN_RE.search(value) for value in context_values):
        return "London"
    return None
|
|
|
|
|
|
def _parse_population(pop_str: str) -> int:
|
|
"""Robustly parse OSM population tags that may carry grouping separators,
|
|
decimals, or surrounding text ("12,345", "5 000", "12345.0", "approx 5000").
|
|
"""
|
|
# Take the integer part before any decimal point, then the first run of
|
|
# digits ignoring grouping separators (commas/spaces) and other annotations.
|
|
match = re.search(r"\d[\d,\s]*", pop_str.split(".", 1)[0])
|
|
if match is None:
|
|
return 0
|
|
digits = re.sub(r"\D", "", match.group(0))
|
|
return int(digits) if digits else 0
|
|
|
|
|
|
def _is_london_admin_expr() -> pl.Expr:
    """Polars predicate: True when a postcode row's admin codes place it in London.

    A row matches when its region code is the London region, its local
    authority code starts with the London prefix, or its county code is one
    of the London county codes.

    The result is null-filled to False. Comparisons against a null code yield
    null, and a null flag has no defined meaning once the column is exported
    with ``.to_numpy().astype(bool)``; "unknown" must never count as London.
    """
    in_london_region = pl.col("rgn25cd") == LONDON_REGION_CODE
    in_london_borough = (
        pl.col("lad25cd").str.starts_with(LONDON_LAD_PREFIX).fill_null(False)
    )
    in_london_county = pl.col("cty25cd").is_in(LONDON_COUNTY_CODES)
    return (in_london_region | in_london_borough | in_london_county).fill_null(False)
|
|
|
|
|
|
def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
    """Index active England postcodes in BNG metres with a per-postcode London flag.

    Returns:
        ``(tree, london_flags)`` where ``london_flags[i]`` says whether the
        postcode at tree index ``i`` is administratively in London.

    Raises:
        ValueError: if no active England postcode with a valid grid reference
            is found.
    """
    needed_columns = [
        "doterm",
        "ctry25cd",
        "east1m",
        "north1m",
        "rgn25cd",
        "lad25cd",
        "cty25cd",
    ]
    active_in_england = (pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col(
        "doterm"
    ).is_null()
    postcodes = (
        pl.read_parquet(postcodes_path, columns=needed_columns)
        .filter(active_in_england)
        .filter(_valid_bng_expr())
        .select("east1m", "north1m", _is_london_admin_expr().alias("is_london"))
    )
    if postcodes.is_empty():
        raise ValueError(f"No active England postcodes in {postcodes_path}")

    eastings = postcodes["east1m"].to_numpy().astype(np.float64)
    northings = postcodes["north1m"].to_numpy().astype(np.float64)
    london_flags = postcodes["is_london"].to_numpy().astype(bool)
    return cKDTree(np.column_stack([eastings, northings])), london_flags
|
|
|
|
|
|
def _assign_london_display_city(
    places: list[dict],
    postcodes_path: Path,
    max_distance_m: float = DISPLAY_CITY_NEAREST_POSTCODE_MAX_M,
) -> int:
    """Tag places whose nearest active postcode is inside Greater London.

    Places that already have a ``display_city`` and places of type "city" are
    left untouched. A place is tagged "London" only when its nearest postcode
    is within ``max_distance_m`` metres and flagged as London. ``places`` is
    mutated in place; the number of newly tagged places is returned.
    """
    if not places:
        return 0

    tree, london_flags = _london_postcode_tree(postcodes_path)
    place_lons = np.array([float(place["lon"]) for place in places], dtype=np.float64)
    place_lats = np.array([float(place["lat"]) for place in places], dtype=np.float64)
    # The postcode tree is in BNG metres, so distances below are metres too.
    eastings, northings = WGS84_TO_BNG.transform(place_lons, place_lats)
    distances, nearest_postcodes = tree.query(np.column_stack([eastings, northings]))

    newly_tagged = 0
    for place, distance, nearest in zip(places, distances, nearest_postcodes):
        if place.get("display_city") or place.get("place_type") == "city":
            continue
        if not (distance <= max_distance_m):
            continue
        if london_flags[nearest]:
            place["display_city"] = "London"
            newly_tagged += 1
    return newly_tagged
|
|
|
|
|
|
def _ofs_universities(
    raw: pl.DataFrame, postcode_coords: dict[str, tuple[float, float]]
) -> tuple[list[dict], int]:
    """Turn the raw OfS register sheet into university gazetteer rows.

    Only providers whose "right to use university title" cell reads "yes" (any
    case) are considered. Each is geocoded through the postcode found in its
    contact address.

    Returns:
        ``(universities, skipped)`` where ``skipped`` counts qualifying
        providers dropped for lack of a name or of coordinates.

    Raises:
        ValueError: if the header row or a required column cannot be found.
    """
    rows = raw.rows()
    header_idx = _find_header_row(rows)
    headers = list(rows[header_idx])
    legal_col = _find_column(headers, "provider", "legal", "name")
    trading_col = _find_column(headers, "trading", "name")
    address_col = _find_column(headers, "contact", "address")
    title_col = _find_column(headers, "right", "use", "university")

    universities: list[dict] = []
    skipped = 0
    for record in rows[header_idx + 1 :]:
        has_university_title = _cell_text(record[title_col]).casefold() == "yes"
        if not has_university_title:
            continue

        display_name = _select_university_name(
            _cell_text(record[legal_col]), _cell_text(record[trading_col])
        )
        postcode = _extract_postcode(_cell_text(record[address_col]))
        coords = postcode_coords.get(postcode or "")
        if display_name and coords is not None:
            lat, lon = coords
            universities.append(
                {
                    "name": display_name,
                    "place_type": "university",
                    "lat": lat,
                    "lon": lon,
                    "population": 0,
                    "travel_destination": True,
                    "display_city": None,
                }
            )
        else:
            skipped += 1

    return universities, skipped
|
|
|
|
|
|
def _append_ofs_universities(
    places: list[dict], register_path: Path, postcodes_path: Path
) -> tuple[int, int]:
    """Append OfS-registered universities to ``places`` in place.

    A university is not added when its slug matches the slug of a name already
    in ``places`` (including one added earlier in the same call).

    Returns:
        ``(added, skipped)``: rows appended, and qualifying register rows that
        had no usable name/coordinates.
    """
    postcode_coords = _postcode_lookup(postcodes_path)
    raw = pl.read_excel(register_path, has_header=False)
    universities, skipped = _ofs_universities(raw, postcode_coords)

    taken_slugs = {_slugify_name(str(place["name"])) for place in places}
    size_before = len(places)
    for university in universities:
        slug = _slugify_name(university["name"])
        if slug not in taken_slugs:
            taken_slugs.add(slug)
            places.append(university)
    return len(places) - size_before, skipped
|
|
|
|
|
|
def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
    """Extract station-level DLR destinations from NaPTAN access nodes.

    Rows are grouped by the three-character code captured by ``_DLR_CODE_RE``
    from the ``id`` column. Each group becomes one station, positioned at the
    mean of its rows and named from the best-scoring raw name (the earliest
    row wins ties). The result is sorted by display name.

    Raises:
        ValueError: if the parquet lacks any of the required columns.
    """
    df = pl.read_parquet(naptan_path)
    missing = {"id", "name", "category", "lat", "lng"} - set(df.columns)
    if missing:
        raise ValueError(f"NaPTAN file is missing columns: {sorted(missing)}")

    accepted_categories = {"Tube station", "Tram & Metro stop", "Rail station"}
    by_code: dict[str, dict] = {}
    for record in df.iter_rows(named=True):
        code_match = _DLR_CODE_RE.search(str(record["id"] or ""))
        if code_match is None:
            continue
        if record["category"] not in accepted_categories:
            continue
        raw_name = str(record["name"] or "")
        if not raw_name:
            continue

        lat, lon = float(record["lat"]), float(record["lng"])
        code = code_match.group(1)
        if code not in by_code:
            by_code[code] = {
                "raw_name": raw_name,
                "lat_sum": lat,
                "lon_sum": lon,
                "count": 1,
            }
            continue

        group = by_code[code]
        group["lat_sum"] += lat
        group["lon_sum"] += lon
        group["count"] += 1
        # Keep the cleanest (suffix-free, then shortest) name seen for this code.
        if _station_name_score(raw_name) < _station_name_score(group["raw_name"]):
            group["raw_name"] = raw_name

    stations = [
        {
            "name": _station_display_name(group["raw_name"], {"network": "DLR"}),
            "place_type": "station",
            "lat": group["lat_sum"] / group["count"],
            "lon": group["lon_sum"] / group["count"],
            "population": 0,
            "travel_destination": True,
            "display_city": None,
        }
        for group in by_code.values()
    ]
    stations.sort(key=lambda station: station["name"])
    return stations
|
|
|
|
|
|
def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int:
    """Append NaPTAN DLR stations to ``places`` in place and return how many were added.

    A station is skipped when its display name (compared case-insensitively)
    is already used by any existing place.
    """
    taken_names = {str(place["name"]).casefold() for place in places}
    size_before = len(places)
    for station in _naptan_dlr_stations(naptan_path):
        folded_name = station["name"].casefold()
        if folded_name not in taken_names:
            taken_names.add(folded_name)
            places.append(station)
    return len(places) - size_before
|
|
|
|
|
|
class PlaceHandler(osmium.SimpleHandler):
    """Streaming OSM handler collecting searchable places, stations and streets.

    Gazetteer rows accumulate in ``self.places``. Raw named highway segments
    accumulate in ``self.streets``, but only when ``collect_streets`` is true.
    Everything outside the England polygon is discarded.
    """

    def __init__(
        self, progress: tqdm, england_polygon, *, collect_streets: bool = False
    ) -> None:
        """Store the progress bar, the England polygon and the street switch."""
        super().__init__()
        self._progress = progress
        self.places: list[dict] = []
        self.streets: list[dict] = []
        self._england = england_polygon
        self._collect_streets = collect_streets

    def _in_england(self, lat: float, lon: float) -> bool:
        """Bounding-box pre-check first, then the exact polygon containment test."""
        if not (
            ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
            and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
        ):
            return False
        return self._england.contains(Point(lon, lat))

    def _add(
        self,
        name: str,
        place_type: str,
        lat: float,
        lon: float,
        population: int,
        travel_destination: bool,
        display_city: str | None = None,
    ) -> None:
        """Append one gazetteer row and refresh the progress-bar place count."""
        self.places.append(
            {
                "name": name,
                "place_type": place_type,
                "lat": lat,
                "lon": lon,
                "population": population,
                "travel_destination": travel_destination,
                "display_city": display_city,
            }
        )
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)

    def node(self, n: osmium.osm.Node) -> None:
        """Record named place=* nodes and non-tram railway stations inside England."""
        self._progress.update(1)

        # Filter on tags first: only a small share of nodes are candidate
        # places or stations, so the polygon containment test below is
        # reserved for those instead of running on every node in the extract.
        place_type = n.tags.get("place")
        is_place = place_type in SEARCH_PLACE_TYPES
        is_station = n.tags.get("railway") == "station"
        if not (is_place or is_station):
            return
        name = n.tags.get("name:en", n.tags.get("name", ""))
        if not name:
            return

        # `valid` is a method on pyosmium's Location; the bare attribute is a
        # bound method and therefore always truthy, so it must be called.
        if not n.location.valid():
            return
        lat, lon = n.location.lat, n.location.lon
        if not self._in_england(lat, lon):
            return

        tags = dict(n.tags)
        population = _parse_population(tags.get("population", ""))

        # place=* nodes
        if is_place:
            self._add(
                name,
                place_type,
                lat,
                lon,
                population,
                travel_destination=place_type in TRAVEL_DESTINATION_PLACE_TYPES,
                display_city=None
                if place_type == "city"
                else _display_city_from_tags(tags),
            )
            return

        # Railway stations (tube, national rail, DLR, overground, Elizabeth line)
        if _is_tram_station(tags):
            return
        self._add(
            _station_display_name(name, tags),
            "station",
            lat,
            lon,
            population,
            travel_destination=True,
            display_city=_display_city_from_tags(tags),
        )

    def way(self, w: osmium.osm.Way) -> None:
        """Collect named, searchable highways as raw segments (grouped into streets later)."""
        if not self._collect_streets:
            return
        self._progress.update(1)
        if w.tags.get("highway") not in SEARCHABLE_HIGHWAY_TYPES:
            return
        name = w.tags.get("name:en", w.tags.get("name", ""))
        if not name:
            return

        # Way node refs expose resolved .lat/.lon directly (locations=True); accessing them
        # raises InvalidLocationError when a node's location is missing from the index.
        coords: list[tuple[float, float]] = []
        for node in w.nodes:
            try:
                coords.append((node.lat, node.lon))
            except osmium.InvalidLocationError:
                continue

        centroid = _street_centroid(coords)
        if centroid is None:
            return
        lat, lon = centroid
        if not self._in_england(lat, lon):
            return
        self.streets.append({"name": name, "lat": lat, "lon": lon})
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: stream the PBF, add optional sources, write the parquet.

    Order of work: parse and validate arguments; stream OSM place nodes,
    stations and (optionally) street segments; then optionally add grouped
    streets, high-value POIs, NaPTAN DLR stations and OfS universities; assign
    London display labels when a postcode file is given; write the result.

    Raises:
        ValueError: if ``--include-streets`` or ``--university-register`` is
            given without ``--postcodes``.
    """
    parser = argparse.ArgumentParser(description="Extract place names from OSM PBF")
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    parser.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file")
    parser.add_argument(
        "--boundary",
        type=Path,
        required=True,
        help="England boundary GeoJSON file",
    )
    parser.add_argument(
        "--naptan",
        type=Path,
        help="Optional NaPTAN parquet file used to add DLR station destinations",
    )
    parser.add_argument(
        "--university-register",
        type=Path,
        help="Optional OfS register spreadsheet used to add university destinations",
    )
    parser.add_argument(
        "--postcodes",
        type=Path,
        help=(
            "Postcode parquet used to geocode OfS university contact postcodes, assign "
            "Greater London display labels, and tag streets with their outcode"
        ),
    )
    parser.add_argument(
        "--pois",
        type=Path,
        help="Optional uk_pois.parquet; high-value named POIs are added to the gazetteer",
    )
    parser.add_argument(
        "--include-streets",
        action="store_true",
        help="Extract named highways as searchable streets (requires --postcodes)",
    )
    args = parser.parse_args()

    # Fail fast: reject inconsistent options before loading the boundary and
    # streaming the whole PBF, rather than after that work has been done.
    if args.include_streets and not args.postcodes:
        raise ValueError("--postcodes is required with --include-streets")
    if args.university_register and not args.postcodes:
        raise ValueError("--postcodes is required with --university-register")

    pbf_file = args.pbf
    england_polygon = load_england_polygon(args.boundary)

    print("Extracting search place nodes + railway stations")
    with tqdm(
        unit=" elements",
        unit_scale=True,
        desc="Streaming",
        smoothing=0.05,
        mininterval=1.0,
    ) as progress:
        handler = PlaceHandler(
            progress, england_polygon, collect_streets=args.include_streets
        )
        handler.apply_file(str(pbf_file), locations=True)

    print(f"Extracted {len(handler.places):,} place nodes")
    if args.include_streets:
        print(f"Collected {len(handler.streets):,} named street segments")
        tree, outcodes = _outcode_tree(args.postcodes)
        street_places = _build_street_places(handler.streets, tree, outcodes)
        handler.places.extend(street_places)
        print(f"Added {len(street_places):,} grouped streets")
    if args.pois:
        added = _append_high_value_pois(handler.places, args.pois)
        print(f"Added {added:,} high-value POIs from {args.pois}")
    if args.naptan:
        added = _append_naptan_dlr_stations(handler.places, args.naptan)
        print(f"Added {added:,} DLR station destinations from NaPTAN")
    if args.university_register:
        added, skipped = _append_ofs_universities(
            handler.places, args.university_register, args.postcodes
        )
        print(f"Added {added:,} university travel destinations from the OfS register")
        if skipped:
            print(f"Skipped {skipped:,} OfS university rows without usable coordinates")

    if not handler.places:
        print("No places found — skipping output")
        return

    if args.postcodes:
        assigned = _assign_london_display_city(handler.places, args.postcodes)
        print(f"Assigned London display labels to {assigned:,} places")
    for place in handler.places:
        place.setdefault("display_city", None)
    # Pin display_city to a string column up front: its dtype would otherwise
    # be inferred from the leading rows, which are typically all None.
    df = pl.DataFrame(handler.places, schema_overrides={"display_city": pl.Utf8})
    args.output.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(args.output)
    print(f"Saved to {args.output}")


if __name__ == "__main__":
    main()
|