This commit is contained in:
Andras Schmelczer 2026-05-06 22:40:46 +01:00
parent 28323f145e
commit 94f9c0d594
76 changed files with 3238 additions and 1230 deletions

View file

@ -2,12 +2,17 @@
import argparse
import io
import math
import re
import urllib.request
from dataclasses import dataclass
from pathlib import Path
import polars as pl
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
TUBE_STATION_CATEGORY = "Tube station"
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01
STOP_TYPES = {
@ -25,6 +30,41 @@ STOP_TYPES = {
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
def canonical_station_name(name: str | None) -> str:
"""Normalize station names so entrances/transport-mode variants collapse."""
if not name:
return ""
normalized = name.lower()
normalized = re.sub(r"\([^)]*\)", " ", normalized)
normalized = re.sub(r"['`]", "", normalized)
normalized = normalized.replace("&", " and ")
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
words = normalized.split()
suffixes = (
("underground", "station"),
("tube", "station"),
("dlr", "station"),
("metro", "station"),
("tram", "stop"),
("rail", "station"),
("railway", "station"),
("station",),
("stop",),
)
while True:
suffix = next(
(suffix for suffix in suffixes if words[-len(suffix) :] == list(suffix)),
None,
)
if suffix is None:
break
del words[-len(suffix) :]
return " ".join(words)
def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
"""Normalize station names so entrances/transport-mode variants collapse."""
expr = pl.col(name_col).str.to_lowercase()
@ -45,67 +85,158 @@ def _has_locality() -> pl.Expr:
return pl.col("locality").is_not_null() & (pl.col("locality") != "")
def _deduplicate_tube_partition(
df: pl.DataFrame, group_cols: list[str]
) -> pl.DataFrame:
if len(df) == 0:
return pl.DataFrame(
{
"id": pl.Series([], dtype=pl.String),
"name": pl.Series([], dtype=pl.String),
"category": pl.Series([], dtype=pl.String),
"lat": pl.Series([], dtype=pl.Float64),
"lng": pl.Series([], dtype=pl.Float64),
}
)
name_len = pl.col("name").str.len_chars()
return (
df.group_by(group_cols)
.agg(
pl.col("id").sort_by(name_len).first(),
pl.col("name").sort_by(name_len).first(),
pl.col("category").first(),
pl.col("lat").mean(),
pl.col("lng").mean(),
)
.select(OUTPUT_COLUMNS)
def _empty_output_frame() -> pl.DataFrame:
    """Build a zero-row frame with the id/name/category/lat/lng columns.

    The three text columns are typed as ``pl.String`` and the two coordinate
    columns as ``pl.Float64``.
    """
    text_columns = ("id", "name", "category")
    coordinate_columns = ("lat", "lng")

    # Dict insertion order fixes the column order of the resulting frame.
    columns = {label: pl.Series([], dtype=pl.String) for label in text_columns}
    for label in coordinate_columns:
        columns[label] = pl.Series([], dtype=pl.Float64)
    return pl.DataFrame(columns)
def station_name_score(name: str) -> tuple[int, int]:
    """Rank a display name; a smaller tuple compares as the better name.

    Args:
        name: Candidate display name.

    Returns:
        ``(penalty, length)`` where ``penalty`` is 1 when the lower-cased name
        ends with a generic transport phrase (0 otherwise) and ``length`` is
        the number of characters in ``name``.
    """
    generic_endings = (
        " underground station",
        " tube station",
        " dlr station",
        " metro station",
        " tram stop",
        " station",
        " stop",
    )
    ends_generically = name.lower().endswith(generic_endings)
    return (1 if ends_generically else 0, len(name))
@dataclass
class StationAccumulator:
    """Running aggregate of the rows merged into one station."""

    # Identifier and display name of the best-scoring row merged so far.
    id: str
    name: str
    category: str
    # Coordinate sums and row count; the mean position is derived on demand.
    lat_sum: float
    lng_sum: float
    count: int = 1

    @property
    def lat(self) -> float:
        """Mean latitude of the rows merged so far."""
        return self.lat_sum / self.count

    @property
    def lng(self) -> float:
        """Mean longitude of the rows merged so far."""
        return self.lng_sum / self.count

    def same_area(self, lat: float, lng: float) -> bool:
        """Tell whether a point lies within the merge radius of the mean position.

        The longitude difference is scaled by the cosine of the current mean
        latitude before the squared distance is compared with the squared
        ``TUBE_STATION_MERGE_RADIUS_DEGREES``.
        """
        centre_lat = self.lat
        north_south = centre_lat - lat
        east_west = (self.lng - lng) * math.cos(math.radians(centre_lat))
        squared_distance = north_south * north_south + east_west * east_west
        return squared_distance <= TUBE_STATION_MERGE_RADIUS_DEGREES**2

    def merge(self, row: dict[str, object]) -> None:
        """Fold one more row into the aggregate.

        The row's coordinates are added to the sums and the count is bumped.
        The stored id and name are replaced only when the row's name scores
        strictly lower under ``station_name_score``.
        """
        self.lat_sum += float(row["lat"])
        self.lng_sum += float(row["lng"])
        self.count += 1

        candidate = str(row["name"] or "")
        if station_name_score(candidate) >= station_name_score(self.name):
            return
        self.id = str(row["id"] or "")
        self.name = candidate
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
    """Start a new accumulator from a single row.

    Args:
        row: Mapping with ``id``, ``name``, ``category``, ``lat`` and ``lng``
            entries. Falsy text values are stored as ``""``.

    Returns:
        An accumulator whose sums hold this row's coordinates and whose count
        is left at its default of 1.
    """

    def text(field: str) -> str:
        # Falsy values (e.g. None) become the empty string, not "None".
        return str(row[field] or "")

    return StationAccumulator(
        id=text("id"),
        name=text("name"),
        category=text("category"),
        lat_sum=float(row["lat"]),
        lng_sum=float(row["lng"]),
    )
def _deduplicate_tube_stations(df: pl.DataFrame) -> pl.DataFrame:
    """Merge rows that share a canonical station name and lie close together.

    Rows are visited in frame order. A row joins the first existing station
    that has the same canonical name and reports ``same_area`` for the row's
    coordinates; otherwise it starts a new station. Rows whose canonical name
    is empty are never merged.

    Args:
        df: Frame with ``id``, ``name``, ``category``, ``lat`` and ``lng``
            columns.

    Returns:
        One row per resulting station, restricted to ``OUTPUT_COLUMNS``.
    """
    if len(df) == 0:
        return _empty_output_frame()

    stations: list[StationAccumulator] = []
    # Canonical name -> positions in ``stations`` that carry that name.
    positions_by_key: dict[str, list[int]] = {}

    for row in df.iter_rows(named=True):
        key = canonical_station_name(str(row["name"] or ""))
        if not key:
            stations.append(_station_from_row(row))
            continue

        for position in positions_by_key.get(key, []):
            if stations[position].same_area(float(row["lat"]), float(row["lng"])):
                stations[position].merge(row)
                break
        else:
            # No nearby station with this name yet: open a new one.
            new_position = len(stations)
            stations.append(_station_from_row(row))
            positions_by_key.setdefault(key, []).append(new_position)

    columns = {
        field: [getattr(station, field) for station in stations]
        for field in ("id", "name", "category", "lat", "lng")
    }
    return pl.DataFrame(columns).select(OUTPUT_COLUMNS)
def _deduplicate_non_tube_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Collapse stops that repeat the same name, category and locality.

    Rows matching ``_has_locality()`` are grouped by name/category/locality,
    keeping the first id and the mean latitude and longitude of each group.
    Rows that do not match are passed through without grouping.

    Args:
        df: Frame with the output columns plus ``locality``.

    Returns:
        The deduplicated rows restricted to ``OUTPUT_COLUMNS``.
    """
    if len(df) == 0:
        return _empty_output_frame()

    locality_known = _has_locality()
    with_locality = df.filter(locality_known)
    without_locality = df.filter(~locality_known)

    parts = []
    if len(with_locality) > 0:
        # One record per exact name/category/locality combination.
        grouped = with_locality.group_by("name", "category", "locality").agg(
            pl.col("id").first(),
            pl.col("lat").mean(),
            pl.col("lng").mean(),
        )
        parts.append(grouped.select(OUTPUT_COLUMNS))
    if len(without_locality) > 0:
        parts.append(without_locality.select(OUTPUT_COLUMNS))

    if not parts:
        return _empty_output_frame()
    return pl.concat(parts).select(OUTPUT_COLUMNS)
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
"""Deduplicate NaPTAN stops, with stricter station-level merging for Tube POIs."""
has_loc = df.filter(_has_locality())
no_loc = df.filter(~_has_locality())
cols_with_locality = [*OUTPUT_COLUMNS, "locality"]
"""Deduplicate NaPTAN stops, with station-level merging for Tube POIs."""
tube = df.filter(pl.col("category") == TUBE_STATION_CATEGORY)
other = df.filter(pl.col("category") != TUBE_STATION_CATEGORY)
# First pass: one record per exact stop name/category/locality.
deduped_has_loc = (
has_loc.group_by("name", "category", "locality")
.agg(
pl.col("id").first(),
pl.col("lat").mean(),
pl.col("lng").mean(),
)
.select(cols_with_locality)
)
df = pl.concat([deduped_has_loc, no_loc.select(cols_with_locality)])
tube = df.filter(pl.col("category") == "Tube station").with_columns(
canonical_station_name_expr().alias("_station_key")
)
other = df.filter(pl.col("category") != "Tube station")
tube_with_loc = tube.filter(_has_locality())
tube_no_loc = tube.filter(~_has_locality())
deduped_tube = pl.concat(
return pl.concat(
[
_deduplicate_tube_partition(tube_with_loc, ["_station_key", "locality"]),
_deduplicate_tube_partition(tube_no_loc, ["_station_key"]),
_deduplicate_non_tube_stops(other),
_deduplicate_tube_stations(tube),
]
)
return pl.concat([other.select(OUTPUT_COLUMNS), deduped_tube])
).select(OUTPUT_COLUMNS)
def download_naptan(output: Path) -> None:
@ -140,7 +271,7 @@ def download_naptan(output: Path) -> None:
print(
f"Deduplicated {before:,}{len(df):,} stops "
"(by name+category+locality; tube stations by normalized station name)"
"(by name+category+locality; tube stations by normalized name+area)"
)
df.write_parquet(output)

View file

@ -1,19 +1,24 @@
import polars as pl
import pytest
from pipeline.download.naptan import canonical_station_name_expr, deduplicate_naptan
from pipeline.download.naptan import (
canonical_station_name,
canonical_station_name_expr,
deduplicate_naptan,
)
def test_canonical_station_name_expr_normalizes_transport_suffixes():
names = [
"Bank",
"Bank Underground Station",
"Bank DLR Station",
"Pleasure Beach (Blackpool Tramway)",
"Earl's Court Tube Station",
]
df = pl.DataFrame(
{
"name": [
"Bank",
"Bank Underground Station",
"Bank DLR Station",
"Pleasure Beach (Blackpool Tramway)",
"Earl's Court Tube Station",
]
"name": names,
}
)
@ -26,30 +31,45 @@ def test_canonical_station_name_expr_normalizes_transport_suffixes():
"pleasure beach",
"earls court",
]
assert [canonical_station_name(name) for name in names] == result
def test_deduplicate_naptan_merges_tube_station_variants_by_locality():
def test_deduplicate_naptan_merges_tube_station_variants_by_area():
df = pl.DataFrame(
{
"id": ["bank", "bank-lu", "bank-dlr", "other-bank"],
"id": [
"bank",
"bank-lu",
"bank-dlr",
"other-bank",
"central-a",
"central-b",
],
"name": [
"Bank",
"Bank Underground Station",
"Bank DLR Station",
"Bank Underground Station",
"Central Tube Station",
"Central Tube Station",
],
"category": ["Tube station"] * 4,
"lat": [51.5129, 51.5134, 51.5132, 55.0140],
"lng": [-0.0889, -0.0890, -0.0885, -1.6781],
"locality": ["LOC1", "LOC1", "LOC1", "LOC2"],
"category": ["Tube station"] * 6,
"lat": [51.5129, 51.5134, 51.5132, 55.0140, 51.5, 53.0],
"lng": [-0.0889, -0.0890, -0.0885, -1.6781, -0.1, -2.0],
"locality": ["LOC1", "LOC1", "LOC2", "LOC1", None, None],
}
)
result = deduplicate_naptan(df).sort("lat")
assert len(result) == 2
assert result["name"].to_list() == ["Bank", "Bank Underground Station"]
assert result["lat"].to_list()[0] == pytest.approx(
assert len(result) == 4
assert result["name"].to_list() == [
"Central Tube Station",
"Bank",
"Central Tube Station",
"Bank Underground Station",
]
assert result.filter(pl.col("name") == "Bank")["lat"][0] == pytest.approx(
(51.5129 + 51.5134 + 51.5132) / 3
)