Codex changes

2026-05-04 16:19:09 +01:00 · 2026-05-04 16:19:09 +01:00 · d4dde21ad2
commit d4dde21ad2
parent 0bae902e08
46 changed files with 4953 additions and 966 deletions
--- a/pipeline/download/naptan.py
+++ b/pipeline/download/naptan.py
@ -22,6 +22,92 @@ STOP_TYPES = {
 }


+# Columns, in order, that the deduplication helpers below select for their output.
+OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
+
+
+def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
+    """Build an expression that reduces a station name to a comparison key.
+
+    The key is meant to be equal for spelling variants of the same station
+    (entrances, transport-mode suffixes), so it can be used as a grouping
+    column.
+
+    Args:
+        name_col: Name of the string column holding the raw stop name.
+
+    Returns:
+        A polars expression yielding the lower-cased, punctuation-free name
+        with trailing "... station" / "... stop" style suffixes removed.
+    """
+    # Applied in order; each pair is (regex, replacement).
+    cleanup_steps = (
+        (r"\([^)]*\)", " "),  # drop parenthesised qualifiers
+        (r"['’`]", ""),  # drop apostrophes without splitting the word
+        (r"&", " and "),
+        # Everything outside a-z / 0-9 (accented letters included) becomes
+        # a word separator.
+        (r"[^a-z0-9]+", " "),
+        (r"\s+", " "),
+    )
+    # Stripped one after another from the end of the cleaned name; the
+    # mode-specific forms run before the generic "station"/"stop" suffix.
+    suffix_patterns = (
+        r"\s+(underground|tube|dlr|metro|rail|railway)\s+station$",
+        r"\s+tram\s+stop$",
+        r"\s+(station|stop)$",
+    )
+
+    key = pl.col(name_col).str.to_lowercase()
+    for pattern, replacement in cleanup_steps:
+        key = key.str.replace_all(pattern, replacement)
+    # Trim first so the "$"-anchored suffix patterns see the real end of name.
+    key = key.str.strip_chars()
+    for pattern in suffix_patterns:
+        key = key.str.replace_all(pattern, "")
+    return key.str.strip_chars()
+
+
+def _has_locality() -> pl.Expr:
+    """Return a predicate that is true where ``locality`` is a non-empty string.
+
+    Null and ``""`` both count as "no locality".
+    """
+    locality = pl.col("locality")
+    return locality.is_not_null() & (locality != "")
+
+
+def _deduplicate_tube_partition(
+    df: pl.DataFrame, group_cols: list[str]
+) -> pl.DataFrame:
+    """Collapse every group of Tube rows into a single representative row.
+
+    Args:
+        df: Rows providing ``id``, ``name``, ``category``, ``lat`` and ``lng``
+            plus every column listed in ``group_cols``.
+        group_cols: Columns whose combined value identifies one station.
+
+    Returns:
+        A frame with exactly ``OUTPUT_COLUMNS``, one row per group. ``id`` and
+        ``name`` come from the row with the shortest name, ``category`` from
+        the first row of the group, and ``lat``/``lng`` are the group means.
+        An empty input yields an empty frame with a fixed schema.
+    """
+    if df.is_empty():
+        # Explicit dtypes so the empty result still has a concrete schema
+        # when it is concatenated with a non-empty partition.
+        return pl.DataFrame(
+            schema={
+                "id": pl.String,
+                "name": pl.String,
+                "category": pl.String,
+                "lat": pl.Float64,
+                "lng": pl.Float64,
+            }
+        )
+
+    # Shortest name wins. Name and id are added as tie-breakers so the chosen
+    # row does not depend on the incoming row order (an upstream group_by
+    # does not guarantee one), which keeps the output reproducible.
+    representative_order = [
+        pl.col("name").str.len_chars(),
+        pl.col("name"),
+        pl.col("id"),
+    ]
+    return (
+        df.group_by(group_cols)
+        .agg(
+            # Same sort keys for both columns, so id and name always come
+            # from the same source row.
+            pl.col("id").sort_by(representative_order).first(),
+            pl.col("name").sort_by(representative_order).first(),
+            pl.col("category").first(),
+            pl.col("lat").mean(),
+            pl.col("lng").mean(),
+        )
+        .select(OUTPUT_COLUMNS)
+    )
+
+
+def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
+    """Deduplicate NaPTAN stops, merging Tube stations at station level.
+
+    Two passes are applied:
+
+    1. Rows with a locality are merged per exact ``name``/``category``/
+       ``locality`` (first ``id``, mean ``lat``/``lng``). Rows without a
+       locality pass through unchanged.
+    2. Rows whose category is ``"Tube station"`` are merged again on a
+       normalized station name, together with the locality when there is
+       one. All other rows are left as produced by the first pass.
+
+    Args:
+        df: Stops with ``id``, ``name``, ``category``, ``lat``, ``lng`` and
+            ``locality`` columns.
+
+    Returns:
+        A frame with exactly ``OUTPUT_COLUMNS``; row order is not guaranteed.
+    """
+    cols_with_locality = [*OUTPUT_COLUMNS, "locality"]
+    has_loc = df.filter(_has_locality())
+    no_loc = df.filter(~_has_locality())
+
+    # First pass: one record per exact stop name/category/locality.
+    deduped_has_loc = (
+        has_loc.group_by("name", "category", "locality")
+        .agg(
+            pl.col("id").first(),
+            pl.col("lat").mean(),
+            pl.col("lng").mean(),
+        )
+        .select(cols_with_locality)
+    )
+    df = pl.concat([deduped_has_loc, no_loc.select(cols_with_locality)])
+
+    # Comparing a null category yields null, and filter() drops null rows.
+    # Splitting with separate `==` / `!=` filters would therefore lose rows
+    # with a missing category from BOTH halves. One null-safe mask and its
+    # negation keeps every row in exactly one half.
+    is_tube = (pl.col("category") == "Tube station").fill_null(False)
+    tube = df.filter(is_tube).with_columns(
+        canonical_station_name_expr().alias("_station_key")
+    )
+    other = df.filter(~is_tube)
+
+    # Second pass (Tube only): stations with a locality merge within that
+    # locality; those without one merge on the normalized name alone.
+    tube_with_loc = tube.filter(_has_locality())
+    tube_no_loc = tube.filter(~_has_locality())
+    deduped_tube = pl.concat(
+        [
+            _deduplicate_tube_partition(tube_with_loc, ["_station_key", "locality"]),
+            _deduplicate_tube_partition(tube_no_loc, ["_station_key"]),
+        ]
+    )
+
+    return pl.concat([other.select(OUTPUT_COLUMNS), deduped_tube])
+
+
 def download_naptan(output: Path) -> None:
    output.parent.mkdir(parents=True, exist_ok=True)

@ -50,24 +136,12 @@ def download_naptan(output: Path) -> None:
    )

    before = len(df)
+    df = deduplicate_naptan(df)

-    # Deduplicate: one record per name+category+locality
-    # (merges entrances, bus stop pairs on opposite sides of the road, etc.)
-    has_loc = df.filter(
-        pl.col("locality").is_not_null() & (pl.col("locality") != "")
+    print(
+        f"Deduplicated {before:,} → {len(df):,} stops "
+        "(by name+category+locality; tube stations by normalized station name)"
    )
-    no_loc = df.filter(
-        pl.col("locality").is_null() | (pl.col("locality") == "")
-    )
-    cols = ["id", "name", "category", "lat", "lng"]
-    deduped = has_loc.group_by("name", "category", "locality").agg(
-        pl.col("id").first(),
-        pl.col("lat").mean(),
-        pl.col("lng").mean(),
-    )
-    df = pl.concat([deduped.select(cols), no_loc.select(cols)])
-
-    print(f"Deduplicated {before:,} → {len(df):,} stops (by name+category+locality)")

    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)