From 23d128ff63ad897cf9345b1c147180e47938d5e1 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sat, 4 Apr 2026 09:49:06 +0100 Subject: [PATCH] Fix transport POIs --- pipeline/download/naptan.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pipeline/download/naptan.py b/pipeline/download/naptan.py index 750344d..a0c3dab 100644 --- a/pipeline/download/naptan.py +++ b/pipeline/download/naptan.py @@ -18,6 +18,7 @@ STOP_TYPES = { "BCE": "Bus station", "TXR": "Taxi rank", "TMU": "Metro or Tram stop", + "MET": "Metro or Tram stop", } @@ -44,9 +45,30 @@ def download_naptan(output: Path) -> None: pl.col("StopType").replace(STOP_TYPES).alias("category"), pl.col("Latitude").alias("lat"), pl.col("Longitude").alias("lng"), + pl.col("NptgLocalityCode").alias("locality"), ) ) + before = len(df) + + # Deduplicate: one record per name+category+locality + # (merges entrances, bus stop pairs on opposite sides of the road, etc.) + has_loc = df.filter( + pl.col("locality").is_not_null() & (pl.col("locality") != "") + ) + no_loc = df.filter( + pl.col("locality").is_null() | (pl.col("locality") == "") + ) + cols = ["id", "name", "category", "lat", "lng"] + deduped = has_loc.group_by("name", "category", "locality").agg( + pl.col("id").first(), + pl.col("lat").mean(), + pl.col("lng").mean(), + ) + df = pl.concat([deduped.select(cols), no_loc.select(cols)]) + + print(f"Deduplicated {before:,} → {len(df):,} stops (by name+category+locality)") + df.write_parquet(output) size_mb = output.stat().st_size / (1024 * 1024) print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")