Use NaPTAN POIs for public transport

2026-02-01 09:26:22 +00:00 · 2026-02-01 09:26:22 +00:00 · 01edde3ebd
commit 01edde3ebd
parent 6a42b81a2a
1 changed file with 25 additions and 8 deletions
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -75,6 +75,11 @@ DROP_CATEGORIES = {
    "tourism/information",
    "tourism/village_sign",
    "tourism/yes",
+    # public transport comes from naptan
+    "public_transport/entrance",
+    "public_transport/platform",
+    "public_transport/station",
+    "public_transport/stop_position",
 }

 # (friendly_name, emoji) for every category we keep
@ -368,11 +373,6 @@ CATEGORY_MAP: dict[str, tuple[str, str]] = {
    "office/university": ("University Office", "🎓"),
    "office/vacant": ("Vacant Office", "🏚️"),
    "office/web_design": ("Web Design", "🌐"),
-    # public_transport
-    "public_transport/entrance": ("Transport Entrance", "🚪"),
-    "public_transport/platform": ("Platform", "🚉"),
-    "public_transport/station": ("Station", "🚉"),
-    "public_transport/stop_position": ("Stop", "🚏"),
    # shop
    "shop/accessories": ("Accessories Shop", "👜"),
    "shop/agrarian": ("Farm Supply Shop", "🌾"),
@ -573,7 +573,18 @@ CATEGORY_MAP: dict[str, tuple[str, str]] = {
 }


-def transform(input_path: Path) -> pl.LazyFrame:
+NAPTAN_EMOJIS: dict[str, str] = {
+    "Airport": "✈️",
+    "Ferry": "⛴️",
+    "Rail station": "🚆",
+    "Bus stop": "🚏",
+    "Bus station": "🚌",
+    "Taxi rank": "🚕",
+    "Metro or Tram stop": "🚊",
+}
+
+
+def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
    lf = pl.scan_parquet(input_path)

    # Get all unique categories present in the data
@ -618,7 +629,10 @@ def transform(input_path: Path) -> pl.LazyFrame:
        pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
    )

-    return lf
+    naptan = pl.scan_parquet(naptan_path).with_columns(
+        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
+    )
+    return pl.concat([lf, naptan], how="diagonal_relaxed")


 def main():
@ -628,12 +642,15 @@ def main():
    parser.add_argument(
        "--input", type=Path, required=True, help="Raw POIs parquet file"
    )
+    parser.add_argument(
+        "--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
+    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output filtered POIs parquet file"
    )
    args = parser.parse_args()

-    df = transform(args.input).collect(engine="streaming")
+    df = transform(args.input, args.naptan).collect(engine="streaming")

    df.write_parquet(args.output)