diff --git a/pipeline/transform/transform_poi.py b/pipeline/transform/transform_poi.py
index ac6d98f..d44b877 100644
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@@ -75,6 +75,11 @@ DROP_CATEGORIES = {
     "tourism/information",
     "tourism/village_sign",
     "tourism/yes",
+    # public transport comes from naptan
+    "public_transport/entrance",
+    "public_transport/platform",
+    "public_transport/station",
+    "public_transport/stop_position",
 }
 
 # (friendly_name, emoji) for every category we keep
@@ -368,11 +373,6 @@ CATEGORY_MAP: dict[str, tuple[str, str]] = {
     "office/university": ("University Office", "🎓"),
     "office/vacant": ("Vacant Office", "🏚️"),
     "office/web_design": ("Web Design", "🌐"),
-    # public_transport
-    "public_transport/entrance": ("Transport Entrance", "🚪"),
-    "public_transport/platform": ("Platform", "🚉"),
-    "public_transport/station": ("Station", "🚉"),
-    "public_transport/stop_position": ("Stop", "🚏"),
     # shop
     "shop/accessories": ("Accessories Shop", "👜"),
     "shop/agrarian": ("Farm Supply Shop", "🌾"),
@@ -573,7 +573,25 @@ CATEGORY_MAP: dict[str, tuple[str, str]] = {
 }
 
 
-def transform(input_path: Path) -> pl.LazyFrame:
+# Emoji for each value of the NaPTAN "category" column; transform() looks
+# these up with replace_strict.
+NAPTAN_EMOJIS: dict[str, str] = {
+    "Airport": "✈️",
+    "Ferry": "⛴️",
+    "Rail station": "🚆",
+    "Bus stop": "🚏",
+    "Bus station": "🚌",
+    "Taxi rank": "🚕",
+    "Metro or Tram stop": "🚊",
+}
+
+
+def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
+    """Build the POI LazyFrame, appending NaPTAN rows when a path is given.
+
+    With ``naptan_path=None`` only the rows derived from ``input_path`` are
+    returned.
+    """
     lf = pl.scan_parquet(input_path)
 
     # Get all unique categories present in the data
@@ -618,7 +636,15 @@ def transform(input_path: Path) -> pl.LazyFrame:
         pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
     )
 
-    return lf
+    # naptan_path defaults to None; scan_parquet(None) would fail, so return
+    # the frame built from input_path unchanged in that case.
+    if naptan_path is None:
+        return lf
+
+    naptan = pl.scan_parquet(naptan_path).with_columns(
+        pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
+    )
+    return pl.concat([lf, naptan], how="diagonal_relaxed")
 
 
 def main():
@@ -628,10 +654,13 @@ def main():
     parser.add_argument(
         "--input", type=Path, required=True, help="Raw POIs parquet file"
     )
+    parser.add_argument(
+        "--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
+    )
     parser.add_argument(
         "--output", type=Path, required=True, help="Output filtered POIs parquet file"
     )
     args = parser.parse_args()
 
-    df = transform(args.input).collect(engine="streaming")
+    df = transform(args.input, args.naptan).collect(engine="streaming")
     df.write_parquet(args.output)