Use NaPTAN POIs

This commit is contained in:
Andras Schmelczer 2026-02-01 09:26:22 +00:00
parent 6a42b81a2a
commit 01edde3ebd

View file

@ -75,6 +75,11 @@ DROP_CATEGORIES = {
"tourism/information",
"tourism/village_sign",
"tourism/yes",
# public transport POIs come from the NaPTAN dataset instead, so drop them here
"public_transport/entrance",
"public_transport/platform",
"public_transport/station",
"public_transport/stop_position",
}
# (friendly_name, emoji) for every category we keep
@ -368,11 +373,6 @@ CATEGORY_MAP: dict[str, tuple[str, str]] = {
"office/university": ("University Office", "🎓"),
"office/vacant": ("Vacant Office", "🏚️"),
"office/web_design": ("Web Design", "🌐"),
# public_transport
"public_transport/entrance": ("Transport Entrance", "🚪"),
"public_transport/platform": ("Platform", "🚉"),
"public_transport/station": ("Station", "🚉"),
"public_transport/stop_position": ("Stop", "🚏"),
# shop
"shop/accessories": ("Accessories Shop", "👜"),
"shop/agrarian": ("Farm Supply Shop", "🌾"),
@ -573,7 +573,18 @@ CATEGORY_MAP: dict[str, tuple[str, str]] = {
}
def transform(input_path: Path) -> pl.LazyFrame:
# Emoji for each NaPTAN category label. transform() applies this mapping to
# the "category" column of the NaPTAN parquet (via replace_strict) to produce
# the "emoji" column.
# NOTE(review): replace_strict is called without a default, so a category
# value missing from this dict is expected to raise rather than pass through;
# confirm the NaPTAN file only contains these seven labels.
NAPTAN_EMOJIS: dict[str, str] = {
"Airport": "✈️",
"Ferry": "⛴️",
"Rail station": "🚆",
"Bus stop": "🚏",
"Bus station": "🚌",
"Taxi rank": "🚕",
"Metro or Tram stop": "🚊",
}
def transform(input_path: Path, naptan_path: Path | None = None) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
# Get all unique categories present in the data
@ -618,7 +629,10 @@ def transform(input_path: Path) -> pl.LazyFrame:
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
)
return lf
naptan = pl.scan_parquet(naptan_path).with_columns(
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
)
return pl.concat([lf, naptan], how="diagonal_relaxed")
def main():
@ -628,12 +642,15 @@ def main():
parser.add_argument(
"--input", type=Path, required=True, help="Raw POIs parquet file"
)
parser.add_argument(
"--naptan", type=Path, required=True, help="NaPTAN stations parquet file"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
args = parser.parse_args()
df = transform(args.input).collect(engine="streaming")
df = transform(args.input, args.naptan).collect(engine="streaming")
df.write_parquet(args.output)