Codex changes

This commit is contained in:
Andras Schmelczer 2026-05-04 16:19:09 +01:00
parent 0bae902e08
commit d4dde21ad2
46 changed files with 4953 additions and 966 deletions

View file

@ -22,6 +22,92 @@ STOP_TYPES = {
}
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
"""Normalize station names so entrances/transport-mode variants collapse."""
expr = pl.col(name_col).str.to_lowercase()
expr = expr.str.replace_all(r"\([^)]*\)", " ")
expr = expr.str.replace_all(r"['`]", "")
expr = expr.str.replace_all(r"&", " and ")
expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()
expr = expr.str.replace_all(
r"\s+(underground|tube|dlr|metro|rail|railway)\s+station$", ""
)
expr = expr.str.replace_all(r"\s+tram\s+stop$", "")
expr = expr.str.replace_all(r"\s+(station|stop)$", "")
return expr.str.strip_chars()
def _has_locality() -> pl.Expr:
return pl.col("locality").is_not_null() & (pl.col("locality") != "")
def _deduplicate_tube_partition(
df: pl.DataFrame, group_cols: list[str]
) -> pl.DataFrame:
if len(df) == 0:
return pl.DataFrame(
{
"id": pl.Series([], dtype=pl.String),
"name": pl.Series([], dtype=pl.String),
"category": pl.Series([], dtype=pl.String),
"lat": pl.Series([], dtype=pl.Float64),
"lng": pl.Series([], dtype=pl.Float64),
}
)
name_len = pl.col("name").str.len_chars()
return (
df.group_by(group_cols)
.agg(
pl.col("id").sort_by(name_len).first(),
pl.col("name").sort_by(name_len).first(),
pl.col("category").first(),
pl.col("lat").mean(),
pl.col("lng").mean(),
)
.select(OUTPUT_COLUMNS)
)
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
"""Deduplicate NaPTAN stops, with stricter station-level merging for Tube POIs."""
has_loc = df.filter(_has_locality())
no_loc = df.filter(~_has_locality())
cols_with_locality = [*OUTPUT_COLUMNS, "locality"]
# First pass: one record per exact stop name/category/locality.
deduped_has_loc = (
has_loc.group_by("name", "category", "locality")
.agg(
pl.col("id").first(),
pl.col("lat").mean(),
pl.col("lng").mean(),
)
.select(cols_with_locality)
)
df = pl.concat([deduped_has_loc, no_loc.select(cols_with_locality)])
tube = df.filter(pl.col("category") == "Tube station").with_columns(
canonical_station_name_expr().alias("_station_key")
)
other = df.filter(pl.col("category") != "Tube station")
tube_with_loc = tube.filter(_has_locality())
tube_no_loc = tube.filter(~_has_locality())
deduped_tube = pl.concat(
[
_deduplicate_tube_partition(tube_with_loc, ["_station_key", "locality"]),
_deduplicate_tube_partition(tube_no_loc, ["_station_key"]),
]
)
return pl.concat([other.select(OUTPUT_COLUMNS), deduped_tube])
def download_naptan(output: Path) -> None:
output.parent.mkdir(parents=True, exist_ok=True)
@ -50,24 +136,12 @@ def download_naptan(output: Path) -> None:
)
before = len(df)
df = deduplicate_naptan(df)
# Deduplicate: one record per name+category+locality
# (merges entrances, bus stop pairs on opposite sides of the road, etc.)
has_loc = df.filter(
pl.col("locality").is_not_null() & (pl.col("locality") != "")
print(
f"Deduplicated {before:,}{len(df):,} stops "
"(by name+category+locality; tube stations by normalized station name)"
)
no_loc = df.filter(
pl.col("locality").is_null() | (pl.col("locality") == "")
)
cols = ["id", "name", "category", "lat", "lng"]
deduped = has_loc.group_by("name", "category", "locality").agg(
pl.col("id").first(),
pl.col("lat").mean(),
pl.col("lng").mean(),
)
df = pl.concat([deduped.select(cols), no_loc.select(cols)])
print(f"Deduplicated {before:,}{len(df):,} stops (by name+category+locality)")
df.write_parquet(output)
size_mb = output.stat().st_size / (1024 * 1024)