Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -0,0 +1,93 @@
"""Download Census 2021 children by five-year age band per LSOA.
Source: NOMIS (ONS Census 2021 TS007A dataset, age by five-year bands)
License: Open Government Licence v3.0
Used to estimate how many primary-age (4-10) and secondary-age (11-15)
children live in each LSOA, which drives the school catchment model. Census
bands don't align with school phases, so phase totals take fractional shares
of the 0-4, 10-14 and 15-19 bands (one fifth per single year of age).
"""
import argparse
from io import BytesIO
from pathlib import Path
import httpx
import polars as pl
# NOMIS API: Census 2021 TS007A (age, five-year bands) by LSOA 2021 (TYPE151).
# c2021_age_19 codes: 1 = 0-4, 2 = 5-9, 3 = 10-14, 4 = 15-19.
# NOMIS paginates at 25,000 rows by default, so we paginate with recordoffset.
BASE_URL = (
"https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv"
"?date=latest&geography=TYPE151&measures=20100&c2021_age_19=1,2,3,4"
"&select=GEOGRAPHY_CODE,C2021_AGE_19,OBS_VALUE"
)
PAGE_SIZE = 25000
AGE_BAND_COLUMNS = {
1: "aged_0_4",
2: "aged_5_9",
3: "aged_10_14",
4: "aged_15_19",
}
def download_and_convert(output_path: Path) -> None:
print("Downloading Census 2021 LSOA age bands from NOMIS...")
frames = []
offset = 0
while True:
url = f"{BASE_URL}&recordoffset={offset}"
response = httpx.get(url, follow_redirects=True, timeout=120)
response.raise_for_status()
if len(response.content) == 0:
break
chunk = pl.read_csv(BytesIO(response.content))
if chunk.height == 0:
break
frames.append(chunk)
print(f" Fetched {chunk.height} rows (offset={offset})")
if chunk.height < PAGE_SIZE:
break
offset += PAGE_SIZE
df = pl.concat(frames)
print(f"Total rows: {df.height}")
result = (
df.rename({"GEOGRAPHY_CODE": "lsoa21"})
.pivot(on="C2021_AGE_19", index="lsoa21", values="OBS_VALUE")
.rename({str(code): name for code, name in AGE_BAND_COLUMNS.items()})
.with_columns(pl.col(name).cast(pl.UInt32) for name in AGE_BAND_COLUMNS.values())
.filter(pl.col("lsoa21").str.starts_with("E"))
.sort("lsoa21")
)
missing = [c for c in AGE_BAND_COLUMNS.values() if c not in result.columns]
if missing:
raise ValueError(f"NOMIS response missing age bands: {missing}")
print(f"England LSOAs: {result.height}")
for name in AGE_BAND_COLUMNS.values():
print(f" {name}: total {result[name].sum():,}")
output_path.parent.mkdir(parents=True, exist_ok=True)
result.write_parquet(output_path, compression="zstd")
print(f"Saved to {output_path}")
def main() -> None:
parser = argparse.ArgumentParser(
description="Download Census 2021 age bands (children) by LSOA"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args()
download_and_convert(args.output)
if __name__ == "__main__":
main()

View file

@ -12,8 +12,18 @@ import polars as pl
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
TUBE_STATION_CATEGORY = "Tube station"
TRAM_METRO_CATEGORY = "Tram & Metro stop"
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01
# London Underground ATCO codes are "<area><kind>ZZLU<station>": a 3-digit
# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
# LU stations outside Greater London such as Epping or Amersham), then "0"
# (platform/entrance node) or "G" (station group node), then the system code.
# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
# WM Metro, Blackpool Tramway, heritage railways, ...).
LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"
STOP_TYPES = {
"AIR": "Airport",
@ -25,25 +35,110 @@ STOP_TYPES = {
"RLY": "Rail station",
"RSE": "Rail station",
"BCT": "Bus stop",
# Bus/coach stations: BST is the station access-area node, BCS/BCQ are
# bays/stands within the station and BCE is a station entrance. NaPTAN maps
# very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
# so sparse that 20% of England showed the nearest bus station >100km away.
# Bays and entrances collapse to one POI per station via
# STATION_MERGE_CATEGORIES below.
"BST": "Bus station",
"BCS": "Bus station",
"BCQ": "Bus station",
"BCE": "Bus station",
"TXR": "Taxi rank",
"TMU": "Tube station",
"MET": "Tube station",
# Tram/Metro/Underground: TMU is an entrance node, MET the station access
# area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
# mark them as London Underground (ZZLU) are reclassified to "Tube station"
# after dedup (see _deduplicate_station_areas). Heritage railways (RHDR,
# Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
# "heritage" flag, so they remain in "Tram & Metro stop".
"TMU": TRAM_METRO_CATEGORY,
"MET": TRAM_METRO_CATEGORY,
}
# Stop types that are access/entrance nodes rather than the primary station or
# terminal node. During dedup the primary node (e.g. RLY/FER) wins so a station
# with both a station node and entrances yields one POI at the station node.
ENTRANCE_STOP_TYPES = {"RSE", "FTD"}
# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a
# station with both a station node and entrances yields one POI at the station
# node.
ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}
# Categories whose entrances/variants are merged into a single station-level POI
# by normalized name + area (like Tube stations), so an RLY node and its RSE
# entrances collapse to one POI at the station node.
STATION_MERGE_CATEGORIES = {TUBE_STATION_CATEGORY, "Rail station", "Ferry"}
STATION_MERGE_CATEGORIES = {
TRAM_METRO_CATEGORY,
TUBE_STATION_CATEGORY,
"Rail station",
"Ferry",
"Bus station",
}
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
# "West Station Entrance", ...) are stripped from canonical names so a
# station's individually-named entrance nodes collapse into the station.
# A trailing run of filler words is only stripped when it contains at least
# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
_ENTRANCE_FILLER_WORDS = {
"north",
"south",
"east",
"west",
"ne",
"nw",
"se",
"sw",
"n",
"s",
"e",
"w",
"wt",
"main",
"side",
"no",
"station",
"stop",
"platform",
}
_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
_ENTRANCE_FILLER_RE = (
r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
r"|platform|\d+)"
)
_ENTRANCE_SUFFIX_RE = (
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
rf"\s+{_ENTRANCE_WORDS_RE}"
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
)
# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5") are
# stripped so every bay of one station shares a canonical name. The designator
# word must be followed by a short alphanumeric token, so place names ending in
# a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"
def _strip_entrance_suffix(words: list[str]) -> list[str]:
"""Drop a trailing entrance designator (direction/number filler around an
entrance word) from a tokenized stop name; no-op when no entrance word."""
idx = len(words)
saw_entrance = False
while idx > 0:
word = words[idx - 1]
if word in _ENTRANCE_NAME_WORDS:
saw_entrance = True
elif word.isdigit() or word in _ENTRANCE_FILLER_WORDS:
pass
else:
break
idx -= 1
return words[:idx] if saw_entrance else words
def canonical_station_name(name: str | None) -> str:
"""Normalize station names so entrances/transport-mode variants collapse."""
@ -55,18 +150,24 @@ def canonical_station_name(name: str | None) -> str:
normalized = re.sub(r"['`]", "", normalized)
normalized = normalized.replace("&", " and ")
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
words = normalized.split()
words = _strip_entrance_suffix(normalized.split())
if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
del words[-2:]
suffixes = (
("underground", "station"),
("tube", "station"),
("dlr", "station"),
("metro", "station"),
("metrolink", "station"),
("metrolink", "stop"),
("tram", "stop"),
("rail", "station"),
("railway", "station"),
("station",),
("stop",),
("metrolink",),
)
while True:
suffix = next(
@ -88,11 +189,14 @@ def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
expr = expr.str.replace_all(r"&", " and ")
expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()
expr = expr.str.replace_all(_ENTRANCE_SUFFIX_RE, "")
expr = expr.str.replace_all(_BAY_SUFFIX_RE, "")
expr = expr.str.replace_all(
r"\s+(underground|tube|dlr|metro|rail|railway)\s+station$", ""
r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$", ""
)
expr = expr.str.replace_all(r"\s+tram\s+stop$", "")
expr = expr.str.replace_all(r"\s+(metrolink|tram)\s+stop$", "")
expr = expr.str.replace_all(r"\s+(station|stop)$", "")
expr = expr.str.replace_all(r"\s+metrolink$", "")
return expr.str.strip_chars()
@ -140,6 +244,7 @@ class StationAccumulator:
lat_sum: float
lng_sum: float
entrance: bool = False
is_lu: bool = False
count: int = 1
@property
@ -159,6 +264,7 @@ class StationAccumulator:
self.lat_sum += float(row["lat"])
self.lng_sum += float(row["lng"])
self.count += 1
self.is_lu = self.is_lu or bool(row.get("is_lu"))
name = str(row["name"] or "")
entrance = bool(row.get("entrance"))
@ -169,6 +275,16 @@ class StationAccumulator:
self.name = name
self.entrance = entrance
@property
def output_category(self) -> str:
# A merged tram/metro station is a genuine Tube station when ANY of its
# constituent nodes carries a London Underground ATCO code. Checking
# the whole group (not just the winning node) matters because LU
# entrance nodes often carry non-ZZLU codes (e.g. 4900VICT...).
if self.category == TRAM_METRO_CATEGORY and self.is_lu:
return TUBE_STATION_CATEGORY
return self.category
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
return StationAccumulator(
@ -178,6 +294,7 @@ def _station_from_row(row: dict[str, object]) -> StationAccumulator:
lat_sum=float(row["lat"]),
lng_sum=float(row["lng"]),
entrance=bool(row.get("entrance")),
is_lu=bool(row.get("is_lu")),
)
@ -217,7 +334,7 @@ def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
{
"id": [station.id for station in selected],
"name": [station.name for station in selected],
"category": [station.category for station in selected],
"category": [station.output_category for station in selected],
"lat": [station.lat for station in selected],
"lng": [station.lng for station in selected],
}
@ -258,10 +375,12 @@ def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
"""Deduplicate NaPTAN stops, merging station/terminal entrances by area.
Tube, rail and ferry POIs are merged to one record per station by
normalized name + area, with the primary station/terminal node (e.g. RLY,
FER) winning over an entrance node (RSE, FTD). Other stops are deduplicated
by exact name+category+locality.
Tram/metro, rail, ferry and bus-station POIs are merged to one record per
station by normalized name + area, with the primary station/terminal node
(e.g. RLY, FER, MET, BST) winning over an entrance node (RSE, FTD, TMU,
BCE). Merged tram/metro stations with a London Underground ATCO code in
the group become "Tube station". Other stops are deduplicated by exact
name+category+locality.
"""
station = df.filter(pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
other = df.filter(~pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
@ -274,6 +393,29 @@ def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
).select(OUTPUT_COLUMNS)
def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
"""Keep only active NaPTAN stops.
The NaPTAN export's Status column marks stops as active/inactive/pending;
without this filter closed stations ("(closed)", "not in use") ship as
live POIs. Rows with a null Status are kept (benefit of the doubt); a
missing column is tolerated so older extracts still load.
"""
if "Status" not in df.columns:
print("WARNING: NaPTAN data has no Status column; keeping all stops")
return df
before = len(df)
df = df.filter(
pl.col("Status").is_null()
| pl.col("Status").str.strip_chars().str.to_lowercase().is_in(["active", "act"])
)
dropped = before - len(df)
if dropped:
print(f"Dropped {dropped:,} non-active stops (Status != active)")
return df
def download_naptan(output: Path) -> None:
output.parent.mkdir(parents=True, exist_ok=True)
@ -291,15 +433,19 @@ def download_naptan(output: Path) -> None:
)
.drop_nulls(subset=["Latitude", "Longitude"])
.filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
.select(
pl.col("ATCOCode").alias("id"),
pl.col("CommonName").alias("name"),
pl.col("StopType").replace(STOP_TYPES).alias("category"),
pl.col("Latitude").alias("lat"),
pl.col("Longitude").alias("lng"),
pl.col("NptgLocalityCode").alias("locality"),
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
)
)
df = filter_active_stops(df).select(
pl.col("ATCOCode").alias("id"),
pl.col("CommonName").alias("name"),
pl.col("StopType").replace(STOP_TYPES).alias("category"),
pl.col("Latitude").alias("lat"),
pl.col("Longitude").alias("lng"),
pl.col("NptgLocalityCode").alias("locality"),
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
pl.col("ATCOCode")
.str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
.fill_null(False)
.alias("is_lu"),
)
before = len(df)

View file

@ -2,12 +2,15 @@
Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
access point locations (park entrances). Each access point is tagged with
its parent site's function type (e.g. Public Park Or Garden). Sites without
access points fall back to polygon centroids.
its parent site's function type (e.g. Public Park Or Garden), the parent
site id and the site's polygon centroid. Sites without access points fall
back to polygon centroids.
Using access points rather than polygon centroids gives much more accurate
distance calculations a property next to Hyde Park won't show 400m just
because the centroid is in the middle of the park.
because the centroid is in the middle of the park. The site id / centroid
columns let downstream consumers (poi_proximity) collapse the frame back to
one row per SITE for counting, so a park with 30 gates counts as one park.
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
License: Open Government Licence v3.0
@ -65,8 +68,8 @@ def _read_site_functions(shp_path: Path) -> dict[str, str]:
def _read_access_points(
shp_path: Path, site_funcs: dict[str, str]
) -> tuple[list[float], list[float], list[str]]:
"""Read access points, tagging each with its parent site's function."""
) -> tuple[list[float], list[float], list[str], list[str]]:
"""Read access points, tagging each with its parent site's function and id."""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
@ -80,6 +83,7 @@ def _read_access_points(
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
site_ids: list[str] = []
skipped = 0
error_skipped = 0
@ -107,6 +111,7 @@ def _read_access_points(
lats.append(lat)
lngs.append(lng)
categories.append(func)
site_ids.append(str(site_id))
if skipped:
print(f" Skipped {skipped:,} access points with unknown site ID")
@ -116,31 +121,26 @@ def _read_access_points(
error_skipped,
)
return lats, lngs, categories
return lats, lngs, categories, site_ids
def _read_site_centroids(
shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
) -> tuple[list[float], list[float], list[str]]:
"""Read polygon centroids for sites that have no access points (fallback)."""
def _read_site_centroids(shp_path: Path) -> dict[str, tuple[float, float]]:
"""Compute the WGS84 polygon centroid of every greenspace site.
Used both as the representative point for site-level counting and as the
location fallback for sites that have no access points.
"""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
id_idx = _find_field(field_names, "id")
func_idx = _find_field(field_names, "funct")
if id_idx is None or func_idx is None:
return [], [], []
if id_idx is None:
return {}
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
centroids: dict[str, tuple[float, float]] = {}
error_skipped = 0
for sr in reader.shapeRecords():
site_id = sr.record[id_idx]
if site_id in covered_ids:
continue
func = sr.record[func_idx]
try:
geom = to_shapely(sr.shape.__geo_interface__)
if geom.is_empty or not geom.is_valid:
@ -156,9 +156,7 @@ def _read_site_centroids(
)
continue
lats.append(lat)
lngs.append(lng)
categories.append(func)
centroids[str(site_id)] = (lat, lng)
if error_skipped:
logger.warning(
@ -166,7 +164,7 @@ def _read_site_centroids(
error_skipped,
)
return lats, lngs, categories
return centroids
def download_greenspace(output: Path) -> None:
@ -194,33 +192,53 @@ def download_greenspace(output: Path) -> None:
# Step 2: Read access points (primary — park entrances)
print(f"Reading {access_shps[0].name}...")
ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
ap_lats, ap_lngs, ap_cats, ap_site_ids = _read_access_points(
access_shps[0], site_funcs
)
print(f" {len(ap_lats):,} access points loaded")
# Step 3: Fall back to centroids for sites without any access points
covered_ids = set()
reader = shp.Reader(str(access_shps[0]), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
if ref_idx is not None:
for rec in reader.iterRecords():
covered_ids.add(rec[ref_idx])
# Step 3: Compute every site's centroid: the representative point for
# site-level counting, and the location fallback for sites without any
# access points.
print("Computing site centroids...")
centroids = _read_site_centroids(site_shps[0])
print(f" {len(centroids):,} site centroids computed")
print("Adding centroids for sites without access points...")
fb_lats, fb_lngs, fb_cats = _read_site_centroids(
site_shps[0], site_funcs, covered_ids
)
covered_ids = set(ap_site_ids)
fb_lats: list[float] = []
fb_lngs: list[float] = []
fb_cats: list[str] = []
fb_site_ids: list[str] = []
for site_id, (lat, lng) in centroids.items():
if site_id in covered_ids:
continue
func = site_funcs.get(site_id)
if func is None:
continue
fb_lats.append(lat)
fb_lngs.append(lng)
fb_cats.append(func)
fb_site_ids.append(site_id)
print(f" {len(fb_lats):,} centroid fallbacks added")
lats = ap_lats + fb_lats
lngs = ap_lngs + fb_lngs
categories = ap_cats + fb_cats
site_ids = ap_site_ids + fb_site_ids
site_lats = [centroids.get(site_id, (None, None))[0] for site_id in site_ids]
site_lngs = [centroids.get(site_id, (None, None))[1] for site_id in site_ids]
df = pl.DataFrame(
{
"lat": np.array(lats, dtype=np.float64),
"lng": np.array(lngs, dtype=np.float64),
"category": categories,
"site_id": site_ids,
# Site polygon centroid (null when the centroid could not be
# computed): the representative point when collapsing to one row
# per site for counting.
"site_lat": pl.Series(site_lats, dtype=pl.Float64),
"site_lng": pl.Series(site_lngs, dtype=pl.Float64),
}
)

View file

@ -641,7 +641,7 @@ def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
match = _DLR_CODE_RE.search(atco_id)
if not match:
continue
if row["category"] not in {"Tube station", "Rail station"}:
if row["category"] not in {"Tube station", "Tram & Metro stop", "Rail station"}:
continue
code = match.group(1)

View file

@ -2,9 +2,12 @@ import polars as pl
import pytest
from pipeline.download.naptan import (
TRAM_METRO_CATEGORY,
TUBE_STATION_CATEGORY,
canonical_station_name,
canonical_station_name_expr,
deduplicate_naptan,
filter_active_stops,
)
@ -34,6 +37,127 @@ def test_canonical_station_name_expr_normalizes_transport_suffixes():
assert [canonical_station_name(name) for name in names] == result
def test_canonical_station_name_strips_entrance_suffixes():
# Real shipped NaPTAN entrance names that previously failed to merge with
# their station node (79 stray entrance POIs).
cases = {
"Weaste Metrolink Station North East Entrance": "weaste",
"Weaste Metrolink Station North Entrance No 2": "weaste",
"Whitefield Metrolink Station Main Entrance": "whitefield",
"Radcliffe Metrolink Station Entrance": "radcliffe",
"Stretford Metrolink Station Wt Platform Entrance": "stretford",
"Salford Quays Metrolink Station SW entrance": "salford quays",
"Bank Station Ent 2": "bank",
"Hainault": "hainault",
# The Metrolink MET node names collapse to the same key.
"Weaste (Manchester Metrolink)": "weaste",
# No entrance word: direction/filler words must NOT be stripped.
"Maze Hill North": "maze hill north",
"Bus Station Entrance": "bus",
# Bus-station bay/stand designators collapse to the station name…
"Tonypandy Bus Station Stand A3": "tonypandy bus",
"Caerphilly Interchange Stand 5": "caerphilly interchange",
"Stanley Bus Station Stand G": "stanley bus",
# …but a bare trailing "Bay" (place names) is untouched.
"Colwyn Bay": "colwyn bay",
}
for name, expected in cases.items():
assert canonical_station_name(name) == expected, name
df = pl.DataFrame({"name": list(cases.keys())})
expr_result = df.select(canonical_station_name_expr().alias("key"))["key"].to_list()
assert expr_result == list(cases.values())
def test_filter_active_stops_drops_non_active():
df = pl.DataFrame(
{
"ATCOCode": ["a", "b", "c", "d"],
"Status": ["active", "inactive", None, "Pending"],
}
)
result = filter_active_stops(df)
# Active and unknown (null) statuses survive; inactive/pending are dropped.
assert result["ATCOCode"].to_list() == ["a", "c"]
def test_filter_active_stops_tolerates_missing_status_column():
df = pl.DataFrame({"ATCOCode": ["a"]})
assert filter_active_stops(df)["ATCOCode"].to_list() == ["a"]
def test_deduplicate_naptan_splits_london_underground_from_tram_metro():
# MET station nodes plus TMU entrances, pre-categorised as the tram/metro
# family. The Hainault group contains a 940GZZLU station node, so the
# merged POI is a genuine "Tube station" even though its entrance carries a
# non-ZZLU ATCO code; the Metrolink group stays "Tram & Metro stop".
df = pl.DataFrame(
{
"id": [
"940GZZLUHLT",
"490000095003",
"9400ZZMAWST",
"1800NFR2691",
],
"name": [
"Hainault Underground Station",
"Hainault",
"Weaste (Manchester Metrolink)",
"Weaste Metrolink Station North West Entrance",
],
"category": [TRAM_METRO_CATEGORY] * 4,
"lat": [51.6034, 51.6037, 53.4826, 53.4826],
"lng": [0.0933, 0.0931, -2.3087, -2.3086],
"locality": [None, None, None, None],
"entrance": [False, True, False, True],
"is_lu": [True, False, False, False],
}
)
result = deduplicate_naptan(df).sort("category")
assert len(result) == 2
assert result["category"].to_list() == [
TRAM_METRO_CATEGORY,
TUBE_STATION_CATEGORY,
]
tube = result.filter(pl.col("category") == TUBE_STATION_CATEGORY)
# The station node (not the entrance) represents the merged POI.
assert tube["id"][0] == "940GZZLUHLT"
tram = result.filter(pl.col("category") == TRAM_METRO_CATEGORY)
assert tram["id"][0] == "9400ZZMAWST"
def test_deduplicate_naptan_merges_bus_station_bays_and_entrances():
# BCS bays and a BCE entrance of one bus station collapse to a single POI
# represented by a non-entrance node; a different bus station in another
# area survives separately.
df = pl.DataFrame(
{
"id": ["bay-1", "bay-2", "ent-1", "other"],
"name": [
"Bury Interchange",
"Bury Interchange",
"Bury Interchange East Entrance",
"Rochdale Interchange",
],
"category": ["Bus station"] * 4,
"lat": [53.5907, 53.5908, 53.5909, 53.6160],
"lng": [-2.2958, -2.2957, -2.2956, -2.1561],
"locality": ["BURY", "BURY", "BURY", "ROCHDALE"],
"entrance": [False, False, True, False],
}
)
result = deduplicate_naptan(df).sort("name")
assert result["name"].to_list() == ["Bury Interchange", "Rochdale Interchange"]
assert result.filter(pl.col("name") == "Bury Interchange")["id"][0] == "bay-1"
def test_deduplicate_naptan_merges_tube_station_variants_by_area():
df = pl.DataFrame(
{

View file

@ -86,7 +86,7 @@ def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
"Bank",
],
"category": [
"Tube station",
"Tram & Metro stop",
"Tube station",
"Rail station",
"Bus stop",

View file

@ -1,11 +1,15 @@
"""Tests for transit_network GTFS processing."""
import datetime as dt
import zipfile
from pathlib import Path
import pytest
from pipeline.download.transit_network import convert_high_freq_to_frequency_based
from pipeline.download.transit_network import (
convert_high_freq_to_frequency_based,
validate_gtfs_feed,
)
def _write_gtfs(path: Path, *, stop_times: str) -> None:
@ -77,3 +81,162 @@ def test_raises_when_no_first_stops_found(tmp_path: Path) -> None:
with pytest.raises(RuntimeError, match="no first stops"):
convert_high_freq_to_frequency_based(src, dst)
# ── validate_gtfs_feed ────────────────────────────────────────────────────────
TODAY = dt.date(2026, 6, 10)
def _make_gtfs(
path: Path,
*,
calendar: str | None = (
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
"start_date,end_date\n"
"S1,1,1,1,1,1,0,0,20260101,20271231\n"
),
calendar_dates: str | None = None,
stops: str = (
"stop_id,stop_name,stop_lat,stop_lon\n"
"STOP_A,Bank,51.5133,-0.0886\n"
"STOP_B,Liverpool Street,51.5178,-0.0823\n"
),
routes: str = "route_id,agency_id,route_short_name,route_type\nR1,OP1,Central,1\n",
trips: str = "trip_id,route_id,service_id\nT1,R1,S1\n",
stop_times: str = (
"trip_id,stop_sequence,departure_time,stop_id\n"
"T1,0,06:00:00,STOP_A\n"
"T1,1,06:02:00,STOP_B\n"
),
) -> Path:
"""Write a tiny synthetic GTFS zip; defaults form a valid current feed."""
with zipfile.ZipFile(path, "w") as z:
if calendar is not None:
z.writestr("calendar.txt", calendar)
if calendar_dates is not None:
z.writestr("calendar_dates.txt", calendar_dates)
z.writestr("stops.txt", stops)
z.writestr("routes.txt", routes)
z.writestr("trips.txt", trips)
z.writestr("stop_times.txt", stop_times)
return path
def test_validate_gtfs_feed_happy_path(tmp_path: Path) -> None:
feed = _make_gtfs(tmp_path / "feed.zip")
validate_gtfs_feed(feed, "test feed", today=TODAY) # must not raise
def test_validate_gtfs_feed_expired_calendar(tmp_path: Path) -> None:
"""The 2010 TfL snapshot failure mode: all calendars ended years ago."""
feed = _make_gtfs(
tmp_path / "feed.zip",
calendar=(
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
"start_date,end_date\n"
"S1,1,1,1,1,1,0,0,20091201,20101224\n"
),
)
with pytest.raises(RuntimeError, match=r"'stale tfl'.*no service active"):
validate_gtfs_feed(feed, "stale tfl", today=TODAY)
def test_validate_gtfs_feed_calendar_starting_after_window_fails(
tmp_path: Path,
) -> None:
feed = _make_gtfs(
tmp_path / "feed.zip",
calendar=(
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
"start_date,end_date\n"
"S1,1,1,1,1,1,0,0,20270101,20271231\n"
),
)
with pytest.raises(RuntimeError, match="no service active"):
validate_gtfs_feed(feed, "future feed", today=TODAY)
def test_validate_gtfs_feed_calendar_dates_rescues_expired_calendar(
tmp_path: Path,
) -> None:
"""An expired calendar.txt passes if calendar_dates.txt adds service now."""
feed = _make_gtfs(
tmp_path / "feed.zip",
calendar=(
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
"start_date,end_date\n"
"S1,1,1,1,1,1,0,0,20091201,20101224\n"
),
calendar_dates="service_id,date,exception_type\nS1,20260615,1\n",
)
validate_gtfs_feed(feed, "rescued feed", today=TODAY) # must not raise
def test_validate_gtfs_feed_removed_service_exception_does_not_count(
tmp_path: Path,
) -> None:
feed = _make_gtfs(
tmp_path / "feed.zip",
calendar=None,
calendar_dates="service_id,date,exception_type\nS1,20260615,2\n",
)
with pytest.raises(RuntimeError, match="no service active"):
validate_gtfs_feed(feed, "removed-only feed", today=TODAY)
def test_validate_gtfs_feed_zero_and_empty_coords(tmp_path: Path) -> None:
"""The 2010 TfL snapshot's other failure mode: empty or 0,0 stop coords."""
feed = _make_gtfs(
tmp_path / "feed.zip",
stops=(
"stop_id,stop_name,stop_lat,stop_lon\n"
"STOP_A,Nowhere,0,0\n"
"STOP_B,Blank,,\n"
),
)
with pytest.raises(RuntimeError, match=r"plausible UK coordinates"):
validate_gtfs_feed(feed, "coordless feed", today=TODAY)
def test_validate_gtfs_feed_non_uk_coords_fail(tmp_path: Path) -> None:
feed = _make_gtfs(
tmp_path / "feed.zip",
stops=(
"stop_id,stop_name,stop_lat,stop_lon\n"
"STOP_A,New York,40.71,-74.0\n"
"STOP_B,Sydney,-33.87,151.21\n"
),
)
with pytest.raises(RuntimeError, match="plausible UK coordinates"):
validate_gtfs_feed(feed, "abroad feed", today=TODAY)
def test_validate_gtfs_feed_minority_bad_coords_pass(tmp_path: Path) -> None:
"""One bad stop out of 30 (3.3%) stays under the 5% tolerance."""
rows = [f"STOP_{i},Stop {i},51.5,{-0.1 + i * 0.001}\n" for i in range(29)]
rows.append("STOP_BAD,Broken,0,0\n")
feed = _make_gtfs(
tmp_path / "feed.zip",
stops="stop_id,stop_name,stop_lat,stop_lon\n" + "".join(rows),
)
validate_gtfs_feed(feed, "mostly good feed", today=TODAY) # must not raise
def test_validate_gtfs_feed_empty_trips(tmp_path: Path) -> None:
feed = _make_gtfs(tmp_path / "feed.zip", trips="trip_id,route_id,service_id\n")
with pytest.raises(RuntimeError, match="trips.txt has no data rows"):
validate_gtfs_feed(feed, "tripless feed", today=TODAY)
def test_validate_gtfs_feed_missing_calendar_files(tmp_path: Path) -> None:
feed = _make_gtfs(tmp_path / "feed.zip", calendar=None)
with pytest.raises(RuntimeError, match="neither calendar.txt nor calendar_dates"):
validate_gtfs_feed(feed, "calendarless feed", today=TODAY)
def test_validate_gtfs_feed_not_a_zip(tmp_path: Path) -> None:
bogus = tmp_path / "feed.zip"
bogus.write_text("not a zip")
with pytest.raises(RuntimeError, match="not a valid zip"):
validate_gtfs_feed(bogus, "bogus feed", today=TODAY)

View file

@ -2,24 +2,32 @@
Downloads:
- England OSM PBF from Geofabrik (~1.5GB)
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
- TfL TransXChange timetables converted to GTFS
- National Rail CIF timetable converted to GTFS (requires credentials)
- BODS GTFS from Bus Open Data Service (~1.5GB; all England bus/tram/ferry,
plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car)
- National Rail CIF timetable converted to GTFS (requires credentials;
includes the Elizabeth line, TOC "XR")
Then processes for R5 compatibility:
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
- Converts high-frequency metro/tram services to frequency-based GTFS
- Converts TfL TransXChange to GTFS via transxchange2gtfs
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
- Validates every produced GTFS zip (active calendar window, plausible UK
stop coordinates, non-empty routes/trips/stop_times)
Requires: osmium-tool, Node.js (npx), Docker (for national rail)
Note: the legacy TfL TransXChange feed (tfl.gov.uk journey-planner-timetables)
was removed: that URL serves a 2010-10-28 snapshot whose calendars all expired
in 2010 and whose stops have empty/0,0 coordinates, so it contributed zero
service. BODS covers all TfL modes that feed nominally provided.
Requires: osmium-tool, Docker (for national rail)
Output directory: property-data/transit/
raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
raw/england.osm.pbf + bods_gtfs.zip + national_rail_gtfs.zip
"""
import argparse
import csv
import datetime as dt
import io
import json
import os
@ -45,20 +53,18 @@ ENGLAND_PBF_URL = (
# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"
# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
TFL_TRANSXCHANGE_URL = (
"https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
)
# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
# National Rail Open Data API
NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
# GTFS validation: a feed must have service within this many days of the build
# date, and at least this fraction of stops must have plausible UK coordinates.
GTFS_CALENDAR_LOOKAHEAD_DAYS = 60
GTFS_MIN_VALID_STOP_FRACTION = 0.95
UK_LAT_RANGE = (49.0, 61.0)
UK_LON_RANGE = (-9.0, 2.5)
def _download_http(
@ -468,89 +474,175 @@ def convert_high_freq_to_frequency_based(
print(f" Saved to {dst}")
def download_tfl_transxchange(raw_dir: Path) -> Path:
"""Download TfL TransXChange timetable bundle."""
dest = raw_dir / "tfl_transxchange.zip"
if dest.exists():
print(f"TfL TransXChange already exists: {dest}")
return dest
print("Downloading TfL TransXChange timetables...")
_download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip")
return dest
def _gtfs_has_data_row(z: zipfile.ZipFile, filename: str) -> bool:
"""True if a GTFS file has at least one non-empty data row after the header."""
with z.open(filename) as f:
f.readline() # header
for line in f:
if _parse_csv_line(line):
return True
return False
def download_naptan() -> None:
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
dest = local_tmp_dir() / "Stops.csv"
if dest.exists():
print(f"NaPTAN Stops.csv already exists: {dest}")
return
def _calendar_active_in_window(
z: zipfile.ZipFile, names: set[str], window_start: int, window_end: int
) -> bool:
"""True if calendar.txt/calendar_dates.txt have service in [start, end].
print("Downloading NaPTAN stops data...")
_download_http(NAPTAN_URL, dest, desc="Stops.csv")
def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
"""Convert TfL TransXChange to GTFS using transxchange2gtfs."""
dest = output_dir / "tfl_gtfs.zip"
if dest.exists():
print(f"TfL GTFS already exists: {dest}")
return dest
txc_path = raw_dir / "tfl_transxchange.zip"
# Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
download_naptan()
print("Converting TfL TransXChange → GTFS...")
# The shim patches known packaging/runtime issues in the pinned npm package
# before loading its CLI from npx's temporary install.
shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
subprocess.run(
[
"npx",
"--yes",
"--package",
TRANSXCHANGE2GTFS_PACKAGE,
"sh",
"-c",
"\n".join(
[
'bin="$(command -v transxchange2gtfs)"',
'script="$(readlink -f "$bin")"',
'pkg_dir="$(dirname "$(dirname "$script")")"',
'shim="$1"',
"shift",
'exec node "$shim" "$pkg_dir" "$@"',
]
),
"transxchange2gtfs",
str(shim_path.resolve()),
str(txc_path.resolve()),
str(dest.resolve()),
],
check=True,
Dates are compared as YYYYMMDD integers. A calendar.txt row counts when its
date range overlaps the window AND at least one weekday flag is set; a
calendar_dates.txt row counts when it adds service (exception_type=1) on a
date inside the window.
"""
weekdays = (
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sunday",
)
if "calendar.txt" in names:
with z.open("calendar.txt") as f:
cols = _parse_csv_line(f.readline())
try:
start_idx = cols.index("start_date")
end_idx = cols.index("end_date")
except ValueError:
return False
day_idxs = [cols.index(d) for d in weekdays if d in cols]
for line in f:
parts = _parse_csv_line(line)
if not parts:
continue
try:
start = int(parts[start_idx].strip('"'))
end = int(parts[end_idx].strip('"'))
except (ValueError, IndexError):
continue
if start > window_end or end < window_start:
continue
if day_idxs and not any(
parts[i].strip('"') == "1" for i in day_idxs if i < len(parts)
):
continue
return True
if "calendar_dates.txt" in names:
with z.open("calendar_dates.txt") as f:
cols = _parse_csv_line(f.readline())
try:
date_idx = cols.index("date")
exc_idx = cols.index("exception_type")
except ValueError:
return False
for line in f:
parts = _parse_csv_line(line)
if not parts:
continue
try:
date = int(parts[date_idx].strip('"'))
except (ValueError, IndexError):
continue
if exc_idx < len(parts) and parts[exc_idx].strip('"') != "1":
continue
if window_start <= date <= window_end:
return True
return False
def validate_gtfs_feed(path: Path, feed_name: str, *, today: dt.date | None = None) -> None:
"""Sanity-check a produced/downloaded GTFS zip; raise RuntimeError if dead.
Guards against silently shipping a feed that contributes zero service (as
the old TfL dump did: 2010 calendars, empty/0,0 stop coordinates). Checks:
(a) calendar.txt/calendar_dates.txt have at least one service active
within [today, today + GTFS_CALENDAR_LOOKAHEAD_DAYS];
(b) stops.txt is non-empty and >= GTFS_MIN_VALID_STOP_FRACTION of stops
have plausible UK coordinates (lat 49-61, lon -9..2.5, not 0,0);
(c) routes.txt, trips.txt and stop_times.txt each have data rows.
"""
if today is None:
today = dt.date.today()
window_start = int(today.strftime("%Y%m%d"))
window_end = int(
(today + dt.timedelta(days=GTFS_CALENDAR_LOOKAHEAD_DAYS)).strftime("%Y%m%d")
)
def fail(reason: str) -> None:
raise RuntimeError(
f"GTFS validation failed for feed '{feed_name}' ({path}): {reason}"
)
print(f"Validating GTFS feed '{feed_name}'...")
if not path.exists() or not zipfile.is_zipfile(path):
fail("not a valid zip file")
with zipfile.ZipFile(path) as z:
names = set(z.namelist())
# (c) core files present and non-empty
for required in ("routes.txt", "trips.txt", "stop_times.txt", "stops.txt"):
if required not in names:
fail(f"missing {required}")
if not _gtfs_has_data_row(z, required):
fail(f"{required} has no data rows")
# (a) at least one service active in the routing window
if "calendar.txt" not in names and "calendar_dates.txt" not in names:
fail("has neither calendar.txt nor calendar_dates.txt")
if not _calendar_active_in_window(z, names, window_start, window_end):
fail(
f"no service active between {window_start} and {window_end}"
"the feed's calendars are stale/expired and it would contribute "
"zero service to routing"
)
# (b) stops have plausible UK coordinates
total_stops = 0
valid_stops = 0
with z.open("stops.txt") as f:
cols = _parse_csv_line(f.readline())
try:
lat_idx = cols.index("stop_lat")
lon_idx = cols.index("stop_lon")
except ValueError:
fail("stops.txt is missing stop_lat/stop_lon columns")
for line in f:
parts = _parse_csv_line(line)
if not parts:
continue
total_stops += 1
try:
lat = float(parts[lat_idx].strip('"'))
lon = float(parts[lon_idx].strip('"'))
except (ValueError, IndexError):
continue # empty/garbage coordinate → invalid
if lat == 0.0 and lon == 0.0:
continue
if (
UK_LAT_RANGE[0] <= lat <= UK_LAT_RANGE[1]
and UK_LON_RANGE[0] <= lon <= UK_LON_RANGE[1]
):
valid_stops += 1
if total_stops == 0:
fail("stops.txt has no stops")
fraction = valid_stops / total_stops
if fraction < GTFS_MIN_VALID_STOP_FRACTION:
fail(
f"only {valid_stops}/{total_stops} stops "
f"({fraction:.1%}) have plausible UK coordinates "
f"(lat {UK_LAT_RANGE[0]}-{UK_LAT_RANGE[1]}, "
f"lon {UK_LON_RANGE[0]}..{UK_LON_RANGE[1]}, non-null, not 0,0); "
f"need >= {GTFS_MIN_VALID_STOP_FRACTION:.0%}"
)
print(
f" OK: service active in window, {valid_stops}/{total_stops} stops "
f"({fraction:.1%}) with plausible UK coordinates"
)
required_files = {
"agency.txt",
"calendar.txt",
"calendar_dates.txt",
"routes.txt",
"stop_times.txt",
"stops.txt",
"trips.txt",
}
if not dest.exists() or not zipfile.is_zipfile(dest):
raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
with zipfile.ZipFile(dest) as z:
missing = required_files - set(z.namelist())
if missing:
missing_str = ", ".join(sorted(missing))
raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
size_mb = dest.stat().st_size / (1024 * 1024)
print(f" Saved to {dest} ({size_mb:.1f} MB)")
return dest
def download_national_rail_cif(raw_dir: Path) -> Path | None:
@ -1007,18 +1099,15 @@ def main() -> None:
required=True,
help="Output directory for transit data",
)
parser.add_argument(
"--skip-tfl",
action="store_true",
help="Skip TfL TransXChange download and conversion",
)
args = parser.parse_args()
output_dir: Path = args.output
raw_dir = output_dir / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
# 1. Download, clean, and frequency-convert BODS GTFS
# 1. Download, clean, and frequency-convert BODS GTFS. BODS covers all
# England bus/tram/ferry plus London Underground, DLR, London Tramlink and
# the IFS Cloud Cable Car, so no separate TfL feed is needed.
download_osm_pbf(raw_dir)
bods_raw = download_bods_gtfs(raw_dir)
@ -1027,16 +1116,10 @@ def main() -> None:
bods_final = output_dir / "bods_gtfs.zip"
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
validate_gtfs_feed(bods_final, "BODS GTFS")
# 2. TfL TransXChange → GTFS
if args.skip_tfl:
print("Skipping TfL (--skip-tfl)")
else:
download_tfl_transxchange(raw_dir)
convert_tfl_to_gtfs(raw_dir, output_dir)
# 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
# reach the ~2,725 railway-station destinations, so a bus/TfL-only network
# 2. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
# reach the ~2,725 railway-station destinations, so a bus/metro-only network
# silently overstates every train commute. Missing credentials are a HARD
# error, so a rail-less network can never ship.
cif = download_national_rail_cif(raw_dir)
@ -1048,7 +1131,8 @@ def main() -> None:
"required; without it the transit network models every train journey "
"as bus-only and overstates commute times."
)
convert_national_rail_to_gtfs(raw_dir, output_dir)
nr_final = convert_national_rail_to_gtfs(raw_dir, output_dir)
validate_gtfs_feed(nr_final, "National Rail GTFS")
# Summary
print()

View file

@ -1,106 +0,0 @@
#!/usr/bin/env node
"use strict";
const fs = require("fs");
const path = require("path");
const { createRequire } = require("module");
const [pkgDirArg, ...converterArgs] = process.argv.slice(2);
if (!pkgDirArg || converterArgs.length < 2) {
console.error(
"Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
);
process.exit(2);
}
const pkgDir = path.resolve(pkgDirArg);
const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
const localTmpDir =
process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir;
const stopsCsv = path.join(localTmpDir, "Stops.csv");
const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");
const converterTmpPatch =
`static TMP = ${JSON.stringify(converterTmpPrefix)}` +
` + process.pid + ${JSON.stringify(path.sep)};`;
fs.mkdirSync(localTmpDir, { recursive: true });
function replaceOnce(relativePath, before, after) {
const file = path.join(pkgDir, relativePath);
const original = fs.readFileSync(file, "utf8");
if (original.includes(before)) {
fs.writeFileSync(file, original.replace(before, after));
} else if (original.includes(after)) {
return;
} else {
throw new Error(`Could not patch ${relativePath}: expected text not found`);
}
}
// The published 1.12.0 package has a few compatibility issues with current
// TfL TransXChange exports:
// - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
// - the compiled date-holidays import expects a synthetic default export
// - some TfL journeys reference timing links without matching route-link geometry
//
// GTFS shapes are optional for R5 routing. Clear shape references and omit
// shapes.txt so missing route geometry does not drop otherwise usable trips.
function patchPackage() {
replaceOnce(
"dist/Container.js",
"static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
converterTmpPatch,
);
replaceOnce(
"dist/Container.js",
'fs.existsSync("/tmp/Stops.csv")',
`fs.existsSync(${JSON.stringify(stopsCsv)})`,
);
replaceOnce(
"dist/Container.js",
'fs.createReadStream("/tmp/Stops.csv", "utf8")',
`fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
);
replaceOnce(
"dist/converter/GetStopData.js",
'fs.createWriteStream("/tmp/Stops.csv")',
`fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
);
replaceOnce(
"dist/transxchange/TransXChangeJourneyStream.js",
"distanceSoFarM += routeLink.Distance;",
"distanceSoFarM += routeLink ? routeLink.Distance : 0;",
);
replaceOnce(
"dist/gtfs/TripsStream.js",
"(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
"\"\");",
);
replaceOnce(
"dist/gtfs/StopTimesStream.js",
"stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
"\"\", stop.exactTime ? \"1\" : \"0\");",
);
replaceOnce(
"dist/Container.js",
"\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
"\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
);
replaceOnce(
"dist/Container.js",
"\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
"\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
);
}
patchPackage();
const pkgRequire = createRequire(path.join(pkgDir, "package.json"));
const Holidays = pkgRequire("date-holidays");
if (!Holidays.default) {
Holidays.default = Holidays;
}
process.argv = [process.argv[0], "transxchange2gtfs", ...converterArgs];
require(path.join(pkgDir, "dist", "cli.js"));