Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
93
pipeline/download/lsoa_children.py
Normal file
93
pipeline/download/lsoa_children.py
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
"""Download Census 2021 children by five-year age band per LSOA.
|
||||
|
||||
Source: NOMIS (ONS Census 2021 — TS007A dataset, age by five-year bands)
|
||||
License: Open Government Licence v3.0
|
||||
|
||||
Used to estimate how many primary-age (4-10) and secondary-age (11-15)
|
||||
children live in each LSOA, which drives the school catchment model. Census
|
||||
bands don't align with school phases, so phase totals take fractional shares
|
||||
of the 0-4, 10-14 and 15-19 bands (one fifth per single year of age).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
|
||||
# NOMIS API: Census 2021 TS007A (age, five-year bands) by LSOA 2021 (TYPE151).
|
||||
# c2021_age_19 codes: 1 = 0-4, 2 = 5-9, 3 = 10-14, 4 = 15-19.
|
||||
# NOMIS paginates at 25,000 rows by default, so we paginate with recordoffset.
|
||||
BASE_URL = (
|
||||
"https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv"
|
||||
"?date=latest&geography=TYPE151&measures=20100&c2021_age_19=1,2,3,4"
|
||||
"&select=GEOGRAPHY_CODE,C2021_AGE_19,OBS_VALUE"
|
||||
)
|
||||
PAGE_SIZE = 25000
|
||||
|
||||
AGE_BAND_COLUMNS = {
|
||||
1: "aged_0_4",
|
||||
2: "aged_5_9",
|
||||
3: "aged_10_14",
|
||||
4: "aged_15_19",
|
||||
}
|
||||
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
|
||||
print("Downloading Census 2021 LSOA age bands from NOMIS...")
|
||||
frames = []
|
||||
offset = 0
|
||||
while True:
|
||||
url = f"{BASE_URL}&recordoffset={offset}"
|
||||
response = httpx.get(url, follow_redirects=True, timeout=120)
|
||||
response.raise_for_status()
|
||||
if len(response.content) == 0:
|
||||
break
|
||||
chunk = pl.read_csv(BytesIO(response.content))
|
||||
if chunk.height == 0:
|
||||
break
|
||||
frames.append(chunk)
|
||||
print(f" Fetched {chunk.height} rows (offset={offset})")
|
||||
if chunk.height < PAGE_SIZE:
|
||||
break
|
||||
offset += PAGE_SIZE
|
||||
|
||||
df = pl.concat(frames)
|
||||
print(f"Total rows: {df.height}")
|
||||
|
||||
result = (
|
||||
df.rename({"GEOGRAPHY_CODE": "lsoa21"})
|
||||
.pivot(on="C2021_AGE_19", index="lsoa21", values="OBS_VALUE")
|
||||
.rename({str(code): name for code, name in AGE_BAND_COLUMNS.items()})
|
||||
.with_columns(pl.col(name).cast(pl.UInt32) for name in AGE_BAND_COLUMNS.values())
|
||||
.filter(pl.col("lsoa21").str.starts_with("E"))
|
||||
.sort("lsoa21")
|
||||
)
|
||||
|
||||
missing = [c for c in AGE_BAND_COLUMNS.values() if c not in result.columns]
|
||||
if missing:
|
||||
raise ValueError(f"NOMIS response missing age bands: {missing}")
|
||||
|
||||
print(f"England LSOAs: {result.height}")
|
||||
for name in AGE_BAND_COLUMNS.values():
|
||||
print(f" {name}: total {result[name].sum():,}")
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
result.write_parquet(output_path, compression="zstd")
|
||||
print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Download Census 2021 age bands (children) by LSOA"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
download_and_convert(args.output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -12,8 +12,18 @@ import polars as pl
|
|||
|
||||
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
|
||||
TUBE_STATION_CATEGORY = "Tube station"
|
||||
TRAM_METRO_CATEGORY = "Tram & Metro stop"
|
||||
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01
|
||||
|
||||
# London Underground ATCO codes are "<area><kind>ZZLU<station>": a 3-digit
|
||||
# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
|
||||
# LU stations outside Greater London such as Epping or Amersham), then "0"
|
||||
# (platform/entrance node) or "G" (station group node), then the system code.
|
||||
# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
|
||||
# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
|
||||
# WM Metro, Blackpool Tramway, heritage railways, ...).
|
||||
LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"
|
||||
|
||||
|
||||
STOP_TYPES = {
|
||||
"AIR": "Airport",
|
||||
|
|
@ -25,25 +35,110 @@ STOP_TYPES = {
|
|||
"RLY": "Rail station",
|
||||
"RSE": "Rail station",
|
||||
"BCT": "Bus stop",
|
||||
# Bus/coach stations: BST is the station access-area node, BCS/BCQ are
|
||||
# bays/stands within the station and BCE is a station entrance. NaPTAN maps
|
||||
# very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
|
||||
# so sparse that 20% of England showed the nearest bus station >100km away.
|
||||
# Bays and entrances collapse to one POI per station via
|
||||
# STATION_MERGE_CATEGORIES below.
|
||||
"BST": "Bus station",
|
||||
"BCS": "Bus station",
|
||||
"BCQ": "Bus station",
|
||||
"BCE": "Bus station",
|
||||
"TXR": "Taxi rank",
|
||||
"TMU": "Tube station",
|
||||
"MET": "Tube station",
|
||||
# Tram/Metro/Underground: TMU is an entrance node, MET the station access
|
||||
# area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
|
||||
# mark them as London Underground (ZZLU) are reclassified to "Tube station"
|
||||
# after dedup (see _deduplicate_station_areas). Heritage railways (RHDR,
|
||||
# Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
|
||||
# "heritage" flag, so they remain in "Tram & Metro stop".
|
||||
"TMU": TRAM_METRO_CATEGORY,
|
||||
"MET": TRAM_METRO_CATEGORY,
|
||||
}
|
||||
|
||||
# Stop types that are access/entrance nodes rather than the primary station or
|
||||
# terminal node. During dedup the primary node (e.g. RLY/FER) wins so a station
|
||||
# with both a station node and entrances yields one POI at the station node.
|
||||
ENTRANCE_STOP_TYPES = {"RSE", "FTD"}
|
||||
# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a
|
||||
# station with both a station node and entrances yields one POI at the station
|
||||
# node.
|
||||
ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}
|
||||
|
||||
# Categories whose entrances/variants are merged into a single station-level POI
|
||||
# by normalized name + area (like Tube stations), so an RLY node and its RSE
|
||||
# entrances collapse to one POI at the station node.
|
||||
STATION_MERGE_CATEGORIES = {TUBE_STATION_CATEGORY, "Rail station", "Ferry"}
|
||||
STATION_MERGE_CATEGORIES = {
|
||||
TRAM_METRO_CATEGORY,
|
||||
TUBE_STATION_CATEGORY,
|
||||
"Rail station",
|
||||
"Ferry",
|
||||
"Bus station",
|
||||
}
|
||||
|
||||
|
||||
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
|
||||
|
||||
# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
|
||||
# "West Station Entrance", ...) are stripped from canonical names so a
|
||||
# station's individually-named entrance nodes collapse into the station.
|
||||
# A trailing run of filler words is only stripped when it contains at least
|
||||
# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
|
||||
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
|
||||
_ENTRANCE_FILLER_WORDS = {
|
||||
"north",
|
||||
"south",
|
||||
"east",
|
||||
"west",
|
||||
"ne",
|
||||
"nw",
|
||||
"se",
|
||||
"sw",
|
||||
"n",
|
||||
"s",
|
||||
"e",
|
||||
"w",
|
||||
"wt",
|
||||
"main",
|
||||
"side",
|
||||
"no",
|
||||
"station",
|
||||
"stop",
|
||||
"platform",
|
||||
}
|
||||
|
||||
_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
|
||||
_ENTRANCE_FILLER_RE = (
|
||||
r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
|
||||
r"|platform|\d+)"
|
||||
)
|
||||
_ENTRANCE_SUFFIX_RE = (
|
||||
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
|
||||
rf"\s+{_ENTRANCE_WORDS_RE}"
|
||||
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
|
||||
)
|
||||
|
||||
# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5") are
|
||||
# stripped so every bay of one station shares a canonical name. The designator
|
||||
# word must be followed by a short alphanumeric token, so place names ending in
|
||||
# a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
|
||||
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
|
||||
_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"
|
||||
|
||||
|
||||
def _strip_entrance_suffix(words: list[str]) -> list[str]:
|
||||
"""Drop a trailing entrance designator (direction/number filler around an
|
||||
entrance word) from a tokenized stop name; no-op when no entrance word."""
|
||||
idx = len(words)
|
||||
saw_entrance = False
|
||||
while idx > 0:
|
||||
word = words[idx - 1]
|
||||
if word in _ENTRANCE_NAME_WORDS:
|
||||
saw_entrance = True
|
||||
elif word.isdigit() or word in _ENTRANCE_FILLER_WORDS:
|
||||
pass
|
||||
else:
|
||||
break
|
||||
idx -= 1
|
||||
return words[:idx] if saw_entrance else words
|
||||
|
||||
|
||||
def canonical_station_name(name: str | None) -> str:
|
||||
"""Normalize station names so entrances/transport-mode variants collapse."""
|
||||
|
|
@ -55,18 +150,24 @@ def canonical_station_name(name: str | None) -> str:
|
|||
normalized = re.sub(r"['’`]", "", normalized)
|
||||
normalized = normalized.replace("&", " and ")
|
||||
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
|
||||
words = normalized.split()
|
||||
words = _strip_entrance_suffix(normalized.split())
|
||||
|
||||
if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
|
||||
del words[-2:]
|
||||
|
||||
suffixes = (
|
||||
("underground", "station"),
|
||||
("tube", "station"),
|
||||
("dlr", "station"),
|
||||
("metro", "station"),
|
||||
("metrolink", "station"),
|
||||
("metrolink", "stop"),
|
||||
("tram", "stop"),
|
||||
("rail", "station"),
|
||||
("railway", "station"),
|
||||
("station",),
|
||||
("stop",),
|
||||
("metrolink",),
|
||||
)
|
||||
while True:
|
||||
suffix = next(
|
||||
|
|
@ -88,11 +189,14 @@ def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
|
|||
expr = expr.str.replace_all(r"&", " and ")
|
||||
expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
|
||||
expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()
|
||||
expr = expr.str.replace_all(_ENTRANCE_SUFFIX_RE, "")
|
||||
expr = expr.str.replace_all(_BAY_SUFFIX_RE, "")
|
||||
expr = expr.str.replace_all(
|
||||
r"\s+(underground|tube|dlr|metro|rail|railway)\s+station$", ""
|
||||
r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$", ""
|
||||
)
|
||||
expr = expr.str.replace_all(r"\s+tram\s+stop$", "")
|
||||
expr = expr.str.replace_all(r"\s+(metrolink|tram)\s+stop$", "")
|
||||
expr = expr.str.replace_all(r"\s+(station|stop)$", "")
|
||||
expr = expr.str.replace_all(r"\s+metrolink$", "")
|
||||
return expr.str.strip_chars()
|
||||
|
||||
|
||||
|
|
@ -140,6 +244,7 @@ class StationAccumulator:
|
|||
lat_sum: float
|
||||
lng_sum: float
|
||||
entrance: bool = False
|
||||
is_lu: bool = False
|
||||
count: int = 1
|
||||
|
||||
@property
|
||||
|
|
@ -159,6 +264,7 @@ class StationAccumulator:
|
|||
self.lat_sum += float(row["lat"])
|
||||
self.lng_sum += float(row["lng"])
|
||||
self.count += 1
|
||||
self.is_lu = self.is_lu or bool(row.get("is_lu"))
|
||||
|
||||
name = str(row["name"] or "")
|
||||
entrance = bool(row.get("entrance"))
|
||||
|
|
@ -169,6 +275,16 @@ class StationAccumulator:
|
|||
self.name = name
|
||||
self.entrance = entrance
|
||||
|
||||
@property
|
||||
def output_category(self) -> str:
|
||||
# A merged tram/metro station is a genuine Tube station when ANY of its
|
||||
# constituent nodes carries a London Underground ATCO code. Checking
|
||||
# the whole group (not just the winning node) matters because LU
|
||||
# entrance nodes often carry non-ZZLU codes (e.g. 4900VICT...).
|
||||
if self.category == TRAM_METRO_CATEGORY and self.is_lu:
|
||||
return TUBE_STATION_CATEGORY
|
||||
return self.category
|
||||
|
||||
|
||||
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
|
||||
return StationAccumulator(
|
||||
|
|
@ -178,6 +294,7 @@ def _station_from_row(row: dict[str, object]) -> StationAccumulator:
|
|||
lat_sum=float(row["lat"]),
|
||||
lng_sum=float(row["lng"]),
|
||||
entrance=bool(row.get("entrance")),
|
||||
is_lu=bool(row.get("is_lu")),
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -217,7 +334,7 @@ def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
|
|||
{
|
||||
"id": [station.id for station in selected],
|
||||
"name": [station.name for station in selected],
|
||||
"category": [station.category for station in selected],
|
||||
"category": [station.output_category for station in selected],
|
||||
"lat": [station.lat for station in selected],
|
||||
"lng": [station.lng for station in selected],
|
||||
}
|
||||
|
|
@ -258,10 +375,12 @@ def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
|
|||
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Deduplicate NaPTAN stops, merging station/terminal entrances by area.
|
||||
|
||||
Tube, rail and ferry POIs are merged to one record per station by
|
||||
normalized name + area, with the primary station/terminal node (e.g. RLY,
|
||||
FER) winning over an entrance node (RSE, FTD). Other stops are deduplicated
|
||||
by exact name+category+locality.
|
||||
Tram/metro, rail, ferry and bus-station POIs are merged to one record per
|
||||
station by normalized name + area, with the primary station/terminal node
|
||||
(e.g. RLY, FER, MET, BST) winning over an entrance node (RSE, FTD, TMU,
|
||||
BCE). Merged tram/metro stations with a London Underground ATCO code in
|
||||
the group become "Tube station". Other stops are deduplicated by exact
|
||||
name+category+locality.
|
||||
"""
|
||||
station = df.filter(pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
|
||||
other = df.filter(~pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
|
||||
|
|
@ -274,6 +393,29 @@ def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
|
|||
).select(OUTPUT_COLUMNS)
|
||||
|
||||
|
||||
def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Keep only active NaPTAN stops.
|
||||
|
||||
The NaPTAN export's Status column marks stops as active/inactive/pending;
|
||||
without this filter closed stations ("(closed)", "not in use") ship as
|
||||
live POIs. Rows with a null Status are kept (benefit of the doubt); a
|
||||
missing column is tolerated so older extracts still load.
|
||||
"""
|
||||
if "Status" not in df.columns:
|
||||
print("WARNING: NaPTAN data has no Status column; keeping all stops")
|
||||
return df
|
||||
|
||||
before = len(df)
|
||||
df = df.filter(
|
||||
pl.col("Status").is_null()
|
||||
| pl.col("Status").str.strip_chars().str.to_lowercase().is_in(["active", "act"])
|
||||
)
|
||||
dropped = before - len(df)
|
||||
if dropped:
|
||||
print(f"Dropped {dropped:,} non-active stops (Status != active)")
|
||||
return df
|
||||
|
||||
|
||||
def download_naptan(output: Path) -> None:
|
||||
output.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
|
@ -291,15 +433,19 @@ def download_naptan(output: Path) -> None:
|
|||
)
|
||||
.drop_nulls(subset=["Latitude", "Longitude"])
|
||||
.filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
|
||||
.select(
|
||||
pl.col("ATCOCode").alias("id"),
|
||||
pl.col("CommonName").alias("name"),
|
||||
pl.col("StopType").replace(STOP_TYPES).alias("category"),
|
||||
pl.col("Latitude").alias("lat"),
|
||||
pl.col("Longitude").alias("lng"),
|
||||
pl.col("NptgLocalityCode").alias("locality"),
|
||||
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
|
||||
)
|
||||
)
|
||||
df = filter_active_stops(df).select(
|
||||
pl.col("ATCOCode").alias("id"),
|
||||
pl.col("CommonName").alias("name"),
|
||||
pl.col("StopType").replace(STOP_TYPES).alias("category"),
|
||||
pl.col("Latitude").alias("lat"),
|
||||
pl.col("Longitude").alias("lng"),
|
||||
pl.col("NptgLocalityCode").alias("locality"),
|
||||
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
|
||||
pl.col("ATCOCode")
|
||||
.str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
|
||||
.fill_null(False)
|
||||
.alias("is_lu"),
|
||||
)
|
||||
|
||||
before = len(df)
|
||||
|
|
|
|||
|
|
@ -2,12 +2,15 @@
|
|||
|
||||
Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
|
||||
access point locations (park entrances). Each access point is tagged with
|
||||
its parent site's function type (e.g. Public Park Or Garden). Sites without
|
||||
access points fall back to polygon centroids.
|
||||
its parent site's function type (e.g. Public Park Or Garden), the parent
|
||||
site id and the site's polygon centroid. Sites without access points fall
|
||||
back to polygon centroids.
|
||||
|
||||
Using access points rather than polygon centroids gives much more accurate
|
||||
distance calculations — a property next to Hyde Park won't show 400m just
|
||||
because the centroid is in the middle of the park.
|
||||
because the centroid is in the middle of the park. The site id / centroid
|
||||
columns let downstream consumers (poi_proximity) collapse the frame back to
|
||||
one row per SITE for counting, so a park with 30 gates counts as one park.
|
||||
|
||||
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
|
||||
License: Open Government Licence v3.0
|
||||
|
|
@ -65,8 +68,8 @@ def _read_site_functions(shp_path: Path) -> dict[str, str]:
|
|||
|
||||
def _read_access_points(
|
||||
shp_path: Path, site_funcs: dict[str, str]
|
||||
) -> tuple[list[float], list[float], list[str]]:
|
||||
"""Read access points, tagging each with its parent site's function."""
|
||||
) -> tuple[list[float], list[float], list[str], list[str]]:
|
||||
"""Read access points, tagging each with its parent site's function and id."""
|
||||
reader = shp.Reader(str(shp_path), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
|
||||
|
|
@ -80,6 +83,7 @@ def _read_access_points(
|
|||
lats: list[float] = []
|
||||
lngs: list[float] = []
|
||||
categories: list[str] = []
|
||||
site_ids: list[str] = []
|
||||
skipped = 0
|
||||
error_skipped = 0
|
||||
|
||||
|
|
@ -107,6 +111,7 @@ def _read_access_points(
|
|||
lats.append(lat)
|
||||
lngs.append(lng)
|
||||
categories.append(func)
|
||||
site_ids.append(str(site_id))
|
||||
|
||||
if skipped:
|
||||
print(f" Skipped {skipped:,} access points with unknown site ID")
|
||||
|
|
@ -116,31 +121,26 @@ def _read_access_points(
|
|||
error_skipped,
|
||||
)
|
||||
|
||||
return lats, lngs, categories
|
||||
return lats, lngs, categories, site_ids
|
||||
|
||||
|
||||
def _read_site_centroids(
|
||||
shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
|
||||
) -> tuple[list[float], list[float], list[str]]:
|
||||
"""Read polygon centroids for sites that have no access points (fallback)."""
|
||||
def _read_site_centroids(shp_path: Path) -> dict[str, tuple[float, float]]:
|
||||
"""Compute the WGS84 polygon centroid of every greenspace site.
|
||||
|
||||
Used both as the representative point for site-level counting and as the
|
||||
location fallback for sites that have no access points.
|
||||
"""
|
||||
reader = shp.Reader(str(shp_path), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
id_idx = _find_field(field_names, "id")
|
||||
func_idx = _find_field(field_names, "funct")
|
||||
if id_idx is None or func_idx is None:
|
||||
return [], [], []
|
||||
if id_idx is None:
|
||||
return {}
|
||||
|
||||
lats: list[float] = []
|
||||
lngs: list[float] = []
|
||||
categories: list[str] = []
|
||||
centroids: dict[str, tuple[float, float]] = {}
|
||||
error_skipped = 0
|
||||
|
||||
for sr in reader.shapeRecords():
|
||||
site_id = sr.record[id_idx]
|
||||
if site_id in covered_ids:
|
||||
continue
|
||||
|
||||
func = sr.record[func_idx]
|
||||
try:
|
||||
geom = to_shapely(sr.shape.__geo_interface__)
|
||||
if geom.is_empty or not geom.is_valid:
|
||||
|
|
@ -156,9 +156,7 @@ def _read_site_centroids(
|
|||
)
|
||||
continue
|
||||
|
||||
lats.append(lat)
|
||||
lngs.append(lng)
|
||||
categories.append(func)
|
||||
centroids[str(site_id)] = (lat, lng)
|
||||
|
||||
if error_skipped:
|
||||
logger.warning(
|
||||
|
|
@ -166,7 +164,7 @@ def _read_site_centroids(
|
|||
error_skipped,
|
||||
)
|
||||
|
||||
return lats, lngs, categories
|
||||
return centroids
|
||||
|
||||
|
||||
def download_greenspace(output: Path) -> None:
|
||||
|
|
@ -194,33 +192,53 @@ def download_greenspace(output: Path) -> None:
|
|||
|
||||
# Step 2: Read access points (primary — park entrances)
|
||||
print(f"Reading {access_shps[0].name}...")
|
||||
ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
|
||||
ap_lats, ap_lngs, ap_cats, ap_site_ids = _read_access_points(
|
||||
access_shps[0], site_funcs
|
||||
)
|
||||
print(f" {len(ap_lats):,} access points loaded")
|
||||
|
||||
# Step 3: Fall back to centroids for sites without any access points
|
||||
covered_ids = set()
|
||||
reader = shp.Reader(str(access_shps[0]), encoding="latin-1")
|
||||
field_names = [f[0] for f in reader.fields[1:]]
|
||||
ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
|
||||
if ref_idx is not None:
|
||||
for rec in reader.iterRecords():
|
||||
covered_ids.add(rec[ref_idx])
|
||||
# Step 3: Compute every site's centroid: the representative point for
|
||||
# site-level counting, and the location fallback for sites without any
|
||||
# access points.
|
||||
print("Computing site centroids...")
|
||||
centroids = _read_site_centroids(site_shps[0])
|
||||
print(f" {len(centroids):,} site centroids computed")
|
||||
|
||||
print("Adding centroids for sites without access points...")
|
||||
fb_lats, fb_lngs, fb_cats = _read_site_centroids(
|
||||
site_shps[0], site_funcs, covered_ids
|
||||
)
|
||||
covered_ids = set(ap_site_ids)
|
||||
fb_lats: list[float] = []
|
||||
fb_lngs: list[float] = []
|
||||
fb_cats: list[str] = []
|
||||
fb_site_ids: list[str] = []
|
||||
for site_id, (lat, lng) in centroids.items():
|
||||
if site_id in covered_ids:
|
||||
continue
|
||||
func = site_funcs.get(site_id)
|
||||
if func is None:
|
||||
continue
|
||||
fb_lats.append(lat)
|
||||
fb_lngs.append(lng)
|
||||
fb_cats.append(func)
|
||||
fb_site_ids.append(site_id)
|
||||
print(f" {len(fb_lats):,} centroid fallbacks added")
|
||||
|
||||
lats = ap_lats + fb_lats
|
||||
lngs = ap_lngs + fb_lngs
|
||||
categories = ap_cats + fb_cats
|
||||
site_ids = ap_site_ids + fb_site_ids
|
||||
site_lats = [centroids.get(site_id, (None, None))[0] for site_id in site_ids]
|
||||
site_lngs = [centroids.get(site_id, (None, None))[1] for site_id in site_ids]
|
||||
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"lat": np.array(lats, dtype=np.float64),
|
||||
"lng": np.array(lngs, dtype=np.float64),
|
||||
"category": categories,
|
||||
"site_id": site_ids,
|
||||
# Site polygon centroid (null when the centroid could not be
|
||||
# computed): the representative point when collapsing to one row
|
||||
# per site for counting.
|
||||
"site_lat": pl.Series(site_lats, dtype=pl.Float64),
|
||||
"site_lng": pl.Series(site_lngs, dtype=pl.Float64),
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -641,7 +641,7 @@ def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
|
|||
match = _DLR_CODE_RE.search(atco_id)
|
||||
if not match:
|
||||
continue
|
||||
if row["category"] not in {"Tube station", "Rail station"}:
|
||||
if row["category"] not in {"Tube station", "Tram & Metro stop", "Rail station"}:
|
||||
continue
|
||||
|
||||
code = match.group(1)
|
||||
|
|
|
|||
|
|
@ -2,9 +2,12 @@ import polars as pl
|
|||
import pytest
|
||||
|
||||
from pipeline.download.naptan import (
|
||||
TRAM_METRO_CATEGORY,
|
||||
TUBE_STATION_CATEGORY,
|
||||
canonical_station_name,
|
||||
canonical_station_name_expr,
|
||||
deduplicate_naptan,
|
||||
filter_active_stops,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -34,6 +37,127 @@ def test_canonical_station_name_expr_normalizes_transport_suffixes():
|
|||
assert [canonical_station_name(name) for name in names] == result
|
||||
|
||||
|
||||
def test_canonical_station_name_strips_entrance_suffixes():
|
||||
# Real shipped NaPTAN entrance names that previously failed to merge with
|
||||
# their station node (79 stray entrance POIs).
|
||||
cases = {
|
||||
"Weaste Metrolink Station North East Entrance": "weaste",
|
||||
"Weaste Metrolink Station North Entrance No 2": "weaste",
|
||||
"Whitefield Metrolink Station Main Entrance": "whitefield",
|
||||
"Radcliffe Metrolink Station Entrance": "radcliffe",
|
||||
"Stretford Metrolink Station Wt Platform Entrance": "stretford",
|
||||
"Salford Quays Metrolink Station SW entrance": "salford quays",
|
||||
"Bank Station Ent 2": "bank",
|
||||
"Hainault": "hainault",
|
||||
# The Metrolink MET node names collapse to the same key.
|
||||
"Weaste (Manchester Metrolink)": "weaste",
|
||||
# No entrance word: direction/filler words must NOT be stripped.
|
||||
"Maze Hill North": "maze hill north",
|
||||
"Bus Station Entrance": "bus",
|
||||
# Bus-station bay/stand designators collapse to the station name…
|
||||
"Tonypandy Bus Station Stand A3": "tonypandy bus",
|
||||
"Caerphilly Interchange Stand 5": "caerphilly interchange",
|
||||
"Stanley Bus Station Stand G": "stanley bus",
|
||||
# …but a bare trailing "Bay" (place names) is untouched.
|
||||
"Colwyn Bay": "colwyn bay",
|
||||
}
|
||||
for name, expected in cases.items():
|
||||
assert canonical_station_name(name) == expected, name
|
||||
|
||||
df = pl.DataFrame({"name": list(cases.keys())})
|
||||
expr_result = df.select(canonical_station_name_expr().alias("key"))["key"].to_list()
|
||||
assert expr_result == list(cases.values())
|
||||
|
||||
|
||||
def test_filter_active_stops_drops_non_active():
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"ATCOCode": ["a", "b", "c", "d"],
|
||||
"Status": ["active", "inactive", None, "Pending"],
|
||||
}
|
||||
)
|
||||
|
||||
result = filter_active_stops(df)
|
||||
|
||||
# Active and unknown (null) statuses survive; inactive/pending are dropped.
|
||||
assert result["ATCOCode"].to_list() == ["a", "c"]
|
||||
|
||||
|
||||
def test_filter_active_stops_tolerates_missing_status_column():
|
||||
df = pl.DataFrame({"ATCOCode": ["a"]})
|
||||
|
||||
assert filter_active_stops(df)["ATCOCode"].to_list() == ["a"]
|
||||
|
||||
|
||||
def test_deduplicate_naptan_splits_london_underground_from_tram_metro():
|
||||
# MET station nodes plus TMU entrances, pre-categorised as the tram/metro
|
||||
# family. The Hainault group contains a 940GZZLU station node, so the
|
||||
# merged POI is a genuine "Tube station" even though its entrance carries a
|
||||
# non-ZZLU ATCO code; the Metrolink group stays "Tram & Metro stop".
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"id": [
|
||||
"940GZZLUHLT",
|
||||
"490000095003",
|
||||
"9400ZZMAWST",
|
||||
"1800NFR2691",
|
||||
],
|
||||
"name": [
|
||||
"Hainault Underground Station",
|
||||
"Hainault",
|
||||
"Weaste (Manchester Metrolink)",
|
||||
"Weaste Metrolink Station North West Entrance",
|
||||
],
|
||||
"category": [TRAM_METRO_CATEGORY] * 4,
|
||||
"lat": [51.6034, 51.6037, 53.4826, 53.4826],
|
||||
"lng": [0.0933, 0.0931, -2.3087, -2.3086],
|
||||
"locality": [None, None, None, None],
|
||||
"entrance": [False, True, False, True],
|
||||
"is_lu": [True, False, False, False],
|
||||
}
|
||||
)
|
||||
|
||||
result = deduplicate_naptan(df).sort("category")
|
||||
|
||||
assert len(result) == 2
|
||||
assert result["category"].to_list() == [
|
||||
TRAM_METRO_CATEGORY,
|
||||
TUBE_STATION_CATEGORY,
|
||||
]
|
||||
tube = result.filter(pl.col("category") == TUBE_STATION_CATEGORY)
|
||||
# The station node (not the entrance) represents the merged POI.
|
||||
assert tube["id"][0] == "940GZZLUHLT"
|
||||
tram = result.filter(pl.col("category") == TRAM_METRO_CATEGORY)
|
||||
assert tram["id"][0] == "9400ZZMAWST"
|
||||
|
||||
|
||||
def test_deduplicate_naptan_merges_bus_station_bays_and_entrances():
|
||||
# BCS bays and a BCE entrance of one bus station collapse to a single POI
|
||||
# represented by a non-entrance node; a different bus station in another
|
||||
# area survives separately.
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"id": ["bay-1", "bay-2", "ent-1", "other"],
|
||||
"name": [
|
||||
"Bury Interchange",
|
||||
"Bury Interchange",
|
||||
"Bury Interchange East Entrance",
|
||||
"Rochdale Interchange",
|
||||
],
|
||||
"category": ["Bus station"] * 4,
|
||||
"lat": [53.5907, 53.5908, 53.5909, 53.6160],
|
||||
"lng": [-2.2958, -2.2957, -2.2956, -2.1561],
|
||||
"locality": ["BURY", "BURY", "BURY", "ROCHDALE"],
|
||||
"entrance": [False, False, True, False],
|
||||
}
|
||||
)
|
||||
|
||||
result = deduplicate_naptan(df).sort("name")
|
||||
|
||||
assert result["name"].to_list() == ["Bury Interchange", "Rochdale Interchange"]
|
||||
assert result.filter(pl.col("name") == "Bury Interchange")["id"][0] == "bay-1"
|
||||
|
||||
|
||||
def test_deduplicate_naptan_merges_tube_station_variants_by_area():
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
|
|
|
|||
|
|
@ -86,7 +86,7 @@ def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
|
|||
"Bank",
|
||||
],
|
||||
"category": [
|
||||
"Tube station",
|
||||
"Tram & Metro stop",
|
||||
"Tube station",
|
||||
"Rail station",
|
||||
"Bus stop",
|
||||
|
|
|
|||
|
|
@ -1,11 +1,15 @@
|
|||
"""Tests for transit_network GTFS processing."""
|
||||
|
||||
import datetime as dt
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pipeline.download.transit_network import convert_high_freq_to_frequency_based
|
||||
from pipeline.download.transit_network import (
|
||||
convert_high_freq_to_frequency_based,
|
||||
validate_gtfs_feed,
|
||||
)
|
||||
|
||||
|
||||
def _write_gtfs(path: Path, *, stop_times: str) -> None:
|
||||
|
|
@ -77,3 +81,162 @@ def test_raises_when_no_first_stops_found(tmp_path: Path) -> None:
|
|||
|
||||
with pytest.raises(RuntimeError, match="no first stops"):
|
||||
convert_high_freq_to_frequency_based(src, dst)
|
||||
|
||||
|
||||
# ── validate_gtfs_feed ────────────────────────────────────────────────────────
|
||||
|
||||
TODAY = dt.date(2026, 6, 10)
|
||||
|
||||
|
||||
def _make_gtfs(
|
||||
path: Path,
|
||||
*,
|
||||
calendar: str | None = (
|
||||
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
|
||||
"start_date,end_date\n"
|
||||
"S1,1,1,1,1,1,0,0,20260101,20271231\n"
|
||||
),
|
||||
calendar_dates: str | None = None,
|
||||
stops: str = (
|
||||
"stop_id,stop_name,stop_lat,stop_lon\n"
|
||||
"STOP_A,Bank,51.5133,-0.0886\n"
|
||||
"STOP_B,Liverpool Street,51.5178,-0.0823\n"
|
||||
),
|
||||
routes: str = "route_id,agency_id,route_short_name,route_type\nR1,OP1,Central,1\n",
|
||||
trips: str = "trip_id,route_id,service_id\nT1,R1,S1\n",
|
||||
stop_times: str = (
|
||||
"trip_id,stop_sequence,departure_time,stop_id\n"
|
||||
"T1,0,06:00:00,STOP_A\n"
|
||||
"T1,1,06:02:00,STOP_B\n"
|
||||
),
|
||||
) -> Path:
|
||||
"""Write a tiny synthetic GTFS zip; defaults form a valid current feed."""
|
||||
with zipfile.ZipFile(path, "w") as z:
|
||||
if calendar is not None:
|
||||
z.writestr("calendar.txt", calendar)
|
||||
if calendar_dates is not None:
|
||||
z.writestr("calendar_dates.txt", calendar_dates)
|
||||
z.writestr("stops.txt", stops)
|
||||
z.writestr("routes.txt", routes)
|
||||
z.writestr("trips.txt", trips)
|
||||
z.writestr("stop_times.txt", stop_times)
|
||||
return path
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_happy_path(tmp_path: Path) -> None:
|
||||
feed = _make_gtfs(tmp_path / "feed.zip")
|
||||
validate_gtfs_feed(feed, "test feed", today=TODAY) # must not raise
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_expired_calendar(tmp_path: Path) -> None:
|
||||
"""The 2010 TfL snapshot failure mode: all calendars ended years ago."""
|
||||
feed = _make_gtfs(
|
||||
tmp_path / "feed.zip",
|
||||
calendar=(
|
||||
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
|
||||
"start_date,end_date\n"
|
||||
"S1,1,1,1,1,1,0,0,20091201,20101224\n"
|
||||
),
|
||||
)
|
||||
with pytest.raises(RuntimeError, match=r"'stale tfl'.*no service active"):
|
||||
validate_gtfs_feed(feed, "stale tfl", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_calendar_starting_after_window_fails(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
feed = _make_gtfs(
|
||||
tmp_path / "feed.zip",
|
||||
calendar=(
|
||||
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
|
||||
"start_date,end_date\n"
|
||||
"S1,1,1,1,1,1,0,0,20270101,20271231\n"
|
||||
),
|
||||
)
|
||||
with pytest.raises(RuntimeError, match="no service active"):
|
||||
validate_gtfs_feed(feed, "future feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_calendar_dates_rescues_expired_calendar(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
"""An expired calendar.txt passes if calendar_dates.txt adds service now."""
|
||||
feed = _make_gtfs(
|
||||
tmp_path / "feed.zip",
|
||||
calendar=(
|
||||
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
|
||||
"start_date,end_date\n"
|
||||
"S1,1,1,1,1,1,0,0,20091201,20101224\n"
|
||||
),
|
||||
calendar_dates="service_id,date,exception_type\nS1,20260615,1\n",
|
||||
)
|
||||
validate_gtfs_feed(feed, "rescued feed", today=TODAY) # must not raise
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_removed_service_exception_does_not_count(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
feed = _make_gtfs(
|
||||
tmp_path / "feed.zip",
|
||||
calendar=None,
|
||||
calendar_dates="service_id,date,exception_type\nS1,20260615,2\n",
|
||||
)
|
||||
with pytest.raises(RuntimeError, match="no service active"):
|
||||
validate_gtfs_feed(feed, "removed-only feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_zero_and_empty_coords(tmp_path: Path) -> None:
|
||||
"""The 2010 TfL snapshot's other failure mode: empty or 0,0 stop coords."""
|
||||
feed = _make_gtfs(
|
||||
tmp_path / "feed.zip",
|
||||
stops=(
|
||||
"stop_id,stop_name,stop_lat,stop_lon\n"
|
||||
"STOP_A,Nowhere,0,0\n"
|
||||
"STOP_B,Blank,,\n"
|
||||
),
|
||||
)
|
||||
with pytest.raises(RuntimeError, match=r"plausible UK coordinates"):
|
||||
validate_gtfs_feed(feed, "coordless feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_non_uk_coords_fail(tmp_path: Path) -> None:
|
||||
feed = _make_gtfs(
|
||||
tmp_path / "feed.zip",
|
||||
stops=(
|
||||
"stop_id,stop_name,stop_lat,stop_lon\n"
|
||||
"STOP_A,New York,40.71,-74.0\n"
|
||||
"STOP_B,Sydney,-33.87,151.21\n"
|
||||
),
|
||||
)
|
||||
with pytest.raises(RuntimeError, match="plausible UK coordinates"):
|
||||
validate_gtfs_feed(feed, "abroad feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_minority_bad_coords_pass(tmp_path: Path) -> None:
|
||||
"""One bad stop out of 30 (3.3%) stays under the 5% tolerance."""
|
||||
rows = [f"STOP_{i},Stop {i},51.5,{-0.1 + i * 0.001}\n" for i in range(29)]
|
||||
rows.append("STOP_BAD,Broken,0,0\n")
|
||||
feed = _make_gtfs(
|
||||
tmp_path / "feed.zip",
|
||||
stops="stop_id,stop_name,stop_lat,stop_lon\n" + "".join(rows),
|
||||
)
|
||||
validate_gtfs_feed(feed, "mostly good feed", today=TODAY) # must not raise
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_empty_trips(tmp_path: Path) -> None:
|
||||
feed = _make_gtfs(tmp_path / "feed.zip", trips="trip_id,route_id,service_id\n")
|
||||
with pytest.raises(RuntimeError, match="trips.txt has no data rows"):
|
||||
validate_gtfs_feed(feed, "tripless feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_missing_calendar_files(tmp_path: Path) -> None:
|
||||
feed = _make_gtfs(tmp_path / "feed.zip", calendar=None)
|
||||
with pytest.raises(RuntimeError, match="neither calendar.txt nor calendar_dates"):
|
||||
validate_gtfs_feed(feed, "calendarless feed", today=TODAY)
|
||||
|
||||
|
||||
def test_validate_gtfs_feed_not_a_zip(tmp_path: Path) -> None:
|
||||
bogus = tmp_path / "feed.zip"
|
||||
bogus.write_text("not a zip")
|
||||
with pytest.raises(RuntimeError, match="not a valid zip"):
|
||||
validate_gtfs_feed(bogus, "bogus feed", today=TODAY)
|
||||
|
|
|
|||
|
|
@ -2,24 +2,32 @@
|
|||
|
||||
Downloads:
|
||||
- England OSM PBF from Geofabrik (~1.5GB)
|
||||
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
|
||||
- TfL TransXChange timetables → converted to GTFS
|
||||
- National Rail CIF timetable → converted to GTFS (requires credentials)
|
||||
- BODS GTFS from Bus Open Data Service (~1.5GB; all England bus/tram/ferry,
|
||||
plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car)
|
||||
- National Rail CIF timetable → converted to GTFS (requires credentials;
|
||||
includes the Elizabeth line, TOC "XR")
|
||||
|
||||
Then processes for R5 compatibility:
|
||||
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
|
||||
- Converts high-frequency metro/tram services to frequency-based GTFS
|
||||
- Converts TfL TransXChange to GTFS via transxchange2gtfs
|
||||
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
|
||||
- Validates every produced GTFS zip (active calendar window, plausible UK
|
||||
stop coordinates, non-empty routes/trips/stop_times)
|
||||
|
||||
Requires: osmium-tool, Node.js (npx), Docker (for national rail)
|
||||
Note: the legacy TfL TransXChange feed (tfl.gov.uk journey-planner-timetables)
|
||||
was removed: that URL serves a 2010-10-28 snapshot whose calendars all expired
|
||||
in 2010 and whose stops have empty/0,0 coordinates, so it contributed zero
|
||||
service. BODS covers all TfL modes that feed nominally provided.
|
||||
|
||||
Requires: osmium-tool, Docker (for national rail)
|
||||
|
||||
Output directory: property-data/transit/
|
||||
raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
|
||||
raw/england.osm.pbf + bods_gtfs.zip + national_rail_gtfs.zip
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import datetime as dt
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
|
|
@ -45,20 +53,18 @@ ENGLAND_PBF_URL = (
|
|||
# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
|
||||
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"
|
||||
|
||||
# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
|
||||
TFL_TRANSXCHANGE_URL = (
|
||||
"https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
|
||||
)
|
||||
|
||||
# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
|
||||
NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
|
||||
|
||||
# National Rail Open Data API
|
||||
NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
|
||||
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
|
||||
|
||||
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
|
||||
TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
|
||||
|
||||
# GTFS validation: a feed must have service within this many days of the build
|
||||
# date, and at least this fraction of stops must have plausible UK coordinates.
|
||||
GTFS_CALENDAR_LOOKAHEAD_DAYS = 60
|
||||
GTFS_MIN_VALID_STOP_FRACTION = 0.95
|
||||
UK_LAT_RANGE = (49.0, 61.0)
|
||||
UK_LON_RANGE = (-9.0, 2.5)
|
||||
|
||||
|
||||
def _download_http(
|
||||
|
|
@ -468,89 +474,175 @@ def convert_high_freq_to_frequency_based(
|
|||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def download_tfl_transxchange(raw_dir: Path) -> Path:
|
||||
"""Download TfL TransXChange timetable bundle."""
|
||||
dest = raw_dir / "tfl_transxchange.zip"
|
||||
if dest.exists():
|
||||
print(f"TfL TransXChange already exists: {dest}")
|
||||
return dest
|
||||
|
||||
print("Downloading TfL TransXChange timetables...")
|
||||
_download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip")
|
||||
return dest
|
||||
def _gtfs_has_data_row(z: zipfile.ZipFile, filename: str) -> bool:
|
||||
"""True if a GTFS file has at least one non-empty data row after the header."""
|
||||
with z.open(filename) as f:
|
||||
f.readline() # header
|
||||
for line in f:
|
||||
if _parse_csv_line(line):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def download_naptan() -> None:
|
||||
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
|
||||
dest = local_tmp_dir() / "Stops.csv"
|
||||
if dest.exists():
|
||||
print(f"NaPTAN Stops.csv already exists: {dest}")
|
||||
return
|
||||
def _calendar_active_in_window(
|
||||
z: zipfile.ZipFile, names: set[str], window_start: int, window_end: int
|
||||
) -> bool:
|
||||
"""True if calendar.txt/calendar_dates.txt have service in [start, end].
|
||||
|
||||
print("Downloading NaPTAN stops data...")
|
||||
_download_http(NAPTAN_URL, dest, desc="Stops.csv")
|
||||
|
||||
|
||||
def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
|
||||
"""Convert TfL TransXChange to GTFS using transxchange2gtfs."""
|
||||
dest = output_dir / "tfl_gtfs.zip"
|
||||
if dest.exists():
|
||||
print(f"TfL GTFS already exists: {dest}")
|
||||
return dest
|
||||
|
||||
txc_path = raw_dir / "tfl_transxchange.zip"
|
||||
|
||||
# Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
|
||||
download_naptan()
|
||||
|
||||
print("Converting TfL TransXChange → GTFS...")
|
||||
# The shim patches known packaging/runtime issues in the pinned npm package
|
||||
# before loading its CLI from npx's temporary install.
|
||||
shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
|
||||
subprocess.run(
|
||||
[
|
||||
"npx",
|
||||
"--yes",
|
||||
"--package",
|
||||
TRANSXCHANGE2GTFS_PACKAGE,
|
||||
"sh",
|
||||
"-c",
|
||||
"\n".join(
|
||||
[
|
||||
'bin="$(command -v transxchange2gtfs)"',
|
||||
'script="$(readlink -f "$bin")"',
|
||||
'pkg_dir="$(dirname "$(dirname "$script")")"',
|
||||
'shim="$1"',
|
||||
"shift",
|
||||
'exec node "$shim" "$pkg_dir" "$@"',
|
||||
]
|
||||
),
|
||||
"transxchange2gtfs",
|
||||
str(shim_path.resolve()),
|
||||
str(txc_path.resolve()),
|
||||
str(dest.resolve()),
|
||||
],
|
||||
check=True,
|
||||
Dates are compared as YYYYMMDD integers. A calendar.txt row counts when its
|
||||
date range overlaps the window AND at least one weekday flag is set; a
|
||||
calendar_dates.txt row counts when it adds service (exception_type=1) on a
|
||||
date inside the window.
|
||||
"""
|
||||
weekdays = (
|
||||
"monday",
|
||||
"tuesday",
|
||||
"wednesday",
|
||||
"thursday",
|
||||
"friday",
|
||||
"saturday",
|
||||
"sunday",
|
||||
)
|
||||
if "calendar.txt" in names:
|
||||
with z.open("calendar.txt") as f:
|
||||
cols = _parse_csv_line(f.readline())
|
||||
try:
|
||||
start_idx = cols.index("start_date")
|
||||
end_idx = cols.index("end_date")
|
||||
except ValueError:
|
||||
return False
|
||||
day_idxs = [cols.index(d) for d in weekdays if d in cols]
|
||||
for line in f:
|
||||
parts = _parse_csv_line(line)
|
||||
if not parts:
|
||||
continue
|
||||
try:
|
||||
start = int(parts[start_idx].strip('"'))
|
||||
end = int(parts[end_idx].strip('"'))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
if start > window_end or end < window_start:
|
||||
continue
|
||||
if day_idxs and not any(
|
||||
parts[i].strip('"') == "1" for i in day_idxs if i < len(parts)
|
||||
):
|
||||
continue
|
||||
return True
|
||||
|
||||
if "calendar_dates.txt" in names:
|
||||
with z.open("calendar_dates.txt") as f:
|
||||
cols = _parse_csv_line(f.readline())
|
||||
try:
|
||||
date_idx = cols.index("date")
|
||||
exc_idx = cols.index("exception_type")
|
||||
except ValueError:
|
||||
return False
|
||||
for line in f:
|
||||
parts = _parse_csv_line(line)
|
||||
if not parts:
|
||||
continue
|
||||
try:
|
||||
date = int(parts[date_idx].strip('"'))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
if exc_idx < len(parts) and parts[exc_idx].strip('"') != "1":
|
||||
continue
|
||||
if window_start <= date <= window_end:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def validate_gtfs_feed(path: Path, feed_name: str, *, today: dt.date | None = None) -> None:
|
||||
"""Sanity-check a produced/downloaded GTFS zip; raise RuntimeError if dead.
|
||||
|
||||
Guards against silently shipping a feed that contributes zero service (as
|
||||
the old TfL dump did: 2010 calendars, empty/0,0 stop coordinates). Checks:
|
||||
(a) calendar.txt/calendar_dates.txt have at least one service active
|
||||
within [today, today + GTFS_CALENDAR_LOOKAHEAD_DAYS];
|
||||
(b) stops.txt is non-empty and >= GTFS_MIN_VALID_STOP_FRACTION of stops
|
||||
have plausible UK coordinates (lat 49-61, lon -9..2.5, not 0,0);
|
||||
(c) routes.txt, trips.txt and stop_times.txt each have data rows.
|
||||
"""
|
||||
if today is None:
|
||||
today = dt.date.today()
|
||||
window_start = int(today.strftime("%Y%m%d"))
|
||||
window_end = int(
|
||||
(today + dt.timedelta(days=GTFS_CALENDAR_LOOKAHEAD_DAYS)).strftime("%Y%m%d")
|
||||
)
|
||||
|
||||
def fail(reason: str) -> None:
|
||||
raise RuntimeError(
|
||||
f"GTFS validation failed for feed '{feed_name}' ({path}): {reason}"
|
||||
)
|
||||
|
||||
print(f"Validating GTFS feed '{feed_name}'...")
|
||||
if not path.exists() or not zipfile.is_zipfile(path):
|
||||
fail("not a valid zip file")
|
||||
|
||||
with zipfile.ZipFile(path) as z:
|
||||
names = set(z.namelist())
|
||||
|
||||
# (c) core files present and non-empty
|
||||
for required in ("routes.txt", "trips.txt", "stop_times.txt", "stops.txt"):
|
||||
if required not in names:
|
||||
fail(f"missing {required}")
|
||||
if not _gtfs_has_data_row(z, required):
|
||||
fail(f"{required} has no data rows")
|
||||
|
||||
# (a) at least one service active in the routing window
|
||||
if "calendar.txt" not in names and "calendar_dates.txt" not in names:
|
||||
fail("has neither calendar.txt nor calendar_dates.txt")
|
||||
if not _calendar_active_in_window(z, names, window_start, window_end):
|
||||
fail(
|
||||
f"no service active between {window_start} and {window_end} — "
|
||||
"the feed's calendars are stale/expired and it would contribute "
|
||||
"zero service to routing"
|
||||
)
|
||||
|
||||
# (b) stops have plausible UK coordinates
|
||||
total_stops = 0
|
||||
valid_stops = 0
|
||||
with z.open("stops.txt") as f:
|
||||
cols = _parse_csv_line(f.readline())
|
||||
try:
|
||||
lat_idx = cols.index("stop_lat")
|
||||
lon_idx = cols.index("stop_lon")
|
||||
except ValueError:
|
||||
fail("stops.txt is missing stop_lat/stop_lon columns")
|
||||
for line in f:
|
||||
parts = _parse_csv_line(line)
|
||||
if not parts:
|
||||
continue
|
||||
total_stops += 1
|
||||
try:
|
||||
lat = float(parts[lat_idx].strip('"'))
|
||||
lon = float(parts[lon_idx].strip('"'))
|
||||
except (ValueError, IndexError):
|
||||
continue # empty/garbage coordinate → invalid
|
||||
if lat == 0.0 and lon == 0.0:
|
||||
continue
|
||||
if (
|
||||
UK_LAT_RANGE[0] <= lat <= UK_LAT_RANGE[1]
|
||||
and UK_LON_RANGE[0] <= lon <= UK_LON_RANGE[1]
|
||||
):
|
||||
valid_stops += 1
|
||||
if total_stops == 0:
|
||||
fail("stops.txt has no stops")
|
||||
fraction = valid_stops / total_stops
|
||||
if fraction < GTFS_MIN_VALID_STOP_FRACTION:
|
||||
fail(
|
||||
f"only {valid_stops}/{total_stops} stops "
|
||||
f"({fraction:.1%}) have plausible UK coordinates "
|
||||
f"(lat {UK_LAT_RANGE[0]}-{UK_LAT_RANGE[1]}, "
|
||||
f"lon {UK_LON_RANGE[0]}..{UK_LON_RANGE[1]}, non-null, not 0,0); "
|
||||
f"need >= {GTFS_MIN_VALID_STOP_FRACTION:.0%}"
|
||||
)
|
||||
|
||||
print(
|
||||
f" OK: service active in window, {valid_stops}/{total_stops} stops "
|
||||
f"({fraction:.1%}) with plausible UK coordinates"
|
||||
)
|
||||
required_files = {
|
||||
"agency.txt",
|
||||
"calendar.txt",
|
||||
"calendar_dates.txt",
|
||||
"routes.txt",
|
||||
"stop_times.txt",
|
||||
"stops.txt",
|
||||
"trips.txt",
|
||||
}
|
||||
if not dest.exists() or not zipfile.is_zipfile(dest):
|
||||
raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
|
||||
with zipfile.ZipFile(dest) as z:
|
||||
missing = required_files - set(z.namelist())
|
||||
if missing:
|
||||
missing_str = ", ".join(sorted(missing))
|
||||
raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
|
||||
size_mb = dest.stat().st_size / (1024 * 1024)
|
||||
print(f" Saved to {dest} ({size_mb:.1f} MB)")
|
||||
return dest
|
||||
|
||||
|
||||
def download_national_rail_cif(raw_dir: Path) -> Path | None:
|
||||
|
|
@ -1007,18 +1099,15 @@ def main() -> None:
|
|||
required=True,
|
||||
help="Output directory for transit data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-tfl",
|
||||
action="store_true",
|
||||
help="Skip TfL TransXChange download and conversion",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
output_dir: Path = args.output
|
||||
raw_dir = output_dir / "raw"
|
||||
raw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 1. Download, clean, and frequency-convert BODS GTFS
|
||||
# 1. Download, clean, and frequency-convert BODS GTFS. BODS covers all
|
||||
# England bus/tram/ferry plus London Underground, DLR, London Tramlink and
|
||||
# the IFS Cloud Cable Car, so no separate TfL feed is needed.
|
||||
download_osm_pbf(raw_dir)
|
||||
bods_raw = download_bods_gtfs(raw_dir)
|
||||
|
||||
|
|
@ -1027,16 +1116,10 @@ def main() -> None:
|
|||
|
||||
bods_final = output_dir / "bods_gtfs.zip"
|
||||
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
|
||||
validate_gtfs_feed(bods_final, "BODS GTFS")
|
||||
|
||||
# 2. TfL TransXChange → GTFS
|
||||
if args.skip_tfl:
|
||||
print("Skipping TfL (--skip-tfl)")
|
||||
else:
|
||||
download_tfl_transxchange(raw_dir)
|
||||
convert_tfl_to_gtfs(raw_dir, output_dir)
|
||||
|
||||
# 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
|
||||
# reach the ~2,725 railway-station destinations, so a bus/TfL-only network
|
||||
# 2. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
|
||||
# reach the ~2,725 railway-station destinations, so a bus/metro-only network
|
||||
# silently overstates every train commute. Missing credentials are a HARD
|
||||
# error, so a rail-less network can never ship.
|
||||
cif = download_national_rail_cif(raw_dir)
|
||||
|
|
@ -1048,7 +1131,8 @@ def main() -> None:
|
|||
"required; without it the transit network models every train journey "
|
||||
"as bus-only and overstates commute times."
|
||||
)
|
||||
convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
nr_final = convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
validate_gtfs_feed(nr_final, "National Rail GTFS")
|
||||
|
||||
# Summary
|
||||
print()
|
||||
|
|
|
|||
|
|
@ -1,106 +0,0 @@
|
|||
#!/usr/bin/env node
|
||||
"use strict";
|
||||
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const { createRequire } = require("module");
|
||||
|
||||
const [pkgDirArg, ...converterArgs] = process.argv.slice(2);
|
||||
|
||||
if (!pkgDirArg || converterArgs.length < 2) {
|
||||
console.error(
|
||||
"Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
|
||||
);
|
||||
process.exit(2);
|
||||
}
|
||||
|
||||
const pkgDir = path.resolve(pkgDirArg);
|
||||
const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
|
||||
const localTmpDir =
|
||||
process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir;
|
||||
const stopsCsv = path.join(localTmpDir, "Stops.csv");
|
||||
const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");
|
||||
const converterTmpPatch =
|
||||
`static TMP = ${JSON.stringify(converterTmpPrefix)}` +
|
||||
` + process.pid + ${JSON.stringify(path.sep)};`;
|
||||
|
||||
fs.mkdirSync(localTmpDir, { recursive: true });
|
||||
|
||||
function replaceOnce(relativePath, before, after) {
|
||||
const file = path.join(pkgDir, relativePath);
|
||||
const original = fs.readFileSync(file, "utf8");
|
||||
if (original.includes(before)) {
|
||||
fs.writeFileSync(file, original.replace(before, after));
|
||||
} else if (original.includes(after)) {
|
||||
return;
|
||||
} else {
|
||||
throw new Error(`Could not patch ${relativePath}: expected text not found`);
|
||||
}
|
||||
}
|
||||
|
||||
// The published 1.12.0 package has a few compatibility issues with current
|
||||
// TfL TransXChange exports:
|
||||
// - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
|
||||
// - the compiled date-holidays import expects a synthetic default export
|
||||
// - some TfL journeys reference timing links without matching route-link geometry
|
||||
//
|
||||
// GTFS shapes are optional for R5 routing. Clear shape references and omit
|
||||
// shapes.txt so missing route geometry does not drop otherwise usable trips.
|
||||
function patchPackage() {
|
||||
replaceOnce(
|
||||
"dist/Container.js",
|
||||
"static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
|
||||
converterTmpPatch,
|
||||
);
|
||||
replaceOnce(
|
||||
"dist/Container.js",
|
||||
'fs.existsSync("/tmp/Stops.csv")',
|
||||
`fs.existsSync(${JSON.stringify(stopsCsv)})`,
|
||||
);
|
||||
replaceOnce(
|
||||
"dist/Container.js",
|
||||
'fs.createReadStream("/tmp/Stops.csv", "utf8")',
|
||||
`fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
|
||||
);
|
||||
replaceOnce(
|
||||
"dist/converter/GetStopData.js",
|
||||
'fs.createWriteStream("/tmp/Stops.csv")',
|
||||
`fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
|
||||
);
|
||||
replaceOnce(
|
||||
"dist/transxchange/TransXChangeJourneyStream.js",
|
||||
"distanceSoFarM += routeLink.Distance;",
|
||||
"distanceSoFarM += routeLink ? routeLink.Distance : 0;",
|
||||
);
|
||||
replaceOnce(
|
||||
"dist/gtfs/TripsStream.js",
|
||||
"(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
|
||||
"\"\");",
|
||||
);
|
||||
replaceOnce(
|
||||
"dist/gtfs/StopTimesStream.js",
|
||||
"stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
|
||||
"\"\", stop.exactTime ? \"1\" : \"0\");",
|
||||
);
|
||||
replaceOnce(
|
||||
"dist/Container.js",
|
||||
"\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
|
||||
"\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
|
||||
);
|
||||
replaceOnce(
|
||||
"dist/Container.js",
|
||||
"\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
|
||||
"\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
|
||||
);
|
||||
}
|
||||
|
||||
patchPackage();
|
||||
|
||||
const pkgRequire = createRequire(path.join(pkgDir, "package.json"));
|
||||
const Holidays = pkgRequire("date-holidays");
|
||||
if (!Holidays.default) {
|
||||
Holidays.default = Holidays;
|
||||
}
|
||||
|
||||
process.argv = [process.argv[0], "transxchange2gtfs", ...converterArgs];
|
||||
require(path.join(pkgDir, "dist", "cli.js"));
|
||||
Loading…
Add table
Add a link
Reference in a new issue