Test changes
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 8m20s
CI / Check (push) Failing after 10m40s

This commit is contained in:
Andras Schmelczer 2026-05-09 11:35:38 +01:00
parent 4c95815dc8
commit be02fc16bb
41 changed files with 4224 additions and 759 deletions

View file

@ -1,9 +1,15 @@
import argparse
import base64
import json
import re
import sys
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from pathlib import Path
from PIL import Image, ImageDraw
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
@ -14,53 +20,80 @@ POI_ICON_BASE = "https://geolytix.github.io/MapIcons"
# Font stacks used by @protomaps/basemaps with lang='en'
FONT_STACKS = ["Noto Sans Regular", "Noto Sans Italic", "Noto Sans Medium"]
# Fallback emoji not in any category
_FALLBACK_EMOJIS = ["📍"]
POI_ICON_PATHS = [
"asda/asda_express_24px.svg",
"asda/asda_green_basket_24px.svg",
"asda/asda_green_trolley_24px.svg",
"asda/asda_living_24px.svg",
"asda/asda_pfs_24px.svg",
"asda/asda_primary.svg",
"asda/asda_superstore_green_trolley_24px.svg",
"brands/aldi_24px.svg",
"brands/amazon_fresh_alt_24px.svg",
"brands/booths_24px.svg",
"brands/budgens_24px.svg",
"brands/centra_24px.svg",
"brands/cook.svg",
"brands/coop_24px.svg",
"brands/costco_24px.svg",
"brands/dunnes_stores_24px.svg",
"brands/farmfoods_updated_24px.svg",
"brands/heron_24px.svg",
"brands/iceland_24px.svg",
"brands/iceland_food_warehouse_24px.svg",
"brands/lidl_24px.svg",
"brands/little_waitrose_24px.svg",
"brands/makro_24px.svg",
"brands/mns_24px.svg",
"brands/mns_food_24px.svg",
"brands/mns_high_street_24px.svg",
"brands/mns_hospital_24px.svg",
"brands/mns_moto_24px.svg",
"brands/mns_outlet_24px.svg",
"brands/morrisons_24px.svg",
"brands/morrisons_daily_24px.svg",
"brands/sainsburys_24px.svg",
"brands/sainsburys_local_24px.svg",
"brands/spar_24px.svg",
"brands/tesco_24px.svg",
"brands/tesco_express_24px.svg",
"brands/tesco_extra_24px.svg",
"brands/waitrose_24px.svg",
"brands/wholefoods_24px.svg",
"logos/planet_organic_24px.svg",
"brands_2023/supermarkets/farmfoods.svg",
"brands_2023/supermarkets/heron_foods.svg",
"brands_2023/supermarkets/little_waitrose.svg",
"brands_2024/amazon_fresh.svg",
"brands_2024/booths.svg",
"brands_2024/budgens.svg",
"brands_2024/cook.svg",
"brands_2024/dunnes_stores.svg",
"brands_2024/iceland.svg",
"brands_2024/makro.svg",
"brands_2024/mns.svg",
"brands_2024/morrisons_daily.svg",
"brands_2024/sainsburys_local.svg",
"brands_2024/wholefoods.svg",
"logos/aldi.svg",
"logos/asda.svg",
"logos/centra.svg",
"logos/coop.svg",
"logos/lidl.svg",
"logos/morrisons.svg",
"logos/planet_organic.svg",
"logos/sainsburys.svg",
"logos/spar.svg",
"logos/tesco.svg",
"logos/tesco_express.svg",
"logos/tesco_extra.svg",
"logos/waitrose.svg",
"public_transport/london_tube.svg",
"visuals/mns.svg",
]
# Icons that are derived from a downloaded marker SVG rather than saved as-is.
# Each entry is (kind, source path under POI_ICON_BASE, destination path under
# the POI icons directory). `kind` selects the conversion performed by
# download_derived_poi_icon: "costco_logo" or "embedded_png".
DERIVED_POI_ICON_PATHS = [
    ("costco_logo", "brands/costco.svg", "logos/costco.svg"),
    (
        "embedded_png",
        "brands/iceland_food_warehouse_24px.svg",
        "logos/the_food_warehouse.png",
    ),
]
# Per-icon crop rectangles, keyed by path relative to the POI icons directory.
# Each value is (x, y, width, height) and is written into the icon's viewBox
# by crop_poi_svg_icons / set_svg_geometry.
POI_ICON_SVG_CROPS = {
    "brands_2023/supermarkets/farmfoods.svg": (1.293, 7.314, 15.48, 3.293),
    "brands_2023/supermarkets/heron_foods.svg": (0.062, 6.68, 17.995, 5.325),
    "brands_2023/supermarkets/little_waitrose.svg": (0.916, 5.645, 16.365, 6.719),
    "brands_2024/amazon_fresh.svg": (3.817, 1.646, 16.367, 16.358),
    "brands_2024/booths.svg": (1.456, 7.143, 15.313, 3.512),
    "brands_2024/budgens.svg": (2.251, 2.278, 13.6, 13.612),
    "brands_2024/cook.svg": (5.028, 5.493, 13.945, 9.648),
    "brands_2024/dunnes_stores.svg": (4.375, 7.732, 15.249, 5.055),
    "brands_2024/iceland.svg": (1.136, 6.823, 16.067, 4.302),
    "brands_2024/makro.svg": (4.411, 6.098, 16.397, 5.428),
    "brands_2024/mns.svg": (4.042, 6.986, 16.171, 6.724),
    "brands_2024/morrisons_daily.svg": (3.341, 4.414, 17.317, 8.248),
    "brands_2024/sainsburys_local.svg": (4.58, 1.61, 14.84, 14.849),
    "brands_2024/wholefoods.svg": (4.17, 2.193, 15.659, 15.668),
    "logos/aldi.svg": (4.813, 2.563, 14.374, 14.383),
    "logos/asda.svg": (3.91, 7.135, 16.181, 5.442),
    "logos/centra.svg": (3.36, 7.35, 17.28, 4.651),
    "logos/coop.svg": (6.407, 4.658, 11.187, 11.793),
    "logos/costco.svg": (70.61, 144.908, 256.67, 85.825),
    "logos/lidl.svg": (4.938, 2.973, 13.985, 13.985),
    "logos/morrisons.svg": (5.231, 2.985, 13.538, 13.398),
    "logos/planet_organic.svg": (5.528, 3.564, 12.943, 12.943),
    "logos/sainsburys.svg": (7.502, 3.572, 8.996, 12.646),
    "logos/spar.svg": (4.933, 2.968, 14.133, 13.853),
    "logos/tesco.svg": (4.338, 6.865, 15.324, 5.359),
    "logos/tesco_express.svg": (5.231, 5.933, 13.538, 8.345),
    "logos/tesco_extra.svg": (4.933, 5.775, 14.133, 8.519),
    "logos/waitrose.svg": (5.528, 6.09, 12.943, 9.855),
}
# Value given to the longer side of an icon's width/height attributes; the
# shorter side is scaled proportionally (see svg_intrinsic_size).
POI_ICON_SVG_INTRINSIC_MAX = 512
def collect_twemoji_codes() -> list[str]:
"""Derive twemoji hex codes from transform_poi categories.
@ -76,9 +109,6 @@ def collect_twemoji_codes() -> list[str]:
for emoji in NAPTAN_EMOJIS.values():
emojis.add(emoji)
for emoji in _FALLBACK_EMOJIS:
emojis.add(emoji)
# First codepoint hex, matching frontend logic
return sorted({f"{ord(e[0]):x}" for e in emojis})
@ -97,6 +127,214 @@ def download_file(url: str, dest: Path) -> tuple[bool, str]:
return False, url
def download_text(url: str, timeout: float = 30.0) -> str:
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to fetch; any scheme ``urllib.request.urlopen`` accepts.
        timeout: Seconds to wait on blocking network operations. ``urlopen``
            applies no timeout of its own unless a global socket default is
            set, so a stalled server would otherwise block the run
            indefinitely.

    Returns:
        The full response body as text.

    Raises:
        urllib.error.URLError: If the request fails (includes ``HTTPError``).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8")
def build_costco_logo(marker_svg: str) -> str:
    """Cut the logo group out of the Costco marker SVG and wrap it standalone.

    The logo is located by two literal markers: the opening
    ``<g><path d=" M 316.312`` and the last ``</g></g></svg>`` in the
    document. Everything from the first marker up to and including the first
    ``</g>`` of the closing marker is copied into a new ``<svg>`` wrapper.

    Args:
        marker_svg: Text of the upstream Costco marker SVG.

    Returns:
        A complete SVG document containing only the logo group.

    Raises:
        ValueError: If either marker is missing, or the closing marker comes
            before the opening one (which would otherwise produce an empty
            logo without any error).
    """
    start = marker_svg.find('<g><path d=" M 316.312')
    end = marker_svg.rfind("</g></g></svg>")
    if start < 0 or end < 0 or end <= start:
        raise ValueError("Costco marker SVG layout changed")

    # `end + 4` keeps the first "</g>" of the closing marker so the copied
    # group is balanced.
    logo_group = marker_svg[start : end + 4]
    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<svg xmlns="http://www.w3.org/2000/svg" viewBox="70 145 260 90" '
        'width="260pt" height="90pt" preserveAspectRatio="xMidYMid meet">\n'
        f"{logo_group}\n"
        "</svg>\n"
    )
def trim_white_png(png_bytes: bytes) -> bytes:
    """Make near-white pixels transparent and crop to the remaining content.

    Any pixel whose red, green and blue channels are all above 245 gets alpha
    0. The image is then cropped to the bounding box of its alpha channel,
    when that box is non-empty, and re-encoded as PNG.
    """
    picture = Image.open(BytesIO(png_bytes)).convert("RGBA")
    access = picture.load()
    for row in range(picture.height):
        for col in range(picture.width):
            r, g, b, _a = access[col, row]
            if min(r, g, b) > 245:
                access[col, row] = (r, g, b, 0)

    visible_box = picture.getchannel("A").getbbox()
    if visible_box:
        picture = picture.crop(visible_box)

    buffer = BytesIO()
    picture.save(buffer, format="PNG")
    return buffer.getvalue()
def extract_embedded_png(marker_svg: str) -> bytes:
    """Decode the first base64 payload in ``marker_svg`` and trim its white.

    Raises:
        ValueError: If the SVG text contains no ``base64,`` payload.
    """
    found = re.search(r"base64,([^\"']+)", marker_svg)
    if found is None:
        raise ValueError("POI marker SVG did not contain an embedded PNG")
    raw_png = base64.b64decode(found.group(1))
    return trim_white_png(raw_png)
def svg_intrinsic_size(width: float, height: float) -> tuple[int, int]:
    """Scale ``width`` x ``height`` so the longer side is the intrinsic maximum.

    The longer side becomes ``POI_ICON_SVG_INTRINSIC_MAX``; the other side is
    scaled proportionally, rounded, and never less than 1. A non-positive
    width or height yields a square of the maximum size.
    """
    limit = POI_ICON_SVG_INTRINSIC_MAX
    if width <= 0 or height <= 0:
        return (limit, limit)

    if width >= height:
        scaled_height = max(1, round(limit * height / width))
        return (limit, scaled_height)

    scaled_width = max(1, round(limit * width / height))
    return (scaled_width, limit)
def set_svg_geometry(svg_text: str, crop: tuple[float, float, float, float]) -> str:
    """Set ``viewBox``, ``width`` and ``height`` on the root ``<svg>`` element.

    Only the first opening ``<svg ...>`` tag is edited. Matching the
    attributes anywhere in the document would rewrite look-alikes such as
    ``stroke-width="…"`` or a child element's ``width``/``viewBox``, and the
    presence of such a look-alike would stop the attribute from being added
    to the root tag.

    Args:
        svg_text: SVG document text.
        crop: ``(x, y, width, height)`` to use as the new viewBox. Width and
            height also determine the intrinsic size via
            ``svg_intrinsic_size``.

    Returns:
        The document with the three root attributes replaced or inserted.
        Text without an ``<svg`` tag is returned unchanged.
    """
    x, y, width, height = crop
    intrinsic_width, intrinsic_height = svg_intrinsic_size(width, height)
    attributes = {
        "viewBox": f"{x:g} {y:g} {width:g} {height:g}",
        "width": str(intrinsic_width),
        "height": str(intrinsic_height),
    }

    root = re.search(r"<svg\b[^>]*>", svg_text)
    if root is None:
        return svg_text

    tag = root.group(0)
    for name, value in attributes.items():
        assignment = f'{name}="{value}"'
        # The lookbehind rejects names that merely end in the attribute name
        # (stroke-width, data-height, ...). Both quote styles are accepted.
        pattern = rf"(?<![\w:.-]){name}\s*=\s*(?:\"[^\"]*\"|'[^']*')"
        tag, replaced = re.subn(
            pattern, lambda _m, text=assignment: text, tag, count=1
        )
        if not replaced:
            tag = re.sub(
                r"<svg\b",
                lambda _m, text=assignment: f"<svg {text}",
                tag,
                count=1,
            )

    return svg_text[: root.start()] + tag + svg_text[root.end() :]
def get_svg_view_box(svg_text: str) -> tuple[float, float, float, float] | None:
match = re.search(r'viewBox="([^"]+)"', svg_text)
if not match:
return None
parts = [
float(part) for part in re.split(r"[\s,]+", match.group(1).strip()) if part
]
if len(parts) != 4:
return None
return (parts[0], parts[1], parts[2], parts[3])
def crop_poi_svg_icons(poi_icons_dir: Path) -> None:
    """Rewrite the geometry of downloaded POI SVG icons in place.

    First pass: every icon listed in ``POI_ICON_SVG_CROPS`` that exists on
    disk gets its configured crop; the Dunnes Stores icon additionally has
    its two near-white fills replaced with ``#111111``.

    Second pass: every ``*.svg`` under ``poi_icons_dir`` that has a readable
    viewBox is rewritten with that same viewBox, which refreshes its
    width/height attributes.
    """
    for relative_path, crop_box in POI_ICON_SVG_CROPS.items():
        target = poi_icons_dir / relative_path
        if target.exists():
            markup = target.read_text(encoding="utf-8")
            if relative_path == "brands_2024/dunnes_stores.svg":
                for pale_fill in ('fill="#fffcfc"', 'fill="#fcfcfc"'):
                    markup = markup.replace(pale_fill, 'fill="#111111"')
            target.write_text(set_svg_geometry(markup, crop_box), encoding="utf-8")

    for svg_file in poi_icons_dir.rglob("*.svg"):
        markup = svg_file.read_text(encoding="utf-8")
        current_box = get_svg_view_box(markup)
        if not current_box:
            continue
        svg_file.write_text(set_svg_geometry(markup, current_box), encoding="utf-8")
def download_derived_poi_icon(
    kind: str, source_path: str, dest: Path
) -> tuple[bool, str]:
    """Download a marker SVG and write the asset derived from it to ``dest``.

    Args:
        kind: ``"costco_logo"`` or ``"embedded_png"``; any other value is
            reported as an error after the download.
        source_path: Path of the marker SVG relative to ``POI_ICON_BASE``.
        dest: Output file; its parent directory is created if needed.

    Returns:
        ``(succeeded, url)``. Failures are printed to stderr, never raised.
    """
    source_url = f"{POI_ICON_BASE}/{source_path}"
    dest.parent.mkdir(parents=True, exist_ok=True)
    try:
        marker_svg = download_text(source_url)
        if kind == "embedded_png":
            dest.write_bytes(extract_embedded_png(marker_svg))
        elif kind == "costco_logo":
            dest.write_text(build_costco_logo(marker_svg), encoding="utf-8")
        else:
            raise ValueError(f"Unknown derived POI icon kind: {kind}")
    except urllib.error.HTTPError as http_error:
        print(f" {http_error.code} {source_url}", file=sys.stderr)
        return False, source_url
    except Exception as error:
        # Best effort: one broken icon must not abort the whole asset run.
        print(f" ERROR {source_url}: {error}", file=sys.stderr)
        return False, source_url
    return True, source_url
# Slategray accent used by civic POI icons (school, library, building, …) in
# protomaps' v4 sprite. We match it so the townhall blends in with its peers.
# Keyed by sprite theme; values are (red, green, blue).
_TOWNHALL_COLOR = {
    "light": (135, 128, 171),
    "dark": (118, 118, 127),
}
# Side length of the glyph's design grid. Glyph coordinates are expressed in
# these units, and the rendered size is this value times the sprite scale.
_TOWNHALL_LOGICAL_SIZE = 17
def _render_townhall_glyph(size_px: int, color: tuple[int, int, int]) -> Image.Image:
    """Draw the townhall glyph as a ``size_px`` square RGBA image.

    The shapes (pediment, lintel, three columns, base) are laid out on a
    ``_TOWNHALL_LOGICAL_SIZE`` grid and filled with ``color`` at full opacity.
    """
    # Draw at 8× resolution and downsample with Lanczos so the pediment's
    # diagonals come out anti-aliased; PIL's polygon fill is otherwise aliased.
    oversample = 8
    side = size_px * oversample
    surface = Image.new("RGBA", (side, side), (0, 0, 0, 0))
    pen = ImageDraw.Draw(surface)
    paint = (*color, 255)

    def to_px(units: float) -> float:
        return units * side / _TOWNHALL_LOGICAL_SIZE

    pediment = [(8.5, 1), (15, 6.5), (2, 6.5)]
    pen.polygon([(to_px(px), to_px(py)) for px, py in pediment], fill=paint)

    boxes = [((1, 6.5), (16, 8.5))]
    boxes += [((left, 8.5), (left + 1.5, 14)) for left in (3, 8, 13)]
    boxes.append(((0, 14), (17, 15.5)))
    for (x0, y0), (x1, y1) in boxes:
        pen.rectangle([(to_px(x0), to_px(y0)), (to_px(x1), to_px(y1))], fill=paint)

    return surface.resize((size_px, size_px), Image.LANCZOS)
def inject_townhall_sprite(sprites_dir: Path) -> None:
    """Add a townhall glyph to each downloaded sprite sheet.

    Protomaps' v4 sprite omits `townhall` even though the basemap style
    references it; we add the icon here so MapLibre can resolve the name
    natively at runtime.

    For each theme (``light``, ``dark``) and scale (``1``, ``@2x``) whose
    JSON manifest and PNG sheet both exist, the glyph is rendered and the
    manifest's ``townhall`` entry is set to its position.

    The operation is idempotent. If the manifest already has a ``townhall``
    entry of the expected size that lies inside the sheet, the glyph is
    redrawn in that slot. Otherwise the sheet is extended by one glyph-height
    row at the bottom. Without this check every run over an already-patched
    sheet would append another row.
    """
    for theme in ("light", "dark"):
        color = _TOWNHALL_COLOR[theme]
        for suffix, scale in (("", 1), ("@2x", 2)):
            json_path = sprites_dir / f"{theme}{suffix}.json"
            png_path = sprites_dir / f"{theme}{suffix}.png"
            if not json_path.exists() or not png_path.exists():
                continue

            manifest = json.loads(json_path.read_text(encoding="utf-8"))
            # convert() returns an independent image, so the source file can
            # be closed before the same path is overwritten below.
            with Image.open(png_path) as source:
                sheet = source.convert("RGBA")

            glyph_size = _TOWNHALL_LOGICAL_SIZE * scale
            glyph = _render_townhall_glyph(glyph_size, color)

            slot = manifest.get("townhall")
            reusable = (
                isinstance(slot, dict)
                and slot.get("width") == glyph_size
                and slot.get("height") == glyph_size
                and isinstance(slot.get("x"), int)
                and isinstance(slot.get("y"), int)
                and 0 <= slot["x"] <= sheet.width - glyph_size
                and 0 <= slot["y"] <= sheet.height - glyph_size
            )
            if reusable:
                origin = (slot["x"], slot["y"])
                extended = sheet
            else:
                origin = (0, sheet.height)
                new_width = max(sheet.width, glyph_size)
                new_height = sheet.height + glyph_size
                extended = Image.new("RGBA", (new_width, new_height), (0, 0, 0, 0))
                extended.paste(sheet, (0, 0))

            # paste() without a mask overwrites the region, alpha included.
            extended.paste(glyph, origin)
            extended.save(png_path, optimize=True)

            manifest["townhall"] = {
                "x": origin[0],
                "y": origin[1],
                "width": glyph_size,
                "height": glyph_size,
                "pixelRatio": scale,
            }
            json_path.write_text(json.dumps(manifest), encoding="utf-8")
def main():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
@ -147,7 +385,7 @@ def main():
# Skip already-downloaded files
remaining = [(url, dest) for url, dest in tasks]
print(f"Downloading {len(remaining)} assets")
print(f"Downloading {len(remaining) + len(DERIVED_POI_ICON_PATHS)} assets")
ok = 0
fail = 0
@ -162,6 +400,18 @@ def main():
else:
fail += 1
for kind, source_path, dest_path in DERIVED_POI_ICON_PATHS:
success, _url = download_derived_poi_icon(
kind, source_path, poi_icons_dir / dest_path
)
if success:
ok += 1
else:
fail += 1
crop_poi_svg_icons(poi_icons_dir)
inject_townhall_sprite(sprites_dir)
print(f"Done: {ok} downloaded, {fail} failed")

View file

@ -6,6 +6,7 @@ Reuses the same england-latest.osm.pbf as pois.py.
"""
import argparse
import re
from pathlib import Path
import osmium
@ -44,11 +45,37 @@ _STATION_STRIP = (
" underground station",
" railway station",
" dlr station",
" station dlr",
" dlr",
" overground station",
" tram stop",
" station",
)
# Captures the three-character code that follows "ZZDL" in a NaPTAN ATCO id;
# rows that share a code are merged into one station by _naptan_dlr_stations.
_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
def _is_dlr_station(tags: dict[str, str]) -> bool:
name = tags.get("name", "").lower()
network = tags.get("network", "").lower()
operator = tags.get("operator", "").lower()
return (
"docklands" in network
or "dlr" in network
or "docklands" in operator
or "dlr" in operator
or name.endswith(" dlr")
or " dlr " in name
)
def _is_tram_station(tags: dict[str, str]) -> bool:
    """Return True for light-rail or tram-network stations that are not DLR.

    DLR stations are tagged ``station=light_rail`` as well, so they are ruled
    out first.
    """
    if _is_dlr_station(tags):
        return False
    if tags.get("station", "") == "light_rail":
        return True
    network_name = tags.get("network", "").lower()
    return any(marker in network_name for marker in ("tramlink", "tram"))
def _station_display_name(name: str, tags: dict[str, str]) -> str:
"""Build a descriptive station name like 'Bank tube station'."""
@ -78,6 +105,96 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
return f"{name} {suffix}"
def _station_name_score(name: str) -> tuple[int, int]:
lower = name.lower()
suffix_penalty = int(
lower.endswith(
(
" underground station",
" tube station",
" dlr station",
" railway station",
" rail station",
" station dlr",
" station",
)
)
or lower.endswith(" dlr")
)
return (suffix_penalty, len(name))
def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
    """Extract station-level DLR destinations from NaPTAN access nodes.

    Rows are kept when their ``id`` contains a ``ZZDL`` station code, their
    category is "Tube station" or "Rail station", and their name is
    non-empty. Rows sharing a code are merged into one station: coordinates
    are averaged and the raw name with the lowest ``_station_name_score`` is
    used.

    Returns:
        Place dicts sorted by display name.

    Raises:
        ValueError: If the parquet file lacks a required column.
    """
    frame = pl.read_parquet(naptan_path)
    absent = {"id", "name", "category", "lat", "lng"} - set(frame.columns)
    if absent:
        raise ValueError(f"NaPTAN file is missing columns: {sorted(absent)}")

    by_code: dict[str, dict] = {}
    for record in frame.iter_rows(named=True):
        code_match = _DLR_CODE_RE.search(str(record["id"] or ""))
        if code_match is None:
            continue
        if record["category"] not in {"Tube station", "Rail station"}:
            continue
        candidate_name = str(record["name"] or "")
        if not candidate_name:
            continue

        latitude = float(record["lat"])
        longitude = float(record["lng"])
        code = code_match.group(1)
        entry = by_code.get(code)
        if entry is None:
            by_code[code] = {
                "raw_name": candidate_name,
                "lat_sum": latitude,
                "lon_sum": longitude,
                "count": 1,
            }
        else:
            entry["lat_sum"] += latitude
            entry["lon_sum"] += longitude
            entry["count"] += 1
            if _station_name_score(candidate_name) < _station_name_score(
                entry["raw_name"]
            ):
                entry["raw_name"] = candidate_name

    stations = [
        {
            "name": _station_display_name(entry["raw_name"], {"network": "DLR"}),
            "place_type": "station",
            "lat": entry["lat_sum"] / entry["count"],
            "lon": entry["lon_sum"] / entry["count"],
            "population": 0,
            "travel_destination": True,
        }
        for entry in by_code.values()
    ]
    stations.sort(key=lambda station: station["name"])
    return stations
def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int:
    """Append NaPTAN DLR stations whose name is not already in ``places``.

    Names are compared case-insensitively. ``places`` is extended in place.

    Returns:
        The number of stations appended.
    """
    seen = {str(place["name"]).casefold() for place in places}
    size_before = len(places)
    for station in _naptan_dlr_stations(naptan_path):
        folded = station["name"].casefold()
        if folded not in seen:
            seen.add(folded)
            places.append(station)
    return len(places) - size_before
class PlaceHandler(osmium.SimpleHandler):
def __init__(self, progress: tqdm, england_polygon) -> None:
super().__init__()
@ -145,14 +262,7 @@ class PlaceHandler(osmium.SimpleHandler):
# Railway stations (tube, national rail, DLR, overground, Elizabeth line)
if n.tags.get("railway") == "station":
tags = dict(n.tags)
station_tag = tags.get("station", "")
network = tags.get("network", "").lower()
# Skip tram stops
if (
station_tag == "light_rail"
or "tramlink" in network
or "tram" in network
):
if _is_tram_station(tags):
return
display_name = _station_display_name(name, tags)
self._add(
@ -178,6 +288,11 @@ def main() -> None:
required=True,
help="England boundary GeoJSON file",
)
parser.add_argument(
"--naptan",
type=Path,
help="Optional NaPTAN parquet file used to add DLR station destinations",
)
args = parser.parse_args()
pbf_file = args.pbf
@ -195,6 +310,9 @@ def main() -> None:
handler.apply_file(str(pbf_file), locations=True)
print(f"Extracted {len(handler.places):,} place nodes")
if args.naptan:
added = _append_naptan_dlr_stations(handler.places, args.naptan)
print(f"Added {added:,} DLR station destinations from NaPTAN")
if handler.places:
df = pl.DataFrame(handler.places)

View file

@ -0,0 +1,81 @@
import polars as pl
from pipeline.download.places import (
_is_dlr_station,
_is_tram_station,
_naptan_dlr_stations,
_station_display_name,
)
def test_dlr_light_rail_is_not_treated_as_tram():
    """A DLR station tagged station=light_rail is DLR, not a tram stop."""
    dlr_tags = {
        "name": "Lewisham DLR",
        "railway": "station",
        "station": "light_rail",
        "network": "Docklands Light Railway",
    }
    assert _is_dlr_station(dlr_tags)
    assert not _is_tram_station(dlr_tags)
    # Display names end in "DLR station" whichever way the raw name is
    # suffixed.
    assert _station_display_name("Lewisham DLR", dlr_tags) == "Lewisham DLR station"
    assert (
        _station_display_name("Tower Gateway Station DLR", dlr_tags)
        == "Tower Gateway DLR station"
    )
def test_tram_light_rail_is_still_excluded():
    """A light-rail station on a tram network is still classified as a tram."""
    tram_tags = {
        "name": "East Croydon",
        "railway": "station",
        "station": "light_rail",
        "network": "London Trams",
    }
    assert not _is_dlr_station(tram_tags)
    assert _is_tram_station(tram_tags)
def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
    """Rows sharing a ZZDL code merge into one station with averaged coordinates."""
    naptan = tmp_path / "naptan.parquet"
    # Two Shadwell rows share code SHA, Greenwich has its own code, and the
    # last two rows carry no ZZDL code in their id.
    pl.DataFrame(
        {
            "id": [
                "4900ZZDLSHA3",
                "9400ZZDLSHA",
                "4900ZZDLGRE1",
                "490002076RV",
                "4900ZZLUBNK",
            ],
            "name": [
                "Shadwell DLR",
                "Shadwell DLR Station",
                "Greenwich Station",
                "Tower Gateway Station DLR",
                "Bank",
            ],
            "category": [
                "Tube station",
                "Tube station",
                "Rail station",
                "Bus stop",
                "Tube station",
            ],
            "lat": [51.51156, 51.511693, 51.47794, 51.510575, 51.5131],
            "lng": [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894],
        }
    ).write_parquet(naptan)
    stations = _naptan_dlr_stations(naptan)
    # Output is sorted by display name.
    assert [station["name"] for station in stations] == [
        "Greenwich DLR station",
        "Shadwell DLR station",
    ]
    shadwell = next(
        station for station in stations if station["name"].startswith("Shadwell")
    )
    assert shadwell["lat"] == (51.51156 + 51.511693) / 2
    assert shadwell["place_type"] == "station"
    assert shadwell["travel_destination"] is True

View file

@ -56,6 +56,7 @@ NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
def _download_http(
@ -473,10 +474,50 @@ def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
download_naptan()
print("Converting TfL TransXChange → GTFS...")
# The shim patches known packaging/runtime issues in the pinned npm package
# before loading its CLI from npx's temporary install.
shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
subprocess.run(
["npx", "--yes", "transxchange2gtfs", str(txc_path), str(dest)],
[
"npx",
"--yes",
"--package",
TRANSXCHANGE2GTFS_PACKAGE,
"sh",
"-c",
"\n".join(
[
'bin="$(command -v transxchange2gtfs)"',
'script="$(readlink -f "$bin")"',
'pkg_dir="$(dirname "$(dirname "$script")")"',
'shim="$1"',
"shift",
'exec node "$shim" "$pkg_dir" "$@"',
]
),
"transxchange2gtfs",
str(shim_path.resolve()),
str(txc_path.resolve()),
str(dest.resolve()),
],
check=True,
)
required_files = {
"agency.txt",
"calendar.txt",
"calendar_dates.txt",
"routes.txt",
"stop_times.txt",
"stops.txt",
"trips.txt",
}
if not dest.exists() or not zipfile.is_zipfile(dest):
raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
with zipfile.ZipFile(dest) as z:
missing = required_files - set(z.namelist())
if missing:
missing_str = ", ".join(sorted(missing))
raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
size_mb = dest.stat().st_size / (1024 * 1024)
print(f" Saved to {dest} ({size_mb:.1f} MB)")
return dest

View file

@ -0,0 +1,76 @@
#!/usr/bin/env node
"use strict";
const fs = require("fs");
const path = require("path");
const { createRequire } = require("module");
// First argument: directory of the installed converter package. The remaining
// arguments are forwarded to the converter CLI; at least two are required
// (one or more inputs followed by the output path).
const [pkgDirArg, ...converterArgs] = process.argv.slice(2);
if (!pkgDirArg || converterArgs.length < 2) {
  console.error(
    "Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
  );
  // Exit status 2 signals a usage error.
  process.exit(2);
}
// Absolute package directory; all patched files are resolved against it.
const pkgDir = path.resolve(pkgDirArg);
/**
 * Replace the first occurrence of `before` with `after` in a package file.
 *
 * The patch is safe to re-run: if `before` is absent but `after` is already
 * present, the file is left untouched.
 *
 * @param {string} relativePath File path relative to the package directory.
 * @param {string} before Exact text expected in the unpatched file.
 * @param {string} after Text to put in its place.
 * @throws {Error} If the file contains neither `before` nor `after`.
 */
function replaceOnce(relativePath, before, after) {
  const file = path.join(pkgDir, relativePath);
  const original = fs.readFileSync(file, "utf8");
  if (original.includes(before)) {
    // Use a replacer function so `after` is inserted verbatim. A plain string
    // replacement would have `$&`, `$1`, `$$`, ... expanded by replace().
    fs.writeFileSync(file, original.replace(before, () => after));
    return;
  }
  if (original.includes(after)) {
    // Already patched by an earlier run.
    return;
  }
  throw new Error(`Could not patch ${relativePath}: expected text not found`);
}
// The published 1.12.0 package has a few compatibility issues with current
// TfL TransXChange exports:
// - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
// - the compiled date-holidays import expects a synthetic default export
// - some TfL journeys reference timing links without matching route-link geometry
//
// GTFS shapes are optional for R5 routing. Clear shape references and omit
// shapes.txt so missing route geometry does not drop otherwise usable trips.
/**
 * Apply every source patch to the installed package, in order.
 *
 * Each entry is [file relative to the package directory, text to find,
 * replacement text]. The two Container.js entries run in sequence on the
 * same file.
 */
function patchPackage() {
  const patches = [
    [
      "dist/transxchange/TransXChangeJourneyStream.js",
      "distanceSoFarM += routeLink.Distance;",
      "distanceSoFarM += routeLink ? routeLink.Distance : 0;",
    ],
    [
      "dist/gtfs/TripsStream.js",
      "(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
      "\"\");",
    ],
    [
      "dist/gtfs/StopTimesStream.js",
      "stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
      "\"\", stop.exactTime ? \"1\" : \"0\");",
    ],
    [
      "dist/Container.js",
      "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
      "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
    ],
    [
      "dist/Container.js",
      "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
      "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
    ],
  ];
  for (const [relativePath, before, after] of patches) {
    replaceOnce(relativePath, before, after);
  }
}
// Patch the package files on disk before any of its modules are loaded.
patchPackage();
// Resolve modules the way the package itself would, from its own directory.
const pkgRequire = createRequire(path.join(pkgDir, "package.json"));
const Holidays = pkgRequire("date-holidays");
// The compiled converter expects a `default` export on date-holidays; alias
// the module to itself when it has none.
if (!Holidays.default) {
  Holidays.default = Holidays;
}
// Rewrite argv so the CLI sees only its own arguments, then run it.
process.argv = [process.argv[0], "transxchange2gtfs", ...converterArgs];
require(path.join(pkgDir, "dist", "cli.js"));

View file

@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10
# IoD score columns that _build replaces with less-deprived percentiles
# (via _less_deprived_percentile_expr) before joining onto the properties.
_IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
    "Income Score (rate)",
    "Employment Score (rate)",
    "Health Deprivation and Disability Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
]
_AREA_COLUMNS = [
"Postcode",
@ -51,6 +60,14 @@ _AREA_COLUMNS = [
"Number of parks within 1km",
"Distance to nearest train or tube station (km)",
"Distance to nearest park (km)",
"Distance to nearest grocery store (km)",
"Distance to nearest tube station (km)",
"Distance to nearest rail station (km)",
"Distance to nearest Waitrose (km)",
"Distance to nearest Tesco (km)",
"Distance to nearest cafe (km)",
"Distance to nearest pub (km)",
"Distance to nearest restaurant (km)",
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
@ -76,6 +93,34 @@ _AREA_COLUMNS = [
]
def _is_dynamic_poi_metric_column(column: str) -> bool:
return (
column.startswith("Distance to nearest ")
and column.endswith(" POI (km)")
) or (
column.startswith("Number of ")
and (column.endswith(" POIs within 2km") or column.endswith(" POIs within 5km"))
)
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Convert an IoD deprivation score to a 0-100 less-deprived percentile.

    Nulls stay null. The lowest score maps to exactly 100 and the highest to
    exactly 0. Other values are scaled from their descending average rank and
    rounded to one decimal place. The result keeps the name ``column``.
    """
    score = pl.col(column)
    populated = score.count()
    rank_from_highest = score.rank("average", descending=True)
    scaled = ((rank_from_highest - 1) / (populated - 1) * 100).round(1)
    return (
        pl.when(score.is_null())
        .then(None)
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        .when(populated > 1)
        .then(scaled)
        .otherwise(100.0)
        .alias(column)
    )
def _build(
epc_pp_path: Path,
arcgis_path: Path,
@ -134,20 +179,11 @@ def _build(
)
wide = wide.join(arcgis, on="postcode", how="left")
iod = pl.scan_parquet(iod_path)
iod = pl.scan_parquet(iod_path).with_columns(
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
# Invert deprivation scores so that higher values = less deprived (better)
iod_score_cols = [
"Education, Skills and Training Score",
"Income Score (rate)",
"Employment Score (rate)",
"Health Deprivation and Disability Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
]
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,
@ -351,6 +387,14 @@ def _build(
"parks_1km": "Number of parks within 1km",
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
"parks_nearest_km": "Distance to nearest park (km)",
"grocery_store_nearest_km": "Distance to nearest grocery store (km)",
"tube_station_nearest_km": "Distance to nearest tube station (km)",
"rail_station_nearest_km": "Distance to nearest rail station (km)",
"waitrose_nearest_km": "Distance to nearest Waitrose (km)",
"tesco_nearest_km": "Distance to nearest Tesco (km)",
"cafe_nearest_km": "Distance to nearest cafe (km)",
"pub_nearest_km": "Distance to nearest pub (km)",
"restaurant_nearest_km": "Distance to nearest restaurant (km)",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
@ -381,10 +425,14 @@ def _build(
# Split into postcode-level and property-level dataframes
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
area_cols.extend(
c for c in df.columns if _is_dynamic_poi_metric_column(c) and c not in area_cols
)
area_col_set = set(area_cols)
postcode_df = df.select(area_cols).group_by("Postcode").first()
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
property_cols = [c for c in df.columns if c not in area_col_set or c == "Postcode"]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")

View file

@ -1,6 +1,8 @@
"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""
import argparse
import re
import unicodedata
from pathlib import Path
import polars as pl
@ -15,9 +17,25 @@ POI_GROUPS_2KM = {
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
}
# Groups for which to compute distance to nearest POI (from filtered POIs)
# Groups for which to compute distance to nearest POI (from filtered POIs).
# Keep `train_tube` for the existing backend feature; the individual POI
# distance filters below power the frontend dropdown.
DISTANCE_GROUPS = {
"train_tube": ["Tube station", "Rail station"],
"grocery_store": [
"Greengrocer",
"Supermarket",
"Convenience Store",
"Waitrose",
"Tesco",
],
"tube_station": ["Tube station"],
"rail_station": ["Rail station"],
"waitrose": ["Waitrose"],
"tesco": ["Tesco"],
"cafe": ["Café"],
"pub": ["Pub"],
"restaurant": ["Restaurant"],
}
# OS Open Greenspace function types used for park counts and distance calculation.
@ -27,6 +45,69 @@ GREENSPACE_PARK_FUNCTIONS = {
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
}
# A category in a count-threshold group only gets dynamic filter metrics when
# it has strictly more POIs than this.
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
# POI groups whose categories all get dynamic filter metrics.
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
# POI groups whose categories are subject to the POI-count threshold above.
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
def _poi_category_slug(category: str) -> str:
ascii_text = (
unicodedata.normalize("NFKD", category)
.encode("ascii", "ignore")
.decode("ascii")
.lower()
)
slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_")
return slug or "poi"
def _build_poi_category_groups(
    pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Build one proximity group for each POI category selected for filters.

    A category is selected when its group is in ``DYNAMIC_FILTER_ALL_GROUPS``,
    or when its group is in ``DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS`` and it
    has more than ``GROCERY_DYNAMIC_FILTER_MIN_POIS`` rows.

    Args:
        pois: POI rows with ``group`` and ``category`` columns.

    Returns:
        ``(groups, display_names)``. ``groups`` maps a unique ``poi_<slug>``
        key to a one-element list holding the category. ``display_names``
        maps the same key to the category's original name.

    Raises:
        ValueError: If ``pois`` has no ``group`` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    categories = (
        pois.group_by("group", "category")
        .len()
        .filter(
            pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS))
            | (
                pl.col("group").is_in(list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS))
                & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)
            )
        )
        .select("category")
        # A category that qualifies under more than one group must be listed
        # once. Otherwise it would get two keys with the same display name
        # and, after renaming, duplicate metric columns.
        .unique()
        .sort("category")
        .to_series()
        .to_list()
    )

    groups: dict[str, list[str]] = {}
    display_names: dict[str, str] = {}
    for category in categories:
        if not isinstance(category, str) or not category:
            continue
        base_slug = f"poi_{_poi_category_slug(category)}"
        # Different categories can share a slug (e.g. "Café" / "Cafe"). Take
        # the first free key: base, base_2, base_3, ... Checking against the
        # keys already issued also covers a suffixed key that matches some
        # other category's own slug.
        group_key = base_slug
        suffix = 2
        while group_key in groups:
            group_key = f"{base_slug}_{suffix}"
            suffix += 1
        groups[group_key] = [category]
        display_names[group_key] = category
    return groups, display_names
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
renames: dict[str, str] = {}
for group_key, category in display_names.items():
renames[f"{group_key}_nearest_km"] = f"Distance to nearest {category} POI (km)"
renames[f"{group_key}_2km"] = f"Number of {category} POIs within 2km"
renames[f"{group_key}_5km"] = f"Number of {category} POIs within 5km"
return renames
def main():
parser = argparse.ArgumentParser(
@ -56,12 +137,35 @@ def main():
)
pois = pl.read_parquet(args.pois)
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
# Count amenity POIs within 2km
counts_2km = count_pois_per_postcode(
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
)
# Dynamic POI filters: nearest distance plus counts within 2km and 5km for
# the selected public transport, grocery, and leisure categories.
dynamic_counts_2km = count_pois_per_postcode(
postcodes, pois, groups=poi_category_groups, radius_km=2
)
dynamic_counts_5km = count_pois_per_postcode(
postcodes, pois, groups=poi_category_groups, radius_km=5
)
dynamic_distances = min_distance_per_postcode(
postcodes, pois, groups=poi_category_groups
)
dynamic_renames = _dynamic_poi_metric_renames(poi_display_names)
dynamic_counts_2km = dynamic_counts_2km.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns}
)
dynamic_counts_5km = dynamic_counts_5km.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns}
)
dynamic_distances = dynamic_distances.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
)
# Distance to nearest train/tube station (from filtered POIs)
distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)
@ -77,6 +181,9 @@ def main():
# Join all results on postcode
result = (
counts_2km.join(distances, on="postcode")
.join(dynamic_counts_2km, on="postcode")
.join(dynamic_counts_5km, on="postcode")
.join(dynamic_distances, on="postcode")
.join(park_counts_1km, on="postcode")
.join(park_distances, on="postcode")
)

View file

@ -0,0 +1,33 @@
import polars as pl
from pipeline.transform.merge import (
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
)
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score maps to 100, the highest to 0, and a null stays null."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentile_expr = _less_deprived_percentile_expr(column)
    percentiles = scores.lazy().with_columns(percentile_expr).collect()[column]

    assert percentiles.to_list() == [100.0, 50.0, 0.0, None]
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum scores all map to exactly 100 and tied maximum scores to 0."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentile_expr = _less_deprived_percentile_expr(column)
    percentiles = scores.lazy().with_columns(percentile_expr).collect()[column]

    assert percentiles.to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Check which column names `_is_dynamic_poi_metric_column` accepts."""
    # Generated per-category names: nearest distance plus 2km / 5km counts.
    dynamic_columns = (
        "Distance to nearest Cafe POI (km)",
        "Number of Cafe POIs within 2km",
        "Number of Cafe POIs within 5km",
    )
    for column in dynamic_columns:
        assert _is_dynamic_poi_metric_column(column)

    # A count column without the "POIs" wording must not be matched.
    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")

View file

@ -0,0 +1,41 @@
import polars as pl
from pipeline.transform.poi_proximity import _build_poi_category_groups
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
    """Only the expected categories get a dynamic POI group.

    The fixture holds 605 co-located POIs. Tesco (101 rows) is expected to be
    kept while Waitrose (100 rows), School and Pharmacy are expected to be
    dropped.
    NOTE(review): presumably groceries need more than 100 POIs and the
    Education/Health groups are never selected - confirm against
    `_build_poi_category_groups`.
    """
    # (group, category, number of POIs), in the row order of the fixture.
    fixture_rows = [
        ("Public Transport", "Rail station", 1),
        ("Public Transport", "Bus stop", 1),
        ("Leisure", "Café", 1),
        ("Leisure", "Restaurant", 1),
        ("Groceries", "Tesco", 101),
        ("Groceries", "Waitrose", 100),
        ("Education", "School", 200),
        ("Health", "Pharmacy", 200),
    ]
    group_column: list[str] = []
    category_column: list[str] = []
    for group_name, category_name, row_count in fixture_rows:
        group_column.extend([group_name] * row_count)
        category_column.extend([category_name] * row_count)

    total_rows = len(group_column)
    pois = pl.DataFrame(
        {
            "group": group_column,
            "category": category_column,
            "lat": [51.5] * total_rows,
            "lng": [-0.1] * total_rows,
        }
    )

    groups, display_names = _build_poi_category_groups(pois)

    expected_display_names = {
        "Bus stop",
        "Café",
        "Rail station",
        "Restaurant",
        "Tesco",
    }
    assert set(display_names.values()) == expected_display_names
    for excluded_key in ("poi_waitrose", "poi_school", "poi_pharmacy"):
        assert excluded_key not in groups

View file

@ -1128,12 +1128,18 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
if display_name is None:
raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
return display_name
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category name for a grocery POI.

    Args:
        fascia: Store fascia; used for the lookup whenever it is truthy.
        retailer: Retailer name, used only when ``fascia`` is ``None`` or empty.

    Returns:
        The ``GROCERY_FASCIA_ICON_NAMES`` entry for ``fascia`` when a fascia is
        given; otherwise the result of ``normalize_grocery_retailer(retailer)``.

    Raises:
        ValueError: If ``fascia`` is given but has no entry in
            ``GROCERY_FASCIA_ICON_NAMES``.
    """
    if fascia:
        # Strict lookup: an unmapped fascia must fail loudly instead of being
        # passed through the retailer display-name table as a fallback.
        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia)
        if icon_name is None:
            raise ValueError(f"Missing grocery fascia icon name for {fascia!r}")
        return icon_name
    return normalize_grocery_retailer(retailer)

View file

@ -2,9 +2,12 @@
import numpy as np
import polars as pl
from scipy.spatial import cKDTree
from .haversine import haversine_km
EARTH_RADIUS_KM = 6371.0088
def _build_poi_grid(
pois: pl.DataFrame, grid_size: float = 0.05
@ -49,6 +52,21 @@ def _get_nearby_indices(
return np.concatenate(nearby_indices)
def _project_lat_lng_km(
    lats: np.ndarray, lngs: np.ndarray, origin_lat: float
) -> np.ndarray:
    """Project WGS84 coordinates to local km coordinates for nearest-neighbour lookup.

    Longitudes are scaled by the cosine of ``origin_lat`` so that both output
    axes are in kilometres; latitudes are scaled by the earth radius alone.

    Args:
        lats: Latitudes in degrees.
        lngs: Longitudes in degrees.
        origin_lat: Reference latitude in degrees used for the longitude scale.

    Returns:
        Array with one row per input point: column 0 is the scaled longitude
        (km), column 1 the scaled latitude (km).
    """
    east_km = EARTH_RADIUS_KM * np.radians(lngs) * np.cos(np.radians(origin_lat))
    north_km = EARTH_RADIUS_KM * np.radians(lats)
    return np.column_stack([east_km, north_km])
def count_pois_per_postcode(
postcodes_df: pl.DataFrame,
pois: pl.DataFrame,
@ -136,7 +154,7 @@ def min_distance_per_postcode(
) -> pl.DataFrame:
"""
For each postcode, compute the distance (km) to the closest POI per group.
Returns NaN where no POI of that group exists within the grid search range (~5.5km).
Returns NaN where no POI of that group exists.
"""
print("Computing minimum POI distances per postcode...")
@ -144,51 +162,84 @@ def min_distance_per_postcode(
n_pois = len(pois)
print(f" {n_postcodes:,} postcodes, {n_pois:,} POIs")
grid_size = 0.05
print(" Building POI spatial grid...")
poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
print(f" POI grid has {len(poi_grid):,} occupied cells")
category_masks = {}
for group, categories in groups.items():
mask = np.isin(poi_cats, categories)
category_masks[group] = mask
print(f" {group}: {mask.sum():,} POIs")
pc_lats = postcodes_df["lat"].to_numpy()
pc_lons = postcodes_df["lon"].to_numpy()
pc_codes = postcodes_df["postcode"].to_list()
valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons)
valid_pc_indices = np.flatnonzero(valid_pc_mask)
result_min_dist = {
group: np.full(n_postcodes, np.nan, dtype=np.float32) for group in groups
}
batch_size = 50000
n_batches = (n_postcodes + batch_size - 1) // batch_size
print(f" Processing {n_postcodes:,} postcodes in {n_batches} batches...")
if n_pois == 0 or len(valid_pc_indices) == 0:
print(" No valid postcode/POI coordinates; returning NaN distances")
return pl.DataFrame(
{
"postcode": pc_codes,
**{
f"{group}_nearest_km": values
for group, values in result_min_dist.items()
},
}
)
for batch_idx in range(n_batches):
start_idx = batch_idx * batch_size
end_idx = min(start_idx + batch_size, n_postcodes)
poi_lats = pois["lat"].to_numpy()
poi_lngs = pois["lng"].to_numpy()
poi_cats = pois["category"].to_numpy()
valid_poi_mask = np.isfinite(poi_lats) & np.isfinite(poi_lngs)
origin_lat = float(np.nanmean(pc_lats[valid_pc_mask]))
query_xy = _project_lat_lng_km(
pc_lats[valid_pc_indices], pc_lons[valid_pc_indices], origin_lat
)
if batch_idx % 5 == 0:
print(
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
)
batch_size = 200_000
n_batches = (len(valid_pc_indices) + batch_size - 1) // batch_size
for i in range(start_idx, end_idx):
nearby = _get_nearby_indices(pc_lats[i], pc_lons[i], poi_grid, grid_size)
if nearby is None:
continue
for group, categories in groups.items():
group_indices = np.flatnonzero(valid_poi_mask & np.isin(poi_cats, categories))
print(f" {group}: {len(group_indices):,} POIs")
if len(group_indices) == 0:
continue
distances = haversine_km(
poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
)
poi_xy = _project_lat_lng_km(
poi_lats[group_indices], poi_lngs[group_indices], origin_lat
)
tree = cKDTree(poi_xy)
k = min(8, len(group_indices))
for group, cat_mask in category_masks.items():
group_mask = cat_mask[nearby]
if group_mask.any():
result_min_dist[group][i] = distances[group_mask].min()
for batch_idx in range(n_batches):
start_idx = batch_idx * batch_size
end_idx = min(start_idx + batch_size, len(valid_pc_indices))
batch_pc_indices = valid_pc_indices[start_idx:end_idx]
batch_xy = query_xy[start_idx:end_idx]
if batch_idx == 0 or (batch_idx + 1) % 5 == 0:
print(
f" Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
)
_, nearest = tree.query(batch_xy, k=k)
nearest = np.asarray(nearest)
if k == 1:
candidate_indices = group_indices[nearest]
distances = haversine_km(
poi_lats[candidate_indices],
poi_lngs[candidate_indices],
pc_lats[batch_pc_indices],
pc_lons[batch_pc_indices],
)
else:
candidate_indices = group_indices[nearest]
distances = haversine_km(
poi_lats[candidate_indices],
poi_lngs[candidate_indices],
pc_lats[batch_pc_indices, None],
pc_lons[batch_pc_indices, None],
).min(axis=1)
result_min_dist[group][batch_pc_indices] = distances.astype(np.float32)
result_data = {"postcode": pc_codes}
for group in groups:

View file

@ -113,9 +113,9 @@ def test_min_distance_finds_nearest(postcodes, pois):
# Restaurant is co-located — distance ~0
assert ec1a["restaurants_nearest_km"][0] < 0.01
# Far-away postcode should have NaN (no POIs within grid range)
# Far-away postcode should still get the global nearest distance.
zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
assert np.isnan(zz99["train_tube_nearest_km"][0])
assert zz99["train_tube_nearest_km"][0] > 300
def test_min_distance_no_pois_returns_nan(postcodes):