Rerun data pipelines

This commit is contained in:
Andras Schmelczer 2026-05-10 14:49:53 +01:00
parent 4c95815dc8
commit fc10381692
27 changed files with 2143 additions and 215 deletions

393
pipeline/download/crime.py Normal file
View file

@ -0,0 +1,393 @@
"""Download police.uk crime archive ZIPs.
The archive page lists rolling monthly snapshots. Newer snapshots overlap older
ones, so extraction keeps files already written by newer archives.
"""
from __future__ import annotations
import argparse
import hashlib
import html
import json
import re
import shutil
import sys
import zipfile
from dataclasses import asdict, dataclass
from datetime import UTC, datetime
from pathlib import Path, PurePosixPath
from urllib.parse import urljoin
import httpx
from tqdm import tqdm
ARCHIVE_URL = "https://data.police.uk/data/archive/"
ARCHIVE_LINK_RE = re.compile(
r'<div class="download">\s*.*?'
r'<a href="(?P<href>/data/archive/(?P<month>\d{4}-\d{2})\.zip)">'
r"(?P<label>[^<]+)</a>\s*\((?P<size>[^)]+)\)</span>\s*"
r'<p class="contained-range">\s*(?P<contained_range>.*?)\s*</p>\s*'
r'<p class="md5sum">(?P<md5>.*?)</p>',
re.DOTALL,
)
VALID_MD5_RE = re.compile(r"^[0-9a-fA-F]{32}$")
MONTH_RE = re.compile(r"^\d{4}-\d{2}$")
@dataclass(frozen=True)
class CrimeArchive:
    """One monthly ZIP entry parsed from the police.uk archive index page."""

    # Snapshot month in YYYY-MM form, captured from the ZIP link's href.
    month: str
    # Cleaned link text of the download anchor.
    label: str
    # Absolute download URL (the href joined onto the index URL).
    url: str
    # Final path component of the href; used as the local ZIP file name.
    filename: str
    # Cleaned text shown in parentheses after the download link.
    size: str
    # Cleaned text of the "contained-range" paragraph.
    contained_range: str
    # Lower-cased 32-hex-digit checksum, or None when the page value is malformed.
    md5: str | None
    # Cleaned, lower-cased checksum text as found on the page (kept for reporting).
    raw_md5: str
def _clean_text(value: str) -> str:
    """Strip markup from an HTML fragment and collapse it to single-spaced text."""
    # Tags become spaces so words separated only by markup do not run together.
    without_tags = re.sub(r"<[^>]+>", " ", value)
    decoded = html.unescape(without_tags)
    # Entities such as &nbsp; decode to whitespace, so collapse after unescaping.
    collapsed = re.sub(r"\s+", " ", decoded)
    return collapsed.strip()
def parse_archives(page_html: str, base_url: str = ARCHIVE_URL) -> list[CrimeArchive]:
    """Return every monthly archive listed in the index HTML, in page order.

    Args:
        page_html: Raw HTML of the archive index page.
        base_url: URL the page was fetched from; relative hrefs are resolved
            against it.

    Returns:
        One ``CrimeArchive`` per match of ``ARCHIVE_LINK_RE``. A checksum that
        is not 32 hex digits is kept in ``raw_md5`` while ``md5`` is ``None``.
    """

    def build(match: re.Match[str]) -> CrimeArchive:
        href = html.unescape(match["href"])
        checksum_text = _clean_text(match["md5"]).lower()
        return CrimeArchive(
            month=match["month"],
            label=_clean_text(match["label"]),
            url=urljoin(base_url, href),
            filename=Path(href).name,
            size=_clean_text(match["size"]),
            contained_range=_clean_text(match["contained_range"]),
            md5=checksum_text if VALID_MD5_RE.fullmatch(checksum_text) else None,
            raw_md5=checksum_text,
        )

    return [build(match) for match in ARCHIVE_LINK_RE.finditer(page_html)]
def fetch_archives(archive_url: str = ARCHIVE_URL) -> list[CrimeArchive]:
    """Download the archive index page and return the archives it lists.

    Raises:
        httpx.HTTPStatusError: The index request returned an error status.
        RuntimeError: The page contained no monthly archive links.
    """
    client_options = {
        "follow_redirects": True,
        "timeout": httpx.Timeout(30.0, read=60.0),
        "headers": {"User-Agent": "perfect-postcode-data-pipeline/1.0"},
    }
    with httpx.Client(**client_options) as client:
        response = client.get(archive_url)
        response.raise_for_status()
        page_html = response.text

    found = parse_archives(page_html, archive_url)
    if found:
        return found
    raise RuntimeError(f"No monthly archive ZIPs found at {archive_url}")
def filter_archives(
    archives: list[CrimeArchive],
    *,
    from_month: str | None = None,
    to_month: str | None = None,
    limit: int | None = None,
) -> list[CrimeArchive]:
    """Select archives whose month lies within the given inclusive bounds.

    Months are compared as YYYY-MM strings, so lexicographic order equals
    chronological order. The input order is preserved, and ``limit`` (when
    given) is applied as a slice after the bounds have been checked.
    """
    kept: list[CrimeArchive] = []
    for candidate in archives:
        if from_month is not None and candidate.month < from_month:
            continue
        if to_month is not None and candidate.month > to_month:
            continue
        kept.append(candidate)
    return kept if limit is None else kept[:limit]
def file_md5(path: Path) -> str:
    """Return the hex MD5 digest of the file at ``path``.

    The digest is only used as a download-integrity checksum, so the hash is
    created with ``usedforsecurity=False``; without it ``hashlib.md5`` can be
    refused on FIPS-restricted builds. The file is read in 1 MiB chunks so
    large ZIPs are never held in memory.
    """
    digest = hashlib.md5(usedforsecurity=False)
    with path.open("rb") as file:
        while chunk := file.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
def _stream_to_partial(
    archive: CrimeArchive,
    partial: Path,
    *,
    resume_from: int,
    timeout: float,
) -> bool:
    """Stream the archive into ``partial``, resuming at ``resume_from`` bytes.

    Returns False (writing nothing) when the server rejects the resume offset
    with 416 Range Not Satisfiable; returns True once the body is written.
    """
    headers = {"Range": f"bytes={resume_from}-"} if resume_from else {}
    with httpx.stream(
        "GET",
        archive.url,
        headers=headers,
        follow_redirects=True,
        timeout=httpx.Timeout(30.0, read=timeout),
    ) as response:
        if resume_from and response.status_code == 416:
            return False
        if response.status_code == 206 and resume_from:
            # Server honoured the Range request: append to what we already have.
            mode = "ab"
            initial = resume_from
        else:
            # Full body (or an error): start the partial file from scratch.
            response.raise_for_status()
            mode = "wb"
            initial = 0
        total_header = int(response.headers.get("content-length", 0))
        total = initial + total_header if total_header else None
        with (
            partial.open(mode) as output,
            tqdm(
                total=total,
                initial=initial,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=archive.filename,
            ) as progress,
        ):
            for chunk in response.iter_bytes(chunk_size=1024 * 1024):
                output.write(chunk)
                progress.update(len(chunk))
    return True


def download_archive(
    archive: CrimeArchive,
    archive_dir: Path,
    *,
    verify: bool,
    force: bool,
    timeout: float,
) -> Path:
    """Download one archive ZIP, resuming an existing .part file when possible.

    Args:
        archive: Archive to download.
        archive_dir: Directory that receives ``<filename>`` (and the temporary
            ``<filename>.part`` while the download is in progress).
        verify: Compare the MD5 of the file with ``archive.md5`` when one is
            available.
        force: Delete any existing ZIP / partial file and download again.
        timeout: Per-read timeout in seconds for the streaming request.

    Returns:
        Path of the downloaded ZIP.

    Raises:
        httpx.HTTPStatusError: The server returned an error status.
        RuntimeError: The downloaded file does not match the expected MD5.
    """
    dest = archive_dir / archive.filename
    partial = dest.with_suffix(dest.suffix + ".part")
    if force:
        dest.unlink(missing_ok=True)
        partial.unlink(missing_ok=True)

    if dest.exists():
        if not (verify and archive.md5 is not None):
            print(f"{archive.filename}: already downloaded")
            return dest
        if file_md5(dest) == archive.md5:
            print(f"{archive.filename}: already downloaded")
            return dest
        print(
            f"{archive.filename}: checksum mismatch, downloading again",
            file=sys.stderr,
        )
        dest.unlink()
        partial.unlink(missing_ok=True)

    resume_from = partial.stat().st_size if partial.exists() else 0
    if not _stream_to_partial(
        archive, partial, resume_from=resume_from, timeout=timeout
    ):
        # The server answered 416: the partial file is at least as long as the
        # remote file (stale or already complete). Without this, every rerun
        # would fail the same way until the user passed --force.
        print(
            f"{archive.filename}: server rejected resume offset, restarting download",
            file=sys.stderr,
        )
        partial.unlink(missing_ok=True)
        _stream_to_partial(archive, partial, resume_from=0, timeout=timeout)

    partial.replace(dest)
    if verify and archive.md5 is not None:
        actual_md5 = file_md5(dest)
        if actual_md5 != archive.md5:
            dest.unlink(missing_ok=True)
            raise RuntimeError(
                f"{archive.filename}: MD5 mismatch: expected {archive.md5}, got {actual_md5}"
            )
    return dest
def _safe_csv_members(
    archive: zipfile.ZipFile,
) -> list[tuple[zipfile.ZipInfo, PurePosixPath]]:
    """List the CSV members of ``archive`` that are safe to extract.

    Directories, absolute paths, paths containing ``..`` and anything whose
    suffix is not ``.csv`` (case-insensitive) are left out. Each result pairs
    the ZIP entry with its member name parsed as a POSIX path.
    """

    def is_safe_csv(info: zipfile.ZipInfo, rel_path: PurePosixPath) -> bool:
        if info.is_dir() or rel_path.is_absolute():
            return False
        if ".." in rel_path.parts:
            return False
        return rel_path.suffix.lower() == ".csv"

    candidates = ((info, PurePosixPath(info.filename)) for info in archive.infolist())
    return [
        (info, rel_path)
        for info, rel_path in candidates
        if is_safe_csv(info, rel_path)
    ]
def extract_csvs(
    zip_path: Path,
    output_dir: Path,
    *,
    overwrite: bool = False,
) -> tuple[int, int]:
    """Extract CSVs from one ZIP. Returns (extracted, skipped).

    Args:
        zip_path: Archive to read.
        output_dir: Root directory; member paths are recreated below it.
        overwrite: Replace CSVs that already exist. When False, existing files
            are counted as skipped and left untouched.

    Each member is written to a sibling ``.part`` file and renamed into place
    only after it has been copied completely. An interrupted run therefore
    never leaves a truncated CSV that a later run would skip as "existing".
    """
    extracted = 0
    skipped = 0
    with zipfile.ZipFile(zip_path) as archive:
        for info, rel_path in _safe_csv_members(archive):
            dest = output_dir.joinpath(*rel_path.parts)
            if dest.exists() and not overwrite:
                skipped += 1
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            staging = dest.with_name(dest.name + ".part")
            try:
                with archive.open(info) as source, staging.open("wb") as target:
                    shutil.copyfileobj(source, target)
                staging.replace(dest)
            except BaseException:
                # Do not leave half-written staging files behind on failure.
                staging.unlink(missing_ok=True)
                raise
            extracted += 1
    return extracted, skipped
def write_manifest(
output_dir: Path, archive_url: str, archives: list[CrimeArchive]
) -> None:
manifest = {
"source": archive_url,
"fetched_at": datetime.now(UTC).isoformat(),
"archives": [asdict(archive) for archive in archives],
}
path = output_dir / "archive_manifest.json"
path.write_text(json.dumps(manifest, indent=2) + "\n")
def _month_arg(value: str) -> str:
    """argparse ``type=`` callback that validates a YYYY-MM month argument.

    Returns the value unchanged when valid.

    Raises:
        argparse.ArgumentTypeError: The value is not four digits, a dash and
            two digits, or the month part is outside 01-12.
    """
    # ``\d`` also matches non-ASCII digits, which would then compare incorrectly
    # against the ASCII months parsed from the archive page.
    if not (value.isascii() and MONTH_RE.fullmatch(value)):
        raise argparse.ArgumentTypeError("month must be in YYYY-MM format")
    if not 1 <= int(value[5:]) <= 12:
        raise argparse.ArgumentTypeError(
            "month must be in YYYY-MM format with MM between 01 and 12"
        )
    return value
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the crime archive downloader."""
    parser = argparse.ArgumentParser(
        description="Download all monthly police.uk crime archive ZIPs"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Directory for extracted CSVs; ZIPs are kept under _archives/",
    )
    parser.add_argument(
        "--archive-url",
        default=ARCHIVE_URL,
        help=f"Archive index URL (default: {ARCHIVE_URL})",
    )
    parser.add_argument(
        "--from-month",
        type=_month_arg,
        help="Only download archives from this YYYY-MM onwards",
    )
    parser.add_argument(
        "--to-month",
        type=_month_arg,
        help="Only download archives up to this YYYY-MM",
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Download at most this many archives after filtering",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="Print the archive URLs that would be downloaded and exit",
    )
    parser.add_argument(
        "--no-extract",
        dest="extract",
        action="store_false",
        help="Download ZIPs only; do not extract CSVs",
    )
    parser.add_argument(
        "--overwrite-extracted",
        action="store_true",
        help="Overwrite CSVs when extracting overlapping archive snapshots",
    )
    parser.add_argument(
        "--no-verify",
        dest="verify",
        action="store_false",
        help="Skip MD5 verification",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Redownload archives even if ZIP files already exist",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=600.0,
        help="Per-read timeout in seconds for large ZIP downloads",
    )
    return parser


def main() -> None:
    """Command-line entry point: list, download and optionally extract archives.

    Fetches the archive index, applies the month/limit filters, writes a
    manifest next to the extracted data, then downloads each ZIP into
    ``<output>/_archives`` and (unless ``--no-extract``) extracts its CSVs
    into ``<output>``.
    """
    parser = _build_parser()
    args = parser.parse_args()
    if args.limit is not None and args.limit < 1:
        # A negative value would otherwise slice archives off the END of the
        # filtered list instead of capping how many are downloaded.
        parser.error("--limit must be a positive integer")

    print("Fetching police.uk archive index...")
    archives = filter_archives(
        fetch_archives(args.archive_url),
        from_month=args.from_month,
        to_month=args.to_month,
        limit=args.limit,
    )
    if not archives:
        raise SystemExit("No archives matched the requested filters")

    # Archives whose checksum text is present but not a valid MD5 are still
    # downloaded, just without verification; tell the user which ones.
    bad_md5 = [
        archive.filename for archive in archives if archive.raw_md5 and not archive.md5
    ]
    if bad_md5:
        print(
            "Warning: ignoring malformed MD5 values for "
            + ", ".join(bad_md5[:5])
            + ("..." if len(bad_md5) > 5 else ""),
            file=sys.stderr,
        )

    print(f"Found {len(archives)} monthly archive ZIPs")
    if args.list:
        for archive in archives:
            print(f"{archive.month}\t{archive.url}\t{archive.raw_md5}")
        return

    args.output.mkdir(parents=True, exist_ok=True)
    archive_dir = args.output / "_archives"
    archive_dir.mkdir(parents=True, exist_ok=True)
    write_manifest(args.output, args.archive_url, archives)

    total_extracted = 0
    total_skipped = 0
    for index, archive in enumerate(archives, start=1):
        print(f"[{index}/{len(archives)}] {archive.label} ({archive.size})")
        zip_path = download_archive(
            archive,
            archive_dir,
            verify=args.verify,
            force=args.force,
            timeout=args.timeout,
        )
        if args.extract:
            extracted, skipped = extract_csvs(
                zip_path,
                args.output,
                overwrite=args.overwrite_extracted,
            )
            total_extracted += extracted
            total_skipped += skipped
            print(
                f"{archive.filename}: extracted {extracted} CSVs"
                + (f", skipped {skipped} existing CSVs" if skipped else "")
            )

    if args.extract:
        print(
            f"Done. ZIPs saved in {archive_dir}; extracted {total_extracted} CSVs"
            + (f" and skipped {total_skipped} existing CSVs" if total_skipped else "")
            + "."
        )
    else:
        print(f"Done. ZIPs saved in {archive_dir}.")
if __name__ == "__main__":
main()

View file

@ -1,9 +1,15 @@
import argparse
import base64
import json
import re
import sys
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from pathlib import Path
from PIL import Image, ImageDraw
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
@ -14,53 +20,80 @@ POI_ICON_BASE = "https://geolytix.github.io/MapIcons"
# Font stacks used by @protomaps/basemaps with lang='en'
FONT_STACKS = ["Noto Sans Regular", "Noto Sans Italic", "Noto Sans Medium"]
# Fallback emoji not in any category
_FALLBACK_EMOJIS = ["📍"]
POI_ICON_PATHS = [
"asda/asda_express_24px.svg",
"asda/asda_green_basket_24px.svg",
"asda/asda_green_trolley_24px.svg",
"asda/asda_living_24px.svg",
"asda/asda_pfs_24px.svg",
"asda/asda_primary.svg",
"asda/asda_superstore_green_trolley_24px.svg",
"brands/aldi_24px.svg",
"brands/amazon_fresh_alt_24px.svg",
"brands/booths_24px.svg",
"brands/budgens_24px.svg",
"brands/centra_24px.svg",
"brands/cook.svg",
"brands/coop_24px.svg",
"brands/costco_24px.svg",
"brands/dunnes_stores_24px.svg",
"brands/farmfoods_updated_24px.svg",
"brands/heron_24px.svg",
"brands/iceland_24px.svg",
"brands/iceland_food_warehouse_24px.svg",
"brands/lidl_24px.svg",
"brands/little_waitrose_24px.svg",
"brands/makro_24px.svg",
"brands/mns_24px.svg",
"brands/mns_food_24px.svg",
"brands/mns_high_street_24px.svg",
"brands/mns_hospital_24px.svg",
"brands/mns_moto_24px.svg",
"brands/mns_outlet_24px.svg",
"brands/morrisons_24px.svg",
"brands/morrisons_daily_24px.svg",
"brands/sainsburys_24px.svg",
"brands/sainsburys_local_24px.svg",
"brands/spar_24px.svg",
"brands/tesco_24px.svg",
"brands/tesco_express_24px.svg",
"brands/tesco_extra_24px.svg",
"brands/waitrose_24px.svg",
"brands/wholefoods_24px.svg",
"logos/planet_organic_24px.svg",
"brands_2023/supermarkets/farmfoods.svg",
"brands_2023/supermarkets/heron_foods.svg",
"brands_2023/supermarkets/little_waitrose.svg",
"brands_2024/amazon_fresh.svg",
"brands_2024/booths.svg",
"brands_2024/budgens.svg",
"brands_2024/cook.svg",
"brands_2024/dunnes_stores.svg",
"brands_2024/iceland.svg",
"brands_2024/makro.svg",
"brands_2024/mns.svg",
"brands_2024/morrisons_daily.svg",
"brands_2024/sainsburys_local.svg",
"brands_2024/wholefoods.svg",
"logos/aldi.svg",
"logos/asda.svg",
"logos/centra.svg",
"logos/coop.svg",
"logos/lidl.svg",
"logos/morrisons.svg",
"logos/planet_organic.svg",
"logos/sainsburys.svg",
"logos/spar.svg",
"logos/tesco.svg",
"logos/tesco_express.svg",
"logos/tesco_extra.svg",
"logos/waitrose.svg",
"public_transport/london_tube.svg",
"visuals/mns.svg",
]
# Icons that are built from a downloaded marker SVG rather than copied as-is.
# Each entry is (kind, source_path, dest_path): source_path is fetched relative
# to POI_ICON_BASE, transformed according to kind ("costco_logo" or
# "embedded_png"), and written to dest_path under the POI icon directory.
DERIVED_POI_ICON_PATHS = [
    ("costco_logo", "brands/costco.svg", "logos/costco.svg"),
    (
        "embedded_png",
        "brands/iceland_food_warehouse_24px.svg",
        "logos/the_food_warehouse.png",
    ),
]
# Per-icon crop rectangles, keyed by icon path relative to the POI icon
# directory. Each value is (x, y, width, height) and is written into the SVG
# as its viewBox by crop_poi_svg_icons().
POI_ICON_SVG_CROPS = {
    "brands_2023/supermarkets/farmfoods.svg": (1.293, 7.314, 15.48, 3.293),
    "brands_2023/supermarkets/heron_foods.svg": (0.062, 6.68, 17.995, 5.325),
    "brands_2023/supermarkets/little_waitrose.svg": (0.916, 5.645, 16.365, 6.719),
    "brands_2024/amazon_fresh.svg": (3.817, 1.646, 16.367, 16.358),
    "brands_2024/booths.svg": (1.456, 7.143, 15.313, 3.512),
    "brands_2024/budgens.svg": (2.251, 2.278, 13.6, 13.612),
    "brands_2024/cook.svg": (5.028, 5.493, 13.945, 9.648),
    "brands_2024/dunnes_stores.svg": (4.375, 7.732, 15.249, 5.055),
    "brands_2024/iceland.svg": (1.136, 6.823, 16.067, 4.302),
    "brands_2024/makro.svg": (4.411, 6.098, 16.397, 5.428),
    "brands_2024/mns.svg": (4.042, 6.986, 16.171, 6.724),
    "brands_2024/morrisons_daily.svg": (3.341, 4.414, 17.317, 8.248),
    "brands_2024/sainsburys_local.svg": (4.58, 1.61, 14.84, 14.849),
    "brands_2024/wholefoods.svg": (4.17, 2.193, 15.659, 15.668),
    "logos/aldi.svg": (4.813, 2.563, 14.374, 14.383),
    "logos/asda.svg": (3.91, 7.135, 16.181, 5.442),
    "logos/centra.svg": (3.36, 7.35, 17.28, 4.651),
    "logos/coop.svg": (6.407, 4.658, 11.187, 11.793),
    "logos/costco.svg": (70.61, 144.908, 256.67, 85.825),
    "logos/lidl.svg": (4.938, 2.973, 13.985, 13.985),
    "logos/morrisons.svg": (5.231, 2.985, 13.538, 13.398),
    "logos/planet_organic.svg": (5.528, 3.564, 12.943, 12.943),
    "logos/sainsburys.svg": (7.502, 3.572, 8.996, 12.646),
    "logos/spar.svg": (4.933, 2.968, 14.133, 13.853),
    "logos/tesco.svg": (4.338, 6.865, 15.324, 5.359),
    "logos/tesco_express.svg": (5.231, 5.933, 13.538, 8.345),
    "logos/tesco_extra.svg": (4.933, 5.775, 14.133, 8.519),
    "logos/waitrose.svg": (5.528, 6.09, 12.943, 9.855),
}
POI_ICON_SVG_INTRINSIC_MAX = 512
def collect_twemoji_codes() -> list[str]:
"""Derive twemoji hex codes from transform_poi categories.
@ -76,9 +109,6 @@ def collect_twemoji_codes() -> list[str]:
for emoji in NAPTAN_EMOJIS.values():
emojis.add(emoji)
for emoji in _FALLBACK_EMOJIS:
emojis.add(emoji)
# First codepoint hex, matching frontend logic
return sorted({f"{ord(e[0]):x}" for e in emojis})
@ -97,6 +127,214 @@ def download_file(url: str, dest: Path) -> tuple[bool, str]:
return False, url
def download_text(url: str, timeout: float | None = 30.0) -> str:
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations before giving
            up; ``None`` waits indefinitely. A default is supplied so that a
            stalled connection cannot hang the asset download forever.

    Raises:
        urllib.error.URLError: The request failed (including HTTP errors).
        UnicodeDecodeError: The body is not valid UTF-8.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8")
def build_costco_logo(marker_svg: str) -> str:
    """Wrap the logo group of the Costco marker SVG in a standalone SVG document.

    The group is located by its literal opening path data and by the closing
    ``</g></g></svg>`` sequence of the marker.

    Raises:
        ValueError: Either landmark is missing from ``marker_svg``.
    """
    opening = marker_svg.find('<g><path d=" M 316.312')
    closing = marker_svg.rfind("</g></g></svg>")
    if min(opening, closing) < 0:
        raise ValueError("Costco marker SVG layout changed")

    # Keep the first "</g>" of the closing sequence so the group stays balanced.
    group_end = closing + len("</g>")
    pieces = [
        '<?xml version="1.0" encoding="UTF-8"?>\n',
        '<svg xmlns="http://www.w3.org/2000/svg" viewBox="70 145 260 90" ',
        'width="260pt" height="90pt" preserveAspectRatio="xMidYMid meet">\n',
        marker_svg[opening:group_end],
        "\n",
        "</svg>\n",
    ]
    return "".join(pieces)
def trim_white_png(png_bytes: bytes) -> bytes:
    """Make near-white pixels transparent, crop to what remains, return PNG bytes.

    A pixel counts as near-white when red, green and blue are all above 245.
    Its colour is kept and only its alpha is set to 0. The image is then
    cropped to the bounding box of the alpha channel (if there is one).
    """
    image = Image.open(BytesIO(png_bytes)).convert("RGBA")
    pixels = image.load()
    for row in range(image.height):
        for col in range(image.width):
            red, green, blue, _alpha = pixels[col, row]
            if min(red, green, blue) > 245:
                pixels[col, row] = (red, green, blue, 0)

    visible_box = image.getchannel("A").getbbox()
    trimmed = image.crop(visible_box) if visible_box else image

    encoded = BytesIO()
    trimmed.save(encoded, format="PNG")
    return encoded.getvalue()
def extract_embedded_png(marker_svg: str) -> bytes:
    """Decode the first base64 payload in ``marker_svg`` and trim its white border.

    Raises:
        ValueError: No ``base64,`` payload is present in the SVG text.
    """
    embedded = re.search(r"base64,([^\"']+)", marker_svg)
    if embedded is None:
        raise ValueError("POI marker SVG did not contain an embedded PNG")
    png_bytes = base64.b64decode(embedded.group(1))
    return trim_white_png(png_bytes)
def svg_intrinsic_size(width: float, height: float) -> tuple[int, int]:
    """Scale ``width`` x ``height`` so the longer side is POI_ICON_SVG_INTRINSIC_MAX.

    The shorter side keeps the aspect ratio, rounded and never below 1. A
    non-positive dimension yields a square of the maximum size.
    """
    longest = POI_ICON_SVG_INTRINSIC_MAX
    if width <= 0 or height <= 0:
        return (longest, longest)

    def scaled(short_side: float, long_side: float) -> int:
        return max(1, round(longest * short_side / long_side))

    if width >= height:
        return (longest, scaled(height, width))
    return (scaled(width, height), longest)
def set_svg_geometry(svg_text: str, crop: tuple[float, float, float, float]) -> str:
    """Set ``viewBox``, ``width`` and ``height`` on the root ``<svg>`` element.

    Args:
        svg_text: SVG document text.
        crop: ``(x, y, width, height)`` to use as the new viewBox. The
            intrinsic width/height are derived from its aspect ratio.

    Returns:
        The document with the three attributes replaced (or inserted directly
        after ``<svg`` when absent). Text without an ``<svg`` tag is returned
        unchanged.

    Only the root opening tag is edited, and attribute names must match
    exactly. Searching the whole document for ``width="…"`` would otherwise hit
    ``stroke-width`` or the ``width`` of a child element whenever the root tag
    lacks the attribute, corrupting the drawing and leaving the root unsized.
    """
    x, y, width, height = crop
    view_box = f"{x:g} {y:g} {width:g} {height:g}"
    intrinsic_width, intrinsic_height = svg_intrinsic_size(width, height)

    # NOTE: assumes no attribute value in the root tag contains a literal ">".
    root = re.search(r"<svg\b[^>]*>", svg_text)
    if root is None:
        return svg_text

    tag = root.group(0)
    for name, value in (
        ("viewBox", view_box),
        ("width", intrinsic_width),
        ("height", intrinsic_height),
    ):
        replacement = f'{name}="{value}"'
        # The lookbehind rejects suffix matches such as "stroke-width=".
        attribute_re = re.compile(rf"""(?<![\w:-]){name}=(?:"[^"]*"|'[^']*')""")
        if attribute_re.search(tag):
            tag = attribute_re.sub(lambda _match: replacement, tag, count=1)
        else:
            tag = re.sub(
                r"<svg\b", lambda _match: f"<svg {replacement}", tag, count=1
            )
    return svg_text[: root.start()] + tag + svg_text[root.end() :]
def get_svg_view_box(svg_text: str) -> tuple[float, float, float, float] | None:
    """Return the first ``viewBox`` in ``svg_text`` as ``(x, y, width, height)``.

    Values may be separated by whitespace and/or commas. ``None`` is returned
    when there is no viewBox, when it does not have exactly four parts, or
    when a part is not a number (previously that last case raised
    ``ValueError`` and aborted the caller's loop over all icons).
    """
    match = re.search(r'viewBox="([^"]+)"', svg_text)
    if not match:
        return None
    parts = [part for part in re.split(r"[\s,]+", match.group(1).strip()) if part]
    if len(parts) != 4:
        return None
    try:
        x, y, width, height = (float(part) for part in parts)
    except ValueError:
        return None
    return (x, y, width, height)
def crop_poi_svg_icons(poi_icons_dir: Path) -> None:
    """Rewrite the geometry of downloaded POI SVG icons in place.

    First pass: every icon listed in POI_ICON_SVG_CROPS that exists on disk
    gets its configured crop as viewBox. Second pass: every ``*.svg`` below
    ``poi_icons_dir`` that has a parseable viewBox is rewritten with that same
    viewBox, which sets its width/height to the derived intrinsic size.
    """
    for icon_path, crop in POI_ICON_SVG_CROPS.items():
        icon_file = poi_icons_dir / icon_path
        if icon_file.exists():
            markup = icon_file.read_text(encoding="utf-8")
            if icon_path == "brands_2024/dunnes_stores.svg":
                # Near-white fills are recoloured dark for this icon
                # (presumably so it stays visible on light backgrounds).
                for pale_fill in ('fill="#fffcfc"', 'fill="#fcfcfc"'):
                    markup = markup.replace(pale_fill, 'fill="#111111"')
            icon_file.write_text(set_svg_geometry(markup, crop), encoding="utf-8")

    for svg_file in poi_icons_dir.rglob("*.svg"):
        markup = svg_file.read_text(encoding="utf-8")
        current_box = get_svg_view_box(markup)
        if not current_box:
            continue
        svg_file.write_text(set_svg_geometry(markup, current_box), encoding="utf-8")
def download_derived_poi_icon(
    kind: str, source_path: str, dest: Path
) -> tuple[bool, str]:
    """Download a marker SVG and write the icon derived from it to ``dest``.

    Args:
        kind: ``"costco_logo"`` (extract the logo group as a new SVG) or
            ``"embedded_png"`` (extract and trim the embedded PNG).
        source_path: Path of the marker SVG relative to POI_ICON_BASE.
        dest: Output file; its parent directory is created if needed.

    Returns:
        ``(success, url)``. Failures are reported on stderr rather than raised,
        so one broken icon does not abort the run.
    """
    url = f"{POI_ICON_BASE}/{source_path}"
    dest.parent.mkdir(parents=True, exist_ok=True)
    try:
        marker_svg = download_text(url)
        if kind == "costco_logo":
            dest.write_text(build_costco_logo(marker_svg), encoding="utf-8")
        elif kind == "embedded_png":
            dest.write_bytes(extract_embedded_png(marker_svg))
        else:
            raise ValueError(f"Unknown derived POI icon kind: {kind}")
    except urllib.error.HTTPError as error:
        print(f"  {error.code} {url}", file=sys.stderr)
    except Exception as error:
        print(f"  ERROR {url}: {error}", file=sys.stderr)
    else:
        return True, url
    return False, url
# Slategray accent used by civic POI icons (school, library, building, …) in
# protomaps' v4 sprite. We match it so the townhall blends in with its peers.
_TOWNHALL_COLOR = {
"light": (135, 128, 171),
"dark": (118, 118, 127),
}
_TOWNHALL_LOGICAL_SIZE = 17
def _render_townhall_glyph(size_px: int, color: tuple[int, int, int]) -> Image.Image:
    """Draw the townhall glyph as a ``size_px`` square RGBA image in ``color``.

    The shape is laid out on a _TOWNHALL_LOGICAL_SIZE-unit grid: a triangle on
    top, a horizontal band below it, three columns and a bottom band.
    """
    # Render 8x oversized and shrink with Lanczos: PIL fills polygons without
    # anti-aliasing, so the triangle's diagonals would otherwise look jagged.
    super_factor = 8
    canvas = size_px * super_factor
    img = Image.new("RGBA", (canvas, canvas), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    fill = (*color, 255)

    def px(units: float) -> float:
        # Logical grid units -> oversized canvas pixels.
        return units * canvas / _TOWNHALL_LOGICAL_SIZE

    def fill_rect(left: float, top: float, right: float, bottom: float) -> None:
        draw.rectangle([(px(left), px(top)), (px(right), px(bottom))], fill=fill)

    # Triangle at the top of the glyph.
    draw.polygon([(px(8.5), px(1)), (px(15), px(6.5)), (px(2), px(6.5))], fill=fill)
    # Horizontal band directly under the triangle.
    fill_rect(1, 6.5, 16, 8.5)
    # Three columns.
    for column_x in (3, 8, 13):
        fill_rect(column_x, 8.5, column_x + 1.5, 14)
    # Bottom band.
    fill_rect(0, 14, 17, 15.5)
    return img.resize((size_px, size_px), Image.LANCZOS)
def inject_townhall_sprite(sprites_dir: Path) -> None:
    """Append a townhall glyph to each downloaded sprite sheet.

    Protomaps' v4 sprite omits `townhall` even though the basemap style
    references it; we add the icon here so MapLibre can resolve the name
    natively at runtime.

    For every theme ("light", "dark") and pixel ratio (1x, "@2x") whose JSON
    and PNG both exist, the glyph is pasted below the existing sheet and a
    ``townhall`` entry is added to the JSON index. Sheets that already contain
    a ``townhall`` entry are left untouched, so re-running this on files that
    were patched before does not stack another glyph onto the sheet.
    """
    for theme in ("light", "dark"):
        color = _TOWNHALL_COLOR[theme]
        for suffix, scale in (("", 1), ("@2x", 2)):
            json_path = sprites_dir / f"{theme}{suffix}.json"
            png_path = sprites_dir / f"{theme}{suffix}.png"
            if not json_path.exists() or not png_path.exists():
                continue

            manifest = json.loads(json_path.read_text(encoding="utf-8"))
            if "townhall" in manifest:
                continue

            # Close the source file before the same path is overwritten below.
            with Image.open(png_path) as source:
                sheet = source.convert("RGBA")

            glyph_size = _TOWNHALL_LOGICAL_SIZE * scale
            glyph = _render_townhall_glyph(glyph_size, color)

            # New sheet: original on top, glyph in a new row at the bottom-left.
            new_width = max(sheet.width, glyph_size)
            new_height = sheet.height + glyph_size
            extended = Image.new("RGBA", (new_width, new_height), (0, 0, 0, 0))
            extended.paste(sheet, (0, 0))
            extended.paste(glyph, (0, sheet.height))
            extended.save(png_path, optimize=True)

            manifest["townhall"] = {
                "x": 0,
                "y": sheet.height,
                "width": glyph_size,
                "height": glyph_size,
                "pixelRatio": scale,
            }
            json_path.write_text(json.dumps(manifest), encoding="utf-8")
def main():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
@ -147,7 +385,7 @@ def main():
# Skip already-downloaded files
remaining = [(url, dest) for url, dest in tasks]
print(f"Downloading {len(remaining)} assets")
print(f"Downloading {len(remaining) + len(DERIVED_POI_ICON_PATHS)} assets")
ok = 0
fail = 0
@ -162,6 +400,18 @@ def main():
else:
fail += 1
for kind, source_path, dest_path in DERIVED_POI_ICON_PATHS:
success, _url = download_derived_poi_icon(
kind, source_path, poi_icons_dir / dest_path
)
if success:
ok += 1
else:
fail += 1
crop_poi_svg_icons(poi_icons_dir)
inject_townhall_sprite(sprites_dir)
print(f"Done: {ok} downloaded, {fail} failed")

View file

@ -18,6 +18,7 @@ endpoint is broken for that coverage).
import argparse
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
@ -29,8 +30,10 @@ from pyproj import Transformer
from rasterio.merge import merge
from rasterio.transform import rowcol
# Noise sources: (label, column_name, WCS base URL, coverage ID, WCS version)
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1.
# Noise sources:
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
# for many sparse/no-coverage tiles, which should become nulls.
NOISE_SOURCES = [
(
"Road",
@ -38,6 +41,7 @@ NOISE_SOURCES = [
"https://environment.data.gov.uk/spatialdata/road-noise-all-metrics-england-round-4/wcs",
"Road_Noise_Lden_England_Round_4_All",
"1.0.0",
False,
),
(
"Rail",
@ -45,6 +49,7 @@ NOISE_SOURCES = [
"https://environment.data.gov.uk/spatialdata/noise-data/wcs",
"Rail_Noise_Lden_England_Round_4_All",
"1.0.0",
False,
),
(
"Airport",
@ -52,6 +57,7 @@ NOISE_SOURCES = [
"https://environment.data.gov.uk/spatialdata/airport-noise-all-metrics-england-round-4/wcs",
"dac9cba4-abe7-43bd-b8e9-8a83da52edd8__Airport_Noise_ALL_Lden",
"2.0.1",
True,
),
]
@ -74,6 +80,14 @@ NATIVE_RESOLUTION = 10
# and keeps download size ~100x smaller than native 10m)
RESOLUTION = 100
# Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
# intermittently return 504s; smaller fallback requests usually succeed.
MAX_RETRIES = 3
RETRY_BACKOFF_SECONDS = 5
MIN_TILE_SIZE = 25_000
type Tile = tuple[int, int, int, int]
def _wcs_get_coverage_url(
wcs_base: str,
@ -117,6 +131,53 @@ def _bng_from_latlon(lat: np.ndarray, lon: np.ndarray) -> tuple[np.ndarray, np.n
return _TO_BNG.transform(lon, lat) # pyproj takes (x=lon, y=lat)
def _looks_like_tiff(response: httpx.Response) -> bool:
    """Tell whether ``response`` carries a TIFF image.

    True when the Content-Type header mentions "tiff", or when the body starts
    with either TIFF byte-order signature (little-endian ``II*\\0`` or
    big-endian ``MM\\0*``).
    """
    if "tiff" in response.headers.get("content-type", ""):
        return True
    return response.content[:4] in (b"II*\x00", b"MM\x00*")
def _fetch_tile_bytes(
    wcs_base: str,
    coverage_id: str,
    min_e: int,
    min_n: int,
    max_e: int,
    max_n: int,
    wcs_version: str = "1.0.0",
) -> bytes | None:
    """Request one WCS tile and return its raw bytes.

    Returns ``None`` when the server answers successfully but the payload is
    not a TIFF.

    Raises:
        httpx.HTTPStatusError: The server returned an error status.
    """
    request_url = _wcs_get_coverage_url(
        wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
    )
    with httpx.Client(timeout=300, follow_redirects=True) as client:
        response = client.get(request_url)
    # The body was read in full by get(), so it can be inspected after the
    # client has been closed.
    response.raise_for_status()
    return response.content if _looks_like_tiff(response) else None
def _split_tile(min_e: int, min_n: int, max_e: int, max_n: int) -> list[Tile]:
    """Split a tile into halves along every axis longer than MIN_TILE_SIZE.

    Returns up to four sub-tiles ``(min_e, min_n, max_e, max_n)``. An axis that
    is already at or below MIN_TILE_SIZE is not divided, so a tile that is
    small in both directions comes back as a single tile.
    """

    def edges(low: int, high: int) -> list[int]:
        if high - low > MIN_TILE_SIZE:
            return [low, (low + high) // 2, high]
        return [low, high]

    e_edges = edges(min_e, max_e)
    n_edges = edges(min_n, max_n)
    return [
        (e0, n0, e1, n1)
        for e0, e1 in zip(e_edges, e_edges[1:])
        for n0, n1 in zip(n_edges, n_edges[1:])
        if e1 > e0 and n1 > n0
    ]
def _tile_path(tile_dir: Path, min_e: int, min_n: int, max_e: int, max_n: int) -> Path:
    """Return the file path used to store the tile with the given bounds."""
    file_name = f"tile_{min_e}_{min_n}_{max_e}_{max_n}.tif"
    return tile_dir.joinpath(file_name)
def _download_tile(
wcs_base: str,
coverage_id: str,
@ -124,30 +185,63 @@ def _download_tile(
min_n: int,
max_e: int,
max_n: int,
tile_path: Path,
tile_dir: Path,
wcs_version: str = "1.0.0",
) -> Path | None:
"""Download a single WCS tile. Returns path if successful, None otherwise."""
url = _wcs_get_coverage_url(
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
split_failures: bool = True,
) -> tuple[list[Path], list[Tile]]:
"""Download a WCS tile, splitting on repeated server failures."""
tile_path = _tile_path(tile_dir, min_e, min_n, max_e, max_n)
if tile_path.exists() and tile_path.stat().st_size > 0:
return [tile_path], []
last_error: Exception | None = None
for attempt in range(1, MAX_RETRIES + 1):
try:
content = _fetch_tile_bytes(
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
)
if content is None:
return [], []
tile_path.write_bytes(content)
return [tile_path], []
except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
last_error = e
if attempt < MAX_RETRIES:
sleep_for = RETRY_BACKOFF_SECONDS * attempt
print(
f" Retrying tile ({min_e},{min_n})-({max_e},{max_n}) "
f"after {type(e).__name__} ({attempt}/{MAX_RETRIES})"
)
time.sleep(sleep_for)
subtiles = _split_tile(min_e, min_n, max_e, max_n) if split_failures else []
if len(subtiles) > 1:
print(
f" Splitting tile ({min_e},{min_n})-({max_e},{max_n}) "
f"into {len(subtiles)} smaller requests after: {last_error}"
)
paths: list[Path] = []
failures: list[Tile] = []
for e0, n0, e1, n1 in subtiles:
child_paths, child_failures = _download_tile(
wcs_base,
coverage_id,
e0,
n0,
e1,
n1,
tile_dir,
wcs_version,
split_failures,
)
paths.extend(child_paths)
failures.extend(child_failures)
return paths, failures
print(
f" Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {last_error}"
)
try:
with httpx.Client(timeout=300, follow_redirects=True) as client:
resp = client.get(url)
resp.raise_for_status()
content_type = resp.headers.get("content-type", "")
if "tiff" not in content_type and resp.content[:4] not in (
b"II*\x00",
b"MM\x00*",
):
return None
tile_path.write_bytes(resp.content)
return tile_path
except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
print(f" Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {e}")
return None
return [], [(min_e, min_n, max_e, max_n)]
def download_raster(
@ -156,6 +250,7 @@ def download_raster(
coverage_id: str,
label: str,
wcs_version: str = "1.0.0",
allow_missing_tiles: bool = False,
) -> list[Path]:
"""Download noise GeoTIFF raster covering England, returning paths to saved files."""
tiles = []
@ -168,13 +263,13 @@ def download_raster(
print(
f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
)
paths = []
paths: list[Path] = []
failures: list[Tile] = []
completed = 0
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
futures = {}
for min_e, min_n, max_e, max_n in tiles:
tile_path = tile_dir / f"tile_{min_e}_{min_n}.tif"
fut = executor.submit(
_download_tile,
wcs_base,
@ -183,23 +278,41 @@ def download_raster(
min_n,
max_e,
max_n,
tile_path,
tile_dir,
wcs_version,
not allow_missing_tiles,
)
futures[fut] = (min_e, min_n)
for fut in as_completed(futures):
completed += 1
result = fut.result()
if result is not None:
paths.append(result)
result_paths, result_failures = fut.result()
paths.extend(result_paths)
failures.extend(result_failures)
print(
f"\r [{completed}/{len(tiles)}] Downloaded {len(paths)} valid tiles",
f"\r [{completed}/{len(tiles)}] Downloaded {len(paths)} GeoTIFFs",
end="",
flush=True,
)
print(f"\n[{label}] Downloaded {len(paths)}/{len(tiles)} tiles")
if failures:
preview = ", ".join(
f"({e0},{n0})-({e1},{n1})" for e0, n0, e1, n1 in failures[:5]
)
suffix = "..." if len(failures) > 5 else ""
if allow_missing_tiles:
print(
f"\n[{label}] WARNING: skipped {len(failures)} missing/no-data "
f"tile requests: {preview}{suffix}"
)
print(f"[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles")
return paths
raise RuntimeError(
f"[{label}] Failed to download {len(failures)} tile requests after "
f"retries and splitting: {preview}{suffix}"
)
print(f"\n[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles")
return paths
@ -281,11 +394,23 @@ def main() -> None:
result = postcodes.select("postcode")
with tempfile.TemporaryDirectory() as tmp:
for label, col_name, wcs_base, coverage_id, wcs_version in NOISE_SOURCES:
for (
label,
col_name,
wcs_base,
coverage_id,
wcs_version,
allow_missing_tiles,
) in NOISE_SOURCES:
tile_dir = Path(tmp) / label.lower()
tile_dir.mkdir()
tile_paths = download_raster(
tile_dir, wcs_base, coverage_id, label, wcs_version
tile_dir,
wcs_base,
coverage_id,
label,
wcs_version,
allow_missing_tiles,
)
if not tile_paths:

View file

@ -6,6 +6,7 @@ Reuses the same england-latest.osm.pbf as pois.py.
"""
import argparse
import re
from pathlib import Path
import osmium
@ -44,11 +45,37 @@ _STATION_STRIP = (
" underground station",
" railway station",
" dlr station",
" station dlr",
" dlr",
" overground station",
" tram stop",
" station",
)
_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
def _is_dlr_station(tags: dict[str, str]) -> bool:
    """Tell whether OSM station tags describe a DLR station.

    A station matches when its network or operator mentions "docklands" or
    "dlr", or when its name ends with " dlr" or contains " dlr " (all
    comparisons are case-insensitive).
    """
    name = tags.get("name", "").lower()
    network = tags.get("network", "").lower()
    operator = tags.get("operator", "").lower()

    branded = any(
        keyword in field
        for field in (network, operator)
        for keyword in ("docklands", "dlr")
    )
    return branded or name.endswith(" dlr") or " dlr " in name
def _is_tram_station(tags: dict[str, str]) -> bool:
    """Tell whether OSM station tags describe a tram stop.

    DLR stations are never trams, even when tagged ``station=light_rail``.
    Otherwise a station is a tram when it is tagged ``station=light_rail`` or
    its network mentions "tramlink" or "tram".
    """
    if _is_dlr_station(tags):
        return False
    if tags.get("station", "") == "light_rail":
        return True
    network = tags.get("network", "").lower()
    return "tramlink" in network or "tram" in network
def _station_display_name(name: str, tags: dict[str, str]) -> str:
"""Build a descriptive station name like 'Bank tube station'."""
@ -78,6 +105,96 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
return f"{name} {suffix}"
def _station_name_score(name: str) -> tuple[int, int]:
    """Rank a raw station name; a lower tuple means a better name.

    The first element is 1 when the name ends with a station-type suffix
    (e.g. " station", " dlr") and 0 otherwise; the second is the name's
    length, so shorter names win among equals.
    """
    decorated_endings = (
        " underground station",
        " tube station",
        " dlr station",
        " railway station",
        " rail station",
        " station dlr",
        " station",
        " dlr",
    )
    has_suffix = name.lower().endswith(decorated_endings)
    return (1 if has_suffix else 0, len(name))
def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
    """Extract station-level DLR destinations from NaPTAN access nodes.

    Rows are kept when their id contains a DLR station code (per
    ``_DLR_CODE_RE``) and their category is "Tube station" or "Rail station".
    Rows sharing a station code are merged: the position is the mean of their
    coordinates and the name is the best-scoring raw name. Rows without an id
    match, a name or coordinates are skipped.

    Returns:
        Place dicts (``name``, ``place_type``, ``lat``, ``lon``,
        ``population``, ``travel_destination``) sorted by name.

    Raises:
        ValueError: The parquet file lacks one of the required columns.
    """
    df = pl.read_parquet(naptan_path)
    required = {"id", "name", "category", "lat", "lng"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"NaPTAN file is missing columns: {sorted(missing)}")

    rows: dict[str, dict] = {}
    for row in df.iter_rows(named=True):
        match = _DLR_CODE_RE.search(str(row["id"] or ""))
        if not match:
            continue
        if row["category"] not in {"Tube station", "Rail station"}:
            continue
        raw_name = str(row["name"] or "")
        if not raw_name:
            continue
        # Null ids and names are skipped above; treat null coordinates the same
        # way instead of crashing in float(None).
        if row["lat"] is None or row["lng"] is None:
            continue
        lat = float(row["lat"])
        lon = float(row["lng"])

        code = match.group(1)
        current = rows.get(code)
        if current is None:
            rows[code] = {
                "raw_name": raw_name,
                "lat_sum": lat,
                "lon_sum": lon,
                "count": 1,
            }
            continue
        current["lat_sum"] += lat
        current["lon_sum"] += lon
        current["count"] += 1
        if _station_name_score(raw_name) < _station_name_score(current["raw_name"]):
            current["raw_name"] = raw_name

    stations = []
    for station in rows.values():
        count = station["count"]
        display_name = _station_display_name(station["raw_name"], {"network": "DLR"})
        stations.append(
            {
                "name": display_name,
                "place_type": "station",
                "lat": station["lat_sum"] / count,
                "lon": station["lon_sum"] / count,
                "population": 0,
                "travel_destination": True,
            }
        )
    return sorted(stations, key=lambda station: station["name"])
def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int:
    """Append NaPTAN DLR stations whose names are not already in ``places``.

    Names are compared case-insensitively (via ``casefold``). ``places`` is
    mutated in place; the return value is the number of stations appended.
    """
    seen_names = {str(entry["name"]).casefold() for entry in places}
    size_before = len(places)
    for candidate in _naptan_dlr_stations(naptan_path):
        folded = candidate["name"].casefold()
        if folded not in seen_names:
            seen_names.add(folded)
            places.append(candidate)
    return len(places) - size_before
class PlaceHandler(osmium.SimpleHandler):
def __init__(self, progress: tqdm, england_polygon) -> None:
super().__init__()
@ -145,14 +262,7 @@ class PlaceHandler(osmium.SimpleHandler):
# Railway stations (tube, national rail, DLR, overground, Elizabeth line)
if n.tags.get("railway") == "station":
tags = dict(n.tags)
station_tag = tags.get("station", "")
network = tags.get("network", "").lower()
# Skip tram stops
if (
station_tag == "light_rail"
or "tramlink" in network
or "tram" in network
):
if _is_tram_station(tags):
return
display_name = _station_display_name(name, tags)
self._add(
@ -178,6 +288,11 @@ def main() -> None:
required=True,
help="England boundary GeoJSON file",
)
parser.add_argument(
"--naptan",
type=Path,
help="Optional NaPTAN parquet file used to add DLR station destinations",
)
args = parser.parse_args()
pbf_file = args.pbf
@ -195,6 +310,9 @@ def main() -> None:
handler.apply_file(str(pbf_file), locations=True)
print(f"Extracted {len(handler.places):,} place nodes")
if args.naptan:
added = _append_naptan_dlr_stations(handler.places, args.naptan)
print(f"Added {added:,} DLR station destinations from NaPTAN")
if handler.places:
df = pl.DataFrame(handler.places)

View file

@ -0,0 +1,60 @@
from zipfile import ZipFile
from pipeline.download.crime import extract_csvs, parse_archives
def test_parse_archives_reads_monthly_zip_links_only():
    """Only ``YYYY-MM.zip`` crime archives are returned; a malformed MD5 becomes None."""
    # The fixture also contains latest.zip and a neighbourhood archive, neither
    # of which should appear in the parsed result.
    page_html = """
<p><a href="/data/archive/latest.zip">latest.zip</a></p>
<div class="archive crime">
<div class="download">
<i class="icon-file"></i> <span><a href="/data/archive/2026-03.zip">March 2026</a> (1.6&nbsp;GB)</span>
<p class="contained-range">Contains data from Apr 2023 to Mar 2026</p>
<p class="md5sum">6dde462489389445877f3988ef3f4f4b</p>
</div>
<div class="download">
<i class="icon-file"></i> <span><a href="/data/archive/2019-06.zip">June 2019</a> (1.6&nbsp;GB)</span>
<p class="contained-range">Contains data from Jul 2016 to Jun 2019</p>
<p class="md5sum">d6494297b24c1434bdb2504e95261bf8-100</p>
</div>
</div>
<div class="archive neighbourhood">
<div class="download">
<span><a href="/data/neighbourhood.zip">Neighbourhood crime</a> (2.2 MB)</span>
<small class="md5sum">6b80e2b97d87f6668b7a45953924d191</small>
</div>
</div>
"""
    parsed = parse_archives(page_html, "https://data.police.uk/data/archive/")

    filenames = [entry.filename for entry in parsed]
    assert filenames == ["2026-03.zip", "2019-06.zip"]

    newest = parsed[0]
    assert newest.url == "https://data.police.uk/data/archive/2026-03.zip"
    assert newest.md5 == "6dde462489389445877f3988ef3f4f4b"

    # The second archive's checksum has a "-100" suffix, so it is not a valid MD5.
    oldest = parsed[1]
    assert oldest.md5 is None
    assert oldest.raw_md5 == "d6494297b24c1434bdb2504e95261bf8-100"
def test_extract_csvs_preserves_existing_newer_files(tmp_path):
    """Existing CSVs are not overwritten; traversal and non-CSV members are not written."""
    output = tmp_path / "crime"
    already_present = output / "2023-01" / "2023-01-city-street.csv"
    already_present.parent.mkdir(parents=True)
    already_present.write_text("newer\n")

    zip_path = tmp_path / "older.zip"
    members = {
        "2023-01/2023-01-city-street.csv": "older\n",
        "2022-12/2022-12-city-street.csv": "old\n",
        "../escape.csv": "bad\n",
        "notes.txt": "ignored\n",
    }
    with ZipFile(zip_path, "w") as archive:
        for member_name, content in members.items():
            archive.writestr(member_name, content)

    extracted, skipped = extract_csvs(zip_path, output)

    assert extracted == 1
    assert skipped == 1
    assert already_present.read_text() == "newer\n"
    newly_written = output / "2022-12" / "2022-12-city-street.csv"
    assert newly_written.read_text() == "old\n"
    assert not (tmp_path / "escape.csv").exists()

View file

@ -0,0 +1,89 @@
import httpx
import pytest
from pipeline.download import noise
def test_download_tile_splits_after_retries(monkeypatch, tmp_path):
    """A 100x100 tile that keeps timing out is fetched as four 50x50 tiles."""
    for attribute, value in (("MAX_RETRIES", 1), ("MIN_TILE_SIZE", 50)):
        monkeypatch.setattr(noise, attribute, value)

    def fake_fetch_tile_bytes(
        wcs_base,
        coverage_id,
        min_e,
        min_n,
        max_e,
        max_n,
        wcs_version="1.0.0",
    ):
        # Fail for any request wider or taller than 50 units.
        width = max_e - min_e
        height = max_n - min_n
        if max(width, height) > 50:
            raise httpx.TimeoutException("too large")
        return b"II*\x00fake-tiff"

    monkeypatch.setattr(noise, "_fetch_tile_bytes", fake_fetch_tile_bytes)

    paths, failures = noise._download_tile("base", "coverage", 0, 0, 100, 100, tmp_path)

    assert failures == []
    assert len(paths) == 4
    tile_names = sorted(tile.name for tile in paths)
    assert tile_names == [
        "tile_0_0_50_50.tif",
        "tile_0_50_50_100.tif",
        "tile_50_0_100_50.tif",
        "tile_50_50_100_100.tif",
    ]
def test_download_tile_reports_unsplittable_failure(monkeypatch, tmp_path):
    """A tile already at MIN_TILE_SIZE that cannot be fetched is reported as failed."""
    for attribute, value in (("MAX_RETRIES", 1), ("MIN_TILE_SIZE", 100)):
        monkeypatch.setattr(noise, attribute, value)

    def always_offline(*args, **kwargs):
        raise httpx.ConnectError("offline")

    monkeypatch.setattr(noise, "_fetch_tile_bytes", always_offline)

    paths, failures = noise._download_tile("base", "coverage", 0, 0, 100, 100, tmp_path)

    assert paths == []
    assert failures == [(0, 0, 100, 100)]
def test_download_raster_tolerates_missing_tiles_when_allowed(monkeypatch, tmp_path):
    """With ``allow_missing_tiles=True`` a failed tile yields an empty path list."""
    # Shrink the grid to a single 100x100 tile.
    single_tile_grid = {
        "BNG_MIN_E": 0,
        "BNG_MAX_E": 100,
        "BNG_MIN_N": 0,
        "BNG_MAX_N": 100,
        "TILE_SIZE": 100,
    }
    for attribute, value in single_tile_grid.items():
        monkeypatch.setattr(noise, attribute, value)

    # Every tile download reports one failure and no files.
    monkeypatch.setattr(
        noise,
        "_download_tile",
        lambda *args, **kwargs: ([], [(0, 0, 100, 100)]),
    )

    paths = noise.download_raster(
        tmp_path,
        "base",
        "coverage",
        "Airport",
        allow_missing_tiles=True,
    )

    assert paths == []
def test_download_raster_raises_on_missing_strict_tiles(monkeypatch, tmp_path):
    """Without ``allow_missing_tiles`` a failed tile raises a labelled RuntimeError."""
    # Shrink the grid to a single 100x100 tile.
    single_tile_grid = {
        "BNG_MIN_E": 0,
        "BNG_MAX_E": 100,
        "BNG_MIN_N": 0,
        "BNG_MAX_N": 100,
        "TILE_SIZE": 100,
    }
    for attribute, value in single_tile_grid.items():
        monkeypatch.setattr(noise, attribute, value)

    def failing_download_tile(*args, **kwargs):
        return [], [(0, 0, 100, 100)]

    monkeypatch.setattr(noise, "_download_tile", failing_download_tile)

    with pytest.raises(RuntimeError, match=r"\[Road\] Failed to download"):
        noise.download_raster(tmp_path, "base", "coverage", "Road")

View file

@ -0,0 +1,81 @@
import polars as pl
from pipeline.download.places import (
_is_dlr_station,
_is_tram_station,
_naptan_dlr_stations,
_station_display_name,
)
def test_dlr_light_rail_is_not_treated_as_tram():
    """A light-rail station on the DLR network counts as DLR, not as a tram stop."""
    tags = dict(
        name="Lewisham DLR",
        railway="station",
        station="light_rail",
        network="Docklands Light Railway",
    )

    assert _is_dlr_station(tags)
    assert not _is_tram_station(tags)

    # Raw name -> expected display name under DLR tags.
    expected_display_names = {
        "Lewisham DLR": "Lewisham DLR station",
        "Tower Gateway Station DLR": "Tower Gateway DLR station",
    }
    for raw_name, display_name in expected_display_names.items():
        assert _station_display_name(raw_name, tags) == display_name
def test_tram_light_rail_is_still_excluded():
    """A light-rail station on the London Trams network is a tram stop, not DLR."""
    tags = dict(
        name="East Croydon",
        railway="station",
        station="light_rail",
        network="London Trams",
    )

    assert not _is_dlr_station(tags)
    assert _is_tram_station(tags)
def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
    """Two Shadwell rows collapse into one station; bus stops and non-DLR ids are dropped."""
    ids = [
        "4900ZZDLSHA3",
        "9400ZZDLSHA",
        "4900ZZDLGRE1",
        "490002076RV",
        "4900ZZLUBNK",
    ]
    names = [
        "Shadwell DLR",
        "Shadwell DLR Station",
        "Greenwich Station",
        "Tower Gateway Station DLR",
        "Bank",
    ]
    categories = [
        "Tube station",
        "Tube station",
        "Rail station",
        "Bus stop",
        "Tube station",
    ]
    latitudes = [51.51156, 51.511693, 51.47794, 51.510575, 51.5131]
    longitudes = [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894]

    naptan = tmp_path / "naptan.parquet"
    pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": categories,
            "lat": latitudes,
            "lng": longitudes,
        }
    ).write_parquet(naptan)

    stations = _naptan_dlr_stations(naptan)

    station_names = [station["name"] for station in stations]
    assert station_names == [
        "Greenwich DLR station",
        "Shadwell DLR station",
    ]

    shadwell = next(
        station for station in stations if station["name"].startswith("Shadwell")
    )
    # Latitude is the mean of the two Shadwell rows.
    assert shadwell["lat"] == (51.51156 + 51.511693) / 2
    assert shadwell["place_type"] == "station"
    assert shadwell["travel_destination"] is True

View file

@ -56,6 +56,7 @@ NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
def _download_http(
@ -473,10 +474,50 @@ def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
download_naptan()
print("Converting TfL TransXChange → GTFS...")
# The shim patches known packaging/runtime issues in the pinned npm package
# before loading its CLI from npx's temporary install.
shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
subprocess.run(
["npx", "--yes", "transxchange2gtfs", str(txc_path), str(dest)],
[
"npx",
"--yes",
"--package",
TRANSXCHANGE2GTFS_PACKAGE,
"sh",
"-c",
"\n".join(
[
'bin="$(command -v transxchange2gtfs)"',
'script="$(readlink -f "$bin")"',
'pkg_dir="$(dirname "$(dirname "$script")")"',
'shim="$1"',
"shift",
'exec node "$shim" "$pkg_dir" "$@"',
]
),
"transxchange2gtfs",
str(shim_path.resolve()),
str(txc_path.resolve()),
str(dest.resolve()),
],
check=True,
)
required_files = {
"agency.txt",
"calendar.txt",
"calendar_dates.txt",
"routes.txt",
"stop_times.txt",
"stops.txt",
"trips.txt",
}
if not dest.exists() or not zipfile.is_zipfile(dest):
raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
with zipfile.ZipFile(dest) as z:
missing = required_files - set(z.namelist())
if missing:
missing_str = ", ".join(sorted(missing))
raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
size_mb = dest.stat().st_size / (1024 * 1024)
print(f" Saved to {dest} ({size_mb:.1f} MB)")
return dest

View file

@ -0,0 +1,76 @@
#!/usr/bin/env node
"use strict";
const fs = require("fs");
const path = require("path");
const { createRequire } = require("module");
// First CLI argument: directory of the installed converter package.
// Remaining arguments are forwarded to the converter; at least two are required.
const cliArgs = process.argv.slice(2);
const pkgDirArg = cliArgs[0];
const converterArgs = cliArgs.slice(1);

const hasRequiredArgs = Boolean(pkgDirArg) && converterArgs.length >= 2;
if (!hasRequiredArgs) {
  console.error(
    "Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
  );
  process.exit(2);
}

const pkgDir = path.resolve(pkgDirArg);
/**
 * Apply a literal text patch to one file inside the package directory.
 *
 * Replaces the first occurrence of `before` with `after`. If `before` is not
 * present but `after` is, the file is left untouched, so running the patch a
 * second time is a no-op.
 *
 * @param {string} relativePath File path relative to `pkgDir`.
 * @param {string} before Exact text expected in the unpatched file.
 * @param {string} after Exact text to put in its place.
 * @throws {Error} If neither `before` nor `after` is found in the file.
 */
function replaceOnce(relativePath, before, after) {
  const file = path.join(pkgDir, relativePath);
  const original = fs.readFileSync(file, "utf8");

  if (original.includes(before)) {
    // Use a replacer function so `after` is inserted verbatim. A string
    // replacement would expand `$&`, `$1`, `$$`, ... if the patch text ever
    // contained them.
    fs.writeFileSync(file, original.replace(before, () => after));
    return;
  }

  if (original.includes(after)) {
    // The patched text is already present; nothing to do.
    return;
  }

  throw new Error(`Could not patch ${relativePath}: expected text not found`);
}
// The published 1.12.0 package has a few compatibility issues with current
// TfL TransXChange exports:
// - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
// - the compiled date-holidays import expects a synthetic default export
// - some TfL journeys reference timing links without matching route-link geometry
//
// GTFS shapes are optional for R5 routing. Clear shape references and omit
// shapes.txt so missing route geometry does not drop otherwise usable trips.
// Apply every source patch to the installed package's compiled files.
// Each patch goes through replaceOnce, which throws if the expected text is
// absent, so a package whose compiled output differs fails fast.
function patchPackage() {
// Tolerate a missing route link when accumulating distance: add 0 instead of
// reading .Distance from an undefined value.
replaceOnce(
"dist/transxchange/TransXChangeJourneyStream.js",
"distanceSoFarM += routeLink.Distance;",
"distanceSoFarM += routeLink ? routeLink.Distance : 0;",
);
// Replace the md5 hash built from route and route-link sequence with an empty
// string (presumably the trip's shape reference — confirm against TripsStream).
replaceOnce(
"dist/gtfs/TripsStream.js",
"(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
"\"\");",
);
// Emit an empty value in place of stop.shapeDistTraveled.
replaceOnce(
"dist/gtfs/StopTimesStream.js",
"stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
"\"\", stop.exactTime ? \"1\" : \"0\");",
);
// Remove the "shapes.txt" entry that follows "stops.txt" in Container.js.
replaceOnce(
"dist/Container.js",
"\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
"\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
);
// Remove the "transfers.txt" entry between "routes.txt" and "stops.txt".
// NOTE(review): the reason for dropping transfers.txt is not stated in this
// file — confirm it is intentional.
replaceOnce(
"dist/Container.js",
"\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
"\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
);
}
// Patch the compiled files on disk before any of them are loaded.
patchPackage();
// Resolve modules relative to the package so "date-holidays" comes from the
// package's own dependencies rather than from this shim's location.
const pkgRequire = createRequire(path.join(pkgDir, "package.json"));
const Holidays = pkgRequire("date-holidays");
// Provide a `default` property when the module does not expose one, for code
// that imports date-holidays through a default export.
if (!Holidays.default) {
Holidays.default = Holidays;
}
// Rewrite argv so the remaining arguments start at index 2, as they would if
// the converter had been invoked directly (presumably cli.js reads process.argv).
process.argv = [process.argv[0], "transxchange2gtfs", ...converterArgs];
// Load the CLI entry point directly from dist/cli.js.
require(path.join(pkgDir, "dist", "cli.js"));