Codex changes

2026-05-04 16:19:09 +01:00 · 2026-05-04 16:19:09 +01:00 · d4dde21ad2
commit d4dde21ad2
parent 0bae902e08
46 changed files with 4953 additions and 966 deletions
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -19,6 +19,8 @@ Output directory: property-data/transit/
 """

 import argparse
+import csv
+import io
 import json
 import os
 import shutil
@ -108,6 +110,30 @@ def download_bods_gtfs(output_dir: Path) -> Path:
    return dest


+def _parse_csv_line(line: bytes | str) -> list[str]:
+    """Parse a single GTFS CSV record into its fields.
+
+    Args:
+        line: One physical line of a GTFS text file, either as raw bytes
+            (decoded as UTF-8 with undecodable bytes replaced) or as text.
+
+    Returns:
+        The field values with CSV quoting resolved, or an empty list when
+        the line is blank or contains only whitespace.
+
+    Note:
+        Records are parsed one physical line at a time, so a quoted field
+        that spans several lines is not reassembled.
+    """
+    if isinstance(line, bytes):
+        line = line.decode("utf-8", errors="replace")
+    # A UTF-8 byte-order mark on the first line of a file would otherwise
+    # become part of the first column name and break header lookups.
+    line = line.lstrip("\ufeff").rstrip("\r\n")
+    # Whitespace-only lines carry no record; report them as blank so that
+    # callers' `if not parts: continue` guards skip them.
+    if not line.strip():
+        return []
+    return next(csv.reader([line]))
+
+
+def _format_csv_row(parts: list[str]) -> bytes:
+    """Encode one row as UTF-8 CSV bytes terminated by a bare LF.
+
+    Args:
+        parts: Field values of a single record.
+
+    Returns:
+        The CSV-quoted record, including its trailing newline, as bytes.
+    """
+    with io.StringIO() as buffer:
+        row_writer = csv.writer(buffer, lineterminator="\n")
+        row_writer.writerow(parts)
+        text = buffer.getvalue()
+    return text.encode("utf-8")
+
+
+def _format_csv_rows(rows: list[list[str]]) -> str:
+    """Render rows as CSV text with one LF-terminated record per row.
+
+    Args:
+        rows: Records to serialize, each a list of field values.
+
+    Returns:
+        The CSV-quoted text; an empty string when ``rows`` is empty.
+    """
+    with io.StringIO() as sink:
+        emit = csv.writer(sink, lineterminator="\n").writerow
+        for record in rows:
+            emit(record)
+        return sink.getvalue()
+
+
 def clean_gtfs(src: Path, dst: Path) -> None:
    """Fix R5-incompatible entries in GTFS.

@ -128,8 +154,7 @@ def clean_gtfs(src: Path, dst: Path) -> None:
                dropped = 0
                with zin.open(info) as f:
                    header = f.readline()
-                    header_str = header.decode("utf-8").strip()
-                    cols = header_str.split(",")
+                    cols = _parse_csv_line(header)
                    arr_idx = (
                        cols.index("arrival_time") if "arrival_time" in cols else -1
                    )
@ -143,10 +168,9 @@ def clean_gtfs(src: Path, dst: Path) -> None:
                    tmp.write(header)

                    for line in f:
-                        line_str = line.decode("utf-8", errors="replace").strip()
-                        if not line_str:
+                        parts = _parse_csv_line(line)
+                        if not parts:
                            continue
-                        parts = line_str.split(",")
                        skip = False
                        for idx in [arr_idx, dep_idx]:
                            if 0 <= idx < len(parts):
@ -171,12 +195,13 @@ def clean_gtfs(src: Path, dst: Path) -> None:

            elif info.filename == "feed_info.txt":
                data = zin.read(info).decode("utf-8")
-                lines = data.strip().split("\n")
-                header_line = lines[0]
-                feed_cols = header_line.split(",")
-                fixed_lines = [header_line]
-                for line in lines[1:]:
-                    parts = line.split(",")
+                rows = list(csv.reader(io.StringIO(data)))
+                if not rows:
+                    zout.writestr("feed_info.txt", data)
+                    continue
+                feed_cols = rows[0]
+                fixed_rows = [feed_cols]
+                for parts in rows[1:]:
                    for i, col_name in enumerate(feed_cols):
                        if "end_date" in col_name.lower() and i < len(parts):
                            date_val = parts[i].strip('"')
@ -187,8 +212,8 @@ def clean_gtfs(src: Path, dst: Path) -> None:
                                    print(
                                        f"  feed_info: capped end_date {date_val} → 20991231"
                                    )
-                    fixed_lines.append(",".join(parts))
-                zout.writestr("feed_info.txt", "\n".join(fixed_lines) + "\n")
+                    fixed_rows.append(parts)
+                zout.writestr("feed_info.txt", _format_csv_rows(fixed_rows))
            else:
                zout.writestr(info, zin.read(info))

@ -237,12 +262,11 @@ def convert_high_freq_to_frequency_based(
        # Step 1: Find metro/tram route IDs
        target_route_ids: set[str] = set()
        with zin.open("routes.txt") as f:
-            header = f.readline().decode("utf-8").strip()
-            cols = header.split(",")
+            cols = _parse_csv_line(f.readline())
            route_id_idx = cols.index("route_id")
            rt_idx = cols.index("route_type")
            for line in f:
-                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                parts = _parse_csv_line(line)
                if not parts:
                    continue
                route_type = parts[rt_idx].strip('"')
@ -259,14 +283,13 @@ def convert_high_freq_to_frequency_based(
        # Step 2: Map target trips to grouping keys
        trip_group_key: dict[str, tuple[str, str, str]] = {}
        with zin.open("trips.txt") as f:
-            header = f.readline().decode("utf-8").strip()
-            cols = header.split(",")
+            cols = _parse_csv_line(f.readline())
            trip_id_idx = cols.index("trip_id")
            route_id_idx = cols.index("route_id")
            dir_idx = cols.index("direction_id") if "direction_id" in cols else -1
            service_idx = cols.index("service_id")
            for line in f:
-                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                parts = _parse_csv_line(line)
                if not parts:
                    continue
                route_id = parts[route_id_idx].strip('"')
@ -282,14 +305,13 @@ def convert_high_freq_to_frequency_based(
        trip_first_dep: dict[str, int] = {}
        trip_first_stop: dict[str, str] = {}
        with zin.open("stop_times.txt") as f:
-            header = f.readline().decode("utf-8").strip()
-            cols = header.split(",")
+            cols = _parse_csv_line(f.readline())
            trip_id_idx = cols.index("trip_id")
            dep_idx = cols.index("departure_time")
            seq_idx = cols.index("stop_sequence")
            stop_id_idx = cols.index("stop_id")
            for line in f:
-                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                parts = _parse_csv_line(line)
                if not parts:
                    continue
                trip_id = parts[trip_id_idx].strip('"')
@ -361,8 +383,7 @@ def convert_high_freq_to_frequency_based(
            if info.filename == "trips.txt":
                with zin.open(info) as f:
                    header = f.readline()
-                    header_str = header.decode("utf-8").strip()
-                    cols = header_str.split(",")
+                    cols = _parse_csv_line(header)
                    trip_id_idx = cols.index("trip_id")

                    tmp = tempfile.NamedTemporaryFile(
@ -370,9 +391,7 @@ def convert_high_freq_to_frequency_based(
                    )
                    tmp.write(header)
                    for line in f:
-                        parts = (
-                            line.decode("utf-8", errors="replace").strip().split(",")
-                        )
+                        parts = _parse_csv_line(line)
                        if not parts:
                            continue
                        if parts[trip_id_idx].strip('"') not in trips_to_remove:
@ -384,8 +403,7 @@ def convert_high_freq_to_frequency_based(
            elif info.filename == "stop_times.txt":
                with zin.open(info) as f:
                    header = f.readline()
-                    header_str = header.decode("utf-8").strip()
-                    cols = header_str.split(",")
+                    cols = _parse_csv_line(header)
                    trip_id_idx = cols.index("trip_id")

                    tmp = tempfile.NamedTemporaryFile(
@ -393,9 +411,7 @@ def convert_high_freq_to_frequency_based(
                    )
                    tmp.write(header)
                    for line in f:
-                        parts = (
-                            line.decode("utf-8", errors="replace").strip().split(",")
-                        )
+                        parts = _parse_csv_line(line)
                        if not parts:
                            continue
                        if parts[trip_id_idx].strip('"') not in trips_to_remove:
@ -535,25 +551,23 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
    with zipfile.ZipFile(src, "r") as zin:
        # Load valid stop IDs
        with zin.open("stops.txt") as f:
-            header = f.readline().decode("utf-8").strip()
-            stop_id_idx = header.split(",").index("stop_id")
-            lat_idx = header.split(",").index("stop_lat")
+            cols = _parse_csv_line(f.readline())
+            stop_id_idx = cols.index("stop_id")
            for line in f:
-                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                parts = _parse_csv_line(line)
                if parts:
                    stop_ids.add(parts[stop_id_idx])

        # Find trips with backwards travel times
        with zin.open("stop_times.txt") as f:
-            st_header = f.readline().decode("utf-8").strip()
-            st_cols = st_header.split(",")
+            st_cols = _parse_csv_line(f.readline())
            trip_id_idx = st_cols.index("trip_id")
            dep_idx = st_cols.index("departure_time")

            prev_trip = ""
            prev_dep_secs = -1
            for line in f:
-                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                parts = _parse_csv_line(line)
                if not parts:
                    continue
                trip_id = parts[trip_id_idx].strip('"')
@ -594,8 +608,7 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
            if info.filename == "stop_times.txt":
                with zin.open(info) as f:
                    header = f.readline()
-                    header_str = header.decode("utf-8").strip()
-                    cols = header_str.split(",")
+                    cols = _parse_csv_line(header)
                    trip_id_idx = cols.index("trip_id")
                    stop_id_idx = cols.index("stop_id")
                    seq_idx = cols.index("stop_sequence")
@ -614,10 +627,9 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    prev_trip = ""
                    seq_counter = 0
                    for line in f:
-                        line_str = line.decode("utf-8", errors="replace").strip()
-                        if not line_str:
+                        parts = _parse_csv_line(line)
+                        if not parts:
                            continue
-                        parts = line_str.split(",")
                        trip_id = parts[trip_id_idx].strip('"')
                        stop_id = parts[stop_id_idx].strip('"')

@ -651,7 +663,7 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                        if old_seq != str(seq_counter):
                            seqs_renumbered += 1

-                        tmp.write((",".join(parts) + "\n").encode("utf-8"))
+                        tmp.write(_format_csv_row(parts))

                    tmp.close()
                    zout.write(tmp.name, "stop_times.txt")
@ -660,8 +672,7 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
            elif info.filename == "stops.txt":
                with zin.open(info) as f:
                    header = f.readline()
-                    header_str = header.decode("utf-8").strip()
-                    cols = header_str.split(",")
+                    cols = _parse_csv_line(header)
                    lat_idx = cols.index("stop_lat")
                    lon_idx = cols.index("stop_lon")

@ -671,10 +682,9 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    tmp.write(header)

                    for line in f:
-                        line_str = line.decode("utf-8", errors="replace").strip()
-                        if not line_str:
+                        parts = _parse_csv_line(line)
+                        if not parts:
                            continue
-                        parts = line_str.split(",")
                        try:
                            lat = float(parts[lat_idx])
                            # Fix bogus Irish CIE coordinates (South Atlantic)
@ -685,7 +695,7 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                                coords_fixed += 1
                        except ValueError:
                            pass
-                        tmp.write((",".join(parts) + "\n").encode("utf-8"))
+                        tmp.write(_format_csv_row(parts))

                    tmp.close()
                    zout.write(tmp.name, "stops.txt")
@ -694,8 +704,7 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
            elif info.filename == "routes.txt":
                with zin.open(info) as f:
                    header = f.readline()
-                    header_str = header.decode("utf-8").strip()
-                    cols = header_str.split(",")
+                    cols = _parse_csv_line(header)
                    rt_idx = cols.index("route_type")

                    tmp = tempfile.NamedTemporaryFile(
@ -704,14 +713,13 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    tmp.write(header)

                    for line in f:
-                        line_str = line.decode("utf-8", errors="replace").strip()
-                        if not line_str:
+                        parts = _parse_csv_line(line)
+                        if not parts:
                            continue
-                        parts = line_str.split(",")
                        if parts[rt_idx].strip('"') == "714":
                            parts[rt_idx] = "3"
                            route_types_fixed += 1
-                        tmp.write((",".join(parts) + "\n").encode("utf-8"))
+                        tmp.write(_format_csv_row(parts))

                    tmp.close()
                    zout.write(tmp.name, "routes.txt")
@ -721,8 +729,7 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                # Remove trips that have backwards travel times
                with zin.open(info) as f:
                    header = f.readline()
-                    header_str = header.decode("utf-8").strip()
-                    cols = header_str.split(",")
+                    cols = _parse_csv_line(header)
                    trip_id_idx = cols.index("trip_id")

                    tmp = tempfile.NamedTemporaryFile(
@ -731,10 +738,9 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    tmp.write(header)

                    for line in f:
-                        line_str = line.decode("utf-8", errors="replace").strip()
-                        if not line_str:
+                        parts = _parse_csv_line(line)
+                        if not parts:
                            continue
-                        parts = line_str.split(",")
                        if parts[trip_id_idx].strip('"') not in bad_trip_ids:
                            tmp.write(line)

@ -746,8 +752,7 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                # Cap end_date year to 2099
                with zin.open(info) as f:
                    header = f.readline()
-                    header_str = header.decode("utf-8").strip()
-                    cols = header_str.split(",")
+                    cols = _parse_csv_line(header)
                    end_idx = cols.index("end_date")

                    tmp = tempfile.NamedTemporaryFile(
@ -756,10 +761,9 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    tmp.write(header)

                    for line in f:
-                        line_str = line.decode("utf-8", errors="replace").strip()
-                        if not line_str:
+                        parts = _parse_csv_line(line)
+                        if not parts:
                            continue
-                        parts = line_str.split(",")
                        date_val = parts[end_idx].strip('"')
                        if len(date_val) == 8:
                            try:
@ -768,7 +772,7 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                                    parts[end_idx] = "20991231"
                            except ValueError:
                                pass
-                        tmp.write((",".join(parts) + "\n").encode("utf-8"))
+                        tmp.write(_format_csv_row(parts))

                    tmp.close()
                    zout.write(tmp.name, "calendar.txt")