good

2026-05-12 22:30:36 +01:00 · 2026-05-12 22:30:36 +01:00 · 63713c3a2b
commit 63713c3a2b
parent 81a16f543c
15 changed files with 492 additions and 159 deletions
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@ -1,7 +1,8 @@
-"""Extract place=* nodes and railway stations from OSM PBF → data/places.parquet.
+"""Extract places, stations, and universities → data/places.parquet.

 Extracts named place nodes and railway stations (tube, national rail, DLR,
-etc.) for typeahead search.
+etc.) for typeahead search. Official English university providers from the
+Office for Students register can also be added as travel-time destinations.
 Reuses the same england-latest.osm.pbf as pois.py.
 """

@ -53,6 +54,19 @@ _STATION_STRIP = (
 )

 _DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
+# UK-style postcode: 1-2 letters, a digit, an optional letter/digit, optional
+# whitespace, then a digit and two letters. Matched case-insensitively; the
+# single capture group holds the whole postcode.
+_POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.I)
+
+# Lowercase name endings that _clean_provider_name strips and that make
+# _needs_trading_name prefer a trading name over the legal name.
+_NOISY_PROVIDER_SUFFIXES = (
+    " higher education corporation",
+    " limited",
+    " ltd",
+)
+
+# Lowercase fragments which, when found anywhere in a legal name, make
+# _needs_trading_name prefer a trading name.
+_LEGAL_NAME_FALLBACK_MARKERS = (
+    "the chancellor",
+    "chancellor, masters",
+    "chancellor masters",
+)


 def _is_dlr_station(tags: dict[str, str]) -> bool:
@ -124,6 +138,170 @@ def _station_name_score(name: str) -> tuple[int, int]:
    return (suffix_penalty, len(name))


+def _cell_text(value: object) -> str:
+    """Return a cell value as stripped text, mapping ``None`` to ``""``."""
+    return "" if value is None else str(value).strip()
+
+
+def _header_key(value: object) -> str:
+    """Normalize a header cell to lowercase words separated by single spaces."""
+    lowered = _cell_text(value).lower()
+    # Every run of characters outside a-z/0-9 collapses to one space.
+    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
+    return spaced.strip()
+
+
+def _find_header_row(rows: list[tuple]) -> int:
+    """Return the index of the first row that looks like the register header.
+
+    A row qualifies when one normalized cell contains all of "provider",
+    "legal" and "name", and one normalized cell contains all of "right",
+    "use" and "university".
+
+    Raises:
+        ValueError: If no row satisfies both conditions.
+    """
+    legal_tokens = ("provider", "legal", "name")
+    title_tokens = ("right", "use", "university")
+
+    def _some_key_has(keys: list[str], tokens: tuple[str, ...]) -> bool:
+        return any(all(token in key for token in tokens) for key in keys)
+
+    for position, cells in enumerate(rows):
+        normalized = [_header_key(cell) for cell in cells]
+        if _some_key_has(normalized, legal_tokens) and _some_key_has(
+            normalized, title_tokens
+        ):
+            return position
+    raise ValueError("Could not find the OfS register header row")
+
+
+def _find_column(headers: list[object], *tokens: str) -> int:
+    """Return the index of the first header whose normalized text has every token.
+
+    Raises:
+        ValueError: If no header contains all of ``tokens``.
+    """
+    normalized = map(_header_key, headers)
+    hits = (
+        column
+        for column, key in enumerate(normalized)
+        if all(token in key for token in tokens)
+    )
+    found = next(hits, None)
+    if found is None:
+        raise ValueError(f"Could not find OfS register column containing {tokens}")
+    return found
+
+
+def _normalize_postcode(postcode: str) -> str:
+    """Uppercase a postcode and drop everything except A-Z and 0-9."""
+    uppercased = postcode.upper()
+    return re.sub(r"[^A-Z0-9]", "", uppercased)
+
+
+def _extract_postcode(address: str) -> str | None:
+    """Return the normalized postcode found in ``address``, or ``None``.
+
+    The last match is used rather than the first: the case-insensitive
+    pattern also accepts street fragments such as "A2 3rd" (as in
+    "Block A2 3rd Floor"), and the postcode normally comes at the end of
+    the address, so the final match is the more reliable one.
+    """
+    # With one capture group, findall returns the captured strings in order.
+    candidates = _POSTCODE_RE.findall(address)
+    if not candidates:
+        return None
+    return _normalize_postcode(candidates[-1])
+
+
+def _clean_provider_name(name: str) -> str:
+    """Tidy a provider name for display.
+
+    Collapses whitespace, moves a trailing ", the" to a leading "The ",
+    removes the first matching entry of ``_NOISY_PROVIDER_SUFFIXES``, and
+    finally drops a leading "The " unless the name is exactly
+    "The Open University".
+    """
+    cleaned = re.sub(r"\s+", " ", name).strip(" ,")
+    if cleaned.lower().endswith(", the"):
+        # ", the" is five characters; re-attach the article at the front.
+        cleaned = f"The {cleaned[:-5].strip(' ,')}"
+
+    lowered = cleaned.lower()
+    noisy = next(
+        (suffix for suffix in _NOISY_PROVIDER_SUFFIXES if lowered.endswith(suffix)),
+        None,
+    )
+    if noisy is not None:
+        cleaned = cleaned[: -len(noisy)].strip(" ,")
+
+    if cleaned != "The Open University" and cleaned.startswith("The "):
+        cleaned = cleaned[4:].strip()
+    return cleaned
+
+
+def _split_trading_names(trading_names: str) -> list[str]:
+    """Split a trading-names cell into cleaned, non-empty names (one per line).
+
+    An empty cell or the literal "not applicable" (any case) yields ``[]``.
+    """
+    if not trading_names:
+        return []
+    if trading_names.casefold() == "not applicable":
+        return []
+
+    cleaned_names: list[str] = []
+    for raw_line in trading_names.splitlines():
+        cleaned = _clean_provider_name(raw_line)
+        if cleaned:
+            cleaned_names.append(cleaned)
+    return cleaned_names
+
+
+def _needs_trading_name(legal_name: str) -> bool:
+    """Return whether a trading name should be preferred over ``legal_name``.
+
+    True when the lowercased legal name ends with one of
+    ``_NOISY_PROVIDER_SUFFIXES`` or contains one of
+    ``_LEGAL_NAME_FALLBACK_MARKERS``.
+    """
+    lowered = legal_name.lower()
+    # str.endswith accepts a tuple and is true if any suffix matches.
+    if lowered.endswith(_NOISY_PROVIDER_SUFFIXES):
+        return True
+    return any(marker in lowered for marker in _LEGAL_NAME_FALLBACK_MARKERS)
+
+
+def _select_university_name(legal_name: str, trading_names: str) -> str:
+    """Choose the display name for a provider.
+
+    When ``_needs_trading_name`` flags the legal name, the first trading name
+    mentioning "university" or "imperial college" wins, then the first
+    trading name of any kind. Otherwise, or when there are no trading names,
+    the cleaned legal name is returned.
+    """
+    if _needs_trading_name(legal_name):
+        candidates = _split_trading_names(trading_names)
+        preferred = next(
+            (
+                candidate
+                for candidate in candidates
+                if "university" in candidate.lower()
+                or "imperial college" in candidate.lower()
+            ),
+            None,
+        )
+        if preferred is not None:
+            return preferred
+        if candidates:
+            return candidates[0]
+    return _clean_provider_name(legal_name)
+
+
+def _slugify_name(name: str) -> str:
+    """Return a lowercase slug: a-z, 0-9 and hyphens, whitespace runs hyphenated."""
+    lowered = name.lower()
+    # Keep only letters, digits, spaces and hyphens before joining words.
+    kept = re.sub(r"[^a-z0-9 -]", "", lowered)
+    hyphenated = re.sub(r"\s+", "-", kept)
+    return hyphenated.strip("-")
+
+
+def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
+    """Map normalized postcodes to ``(lat, long)`` pairs from a postcode parquet.
+
+    Only rows with ``ctry25cd == "E92000001"`` and a null ``doterm`` are kept
+    (presumably live English postcodes; confirm against the dataset docs).
+    """
+    keep_row = (pl.col("ctry25cd") == "E92000001") & pl.col("doterm").is_null()
+    frame = pl.read_parquet(
+        postcodes_path,
+        columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
+    ).filter(keep_row)
+
+    lookup: dict[str, tuple[float, float]] = {}
+    for postcode, lat, lon in frame.select(["pcds", "lat", "long"]).iter_rows():
+        lookup[_normalize_postcode(postcode)] = (float(lat), float(lon))
+    return lookup
+
+
+def _ofs_universities(
+    raw: pl.DataFrame, postcode_coords: dict[str, tuple[float, float]]
+) -> tuple[list[dict], int]:
+    """Build university place records from the raw (headerless) register sheet.
+
+    Only rows whose "right to use university" cell reads "yes" (any case)
+    are considered. Returns the records together with the number of such
+    rows dropped because no name could be chosen or the address postcode
+    was not present in ``postcode_coords``.
+
+    Raises:
+        ValueError: If the header row or a required column cannot be found.
+    """
+    all_rows = raw.rows()
+    header_at = _find_header_row(all_rows)
+    header_cells = list(all_rows[header_at])
+    col_legal = _find_column(header_cells, "provider", "legal", "name")
+    col_trading = _find_column(header_cells, "trading", "name")
+    col_address = _find_column(header_cells, "contact", "address")
+    col_title = _find_column(header_cells, "right", "use", "university")
+
+    records: list[dict] = []
+    dropped = 0
+    for cells in all_rows[header_at + 1 :]:
+        if _cell_text(cells[col_title]).casefold() != "yes":
+            continue
+
+        display_name = _select_university_name(
+            _cell_text(cells[col_legal]), _cell_text(cells[col_trading])
+        )
+        postcode = _extract_postcode(_cell_text(cells[col_address]))
+        # A missing postcode is looked up as "" and so resolves to None.
+        located = postcode_coords.get(postcode or "")
+        if located is None or not display_name:
+            dropped += 1
+            continue
+
+        latitude, longitude = located
+        records.append(
+            {
+                "name": display_name,
+                "place_type": "university",
+                "lat": latitude,
+                "lon": longitude,
+                "population": 0,
+                "travel_destination": True,
+            }
+        )
+
+    return records, dropped
+
+
+def _append_ofs_universities(
+    places: list[dict], register_path: Path, postcodes_path: Path
+) -> tuple[int, int]:
+    """Append register universities to ``places`` in place, skipping slug clashes.
+
+    A university is not added when its slugified name matches an existing
+    place or a university appended earlier in the same call. Returns
+    ``(added, skipped)``, where ``skipped`` is the count reported by
+    ``_ofs_universities``.
+    """
+    coords_by_postcode = _postcode_lookup(postcodes_path)
+    sheet = pl.read_excel(register_path, has_header=False)
+    candidates, skipped = _ofs_universities(sheet, coords_by_postcode)
+
+    seen_slugs = {_slugify_name(str(place["name"])) for place in places}
+    count_before = len(places)
+    for candidate in candidates:
+        key = _slugify_name(candidate["name"])
+        if key not in seen_slugs:
+            seen_slugs.add(key)
+            places.append(candidate)
+    return len(places) - count_before, skipped
+
+
 def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
    """Extract station-level DLR destinations from NaPTAN access nodes."""
    df = pl.read_parquet(naptan_path)
@ -293,6 +471,16 @@ def main() -> None:
        type=Path,
        help="Optional NaPTAN parquet file used to add DLR station destinations",
    )
+    parser.add_argument(
+        "--university-register",
+        type=Path,
+        help="Optional OfS register spreadsheet used to add university destinations",
+    )
+    parser.add_argument(
+        "--postcodes",
+        type=Path,
+        help="Postcode parquet used to geocode OfS university contact postcodes",
+    )
    args = parser.parse_args()

    pbf_file = args.pbf
@ -313,6 +501,17 @@ def main() -> None:
    if args.naptan:
        added = _append_naptan_dlr_stations(handler.places, args.naptan)
        print(f"Added {added:,} DLR station destinations from NaPTAN")
+    if args.university_register:
+        if not args.postcodes:
+            raise ValueError("--postcodes is required with --university-register")
+        added, skipped = _append_ofs_universities(
+            handler.places, args.university_register, args.postcodes
+        )
+        print(
+            f"Added {added:,} university travel destinations from the OfS register"
+        )
+        if skipped:
+            print(f"Skipped {skipped:,} OfS university rows without usable coordinates")

    if handler.places:
        df = pl.DataFrame(handler.places)