good

2026-05-12 22:30:36 +01:00 · 2026-05-12 22:30:36 +01:00 · 63713c3a2b
commit 63713c3a2b
parent 81a16f543c
15 changed files with 492 additions and 159 deletions
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@ -1,7 +1,8 @@
-"""Extract place=* nodes and railway stations from OSM PBF → data/places.parquet.
+"""Extract places, stations, and universities → data/places.parquet.

 Extracts named place nodes and railway stations (tube, national rail, DLR,
-etc.) for typeahead search.
+etc.) for typeahead search. Official English university providers from the
+Office for Students register can also be added as travel-time destinations.
 Reuses the same england-latest.osm.pbf as pois.py.
 """

@ -53,6 +54,19 @@ _STATION_STRIP = (
 )

 _DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
+# Postcode-shaped token: 1-2 letters, a digit, an optional letter/digit, optional
+# whitespace, then a digit and two letters. Matched case-insensitively.
+_POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.I)
+
+# Trailing legal-form wording that _clean_provider_name strips from names; a
+# legal name ending in one of these also makes _needs_trading_name return True.
+_NOISY_PROVIDER_SUFFIXES = (
+    " higher education corporation",
+    " limited",
+    " ltd",
+)
+
+# Lower-case fragments that, when present in a legal name, make
+# _needs_trading_name prefer a trading name over the legal one.
+_LEGAL_NAME_FALLBACK_MARKERS = (
+    "the chancellor",
+    "chancellor, masters",
+    "chancellor masters",
+)


 def _is_dlr_station(tags: dict[str, str]) -> bool:
@ -124,6 +138,170 @@ def _station_name_score(name: str) -> tuple[int, int]:
    return (suffix_penalty, len(name))


+def _cell_text(value: object) -> str:
+    """Return a spreadsheet cell as stripped text; ``None`` becomes ``""``."""
+    return "" if value is None else str(value).strip()
+
+
+def _header_key(value: object) -> str:
+    """Normalise a header cell for matching.
+
+    The cell text is lower-cased and every run of characters other than
+    ``a-z`` and ``0-9`` is replaced by a single space, then trimmed.
+    """
+    lowered = _cell_text(value).lower()
+    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
+    return spaced.strip()
+
+
+def _find_header_row(rows: list[tuple]) -> int:
+    """Return the index of the first row that looks like the OfS header row.
+
+    A row qualifies when one normalised cell contains all of "provider",
+    "legal" and "name", and one normalised cell contains all of "right",
+    "use" and "university".
+
+    Raises:
+        ValueError: If no row qualifies.
+    """
+
+    def _row_mentions(cells: list[str], tokens: tuple[str, ...]) -> bool:
+        # True when a single cell contains every token.
+        return any(all(token in cell for token in tokens) for cell in cells)
+
+    for position, row in enumerate(rows):
+        cells = [_header_key(cell) for cell in row]
+        if not _row_mentions(cells, ("provider", "legal", "name")):
+            continue
+        if _row_mentions(cells, ("right", "use", "university")):
+            return position
+    raise ValueError("Could not find the OfS register header row")
+
+
+def _find_column(headers: list[object], *tokens: str) -> int:
+    """Return the index of the first header whose normalised text has every token.
+
+    Raises:
+        ValueError: If no header contains all of *tokens*.
+    """
+    normalised = map(_header_key, headers)
+    hits = (
+        position
+        for position, key in enumerate(normalised)
+        if all(token in key for token in tokens)
+    )
+    first_hit = next(hits, None)
+    if first_hit is None:
+        raise ValueError(f"Could not find OfS register column containing {tokens}")
+    return first_hit
+
+
+def _normalize_postcode(postcode: str) -> str:
+    """Upper-case *postcode* and drop everything except ``A-Z`` and ``0-9``."""
+    uppercased = postcode.upper()
+    return re.sub(r"[^A-Z0-9]", "", uppercased)
+
+
+def _extract_postcode(address: str) -> str | None:
+    """Return the normalised postcode found in *address*, or ``None`` if absent.
+
+    The pattern is case-insensitive, so earlier address fragments such as
+    "A1 2nd" in "Flat A1 2nd Floor" are postcode-shaped too. The last match is
+    therefore used rather than the first.
+    NOTE(review): assumes the real postcode is the last such token in the
+    contact address (as in the register rows used by the tests) - confirm.
+    """
+    candidates = _POSTCODE_RE.findall(address)
+    if not candidates:
+        return None
+    return _normalize_postcode(candidates[-1])
+
+
+def _clean_provider_name(name: str) -> str:
+    """Tidy a provider name for display.
+
+    Whitespace runs are collapsed, a trailing ", the" is moved to the front as
+    "The ", the first matching noisy legal suffix is removed, and finally a
+    leading "The " is dropped unless the name is exactly "The Open University".
+    """
+    cleaned = re.sub(r"\s+", " ", name).strip(" ,")
+    if cleaned.lower().endswith(", the"):
+        cleaned = "The " + cleaned[:-5].strip(" ,")
+
+    lowered = cleaned.lower()
+    noisy = next(
+        (suffix for suffix in _NOISY_PROVIDER_SUFFIXES if lowered.endswith(suffix)),
+        None,
+    )
+    if noisy is not None:
+        cleaned = cleaned[: -len(noisy)].strip(" ,")
+
+    if cleaned.startswith("The ") and cleaned != "The Open University":
+        cleaned = cleaned[4:].strip()
+    return cleaned
+
+
+def _split_trading_names(trading_names: str) -> list[str]:
+    """Split a multi-line trading-names cell into cleaned, non-empty names.
+
+    An empty cell or the placeholder "not applicable" (any case) yields ``[]``.
+    Each remaining line is passed through ``_clean_provider_name``; lines that
+    clean down to an empty string are dropped.
+    """
+    if not trading_names or trading_names.casefold() == "not applicable":
+        return []
+    names: list[str] = []
+    for line in trading_names.splitlines():
+        # Clean once and reuse the result for both the filter and the output.
+        cleaned = _clean_provider_name(line)
+        if cleaned:
+            names.append(cleaned)
+    return names
+
+
+def _needs_trading_name(legal_name: str) -> bool:
+    """Return True when *legal_name* is too formal to show as-is.
+
+    That is the case when its lower-cased form ends with a noisy legal suffix
+    or contains one of the legal-name fallback markers.
+    """
+    lowered = legal_name.lower()
+    if lowered.endswith(_NOISY_PROVIDER_SUFFIXES):
+        return True
+    return any(marker in lowered for marker in _LEGAL_NAME_FALLBACK_MARKERS)
+
+
+def _select_university_name(legal_name: str, trading_names: str) -> str:
+    """Choose the display name for a provider.
+
+    The cleaned legal name is used unless ``_needs_trading_name`` flags it and
+    at least one trading name exists. In that case the first trading name
+    containing "university" or "imperial college" wins, falling back to the
+    first trading name.
+    """
+    cleaned_legal = _clean_provider_name(legal_name)
+    candidates = _split_trading_names(trading_names)
+    if not _needs_trading_name(legal_name) or not candidates:
+        return cleaned_legal
+
+    for candidate in candidates:
+        lowered = candidate.lower()
+        if "university" in lowered or "imperial college" in lowered:
+            return candidate
+    return candidates[0]
+
+
+def _slugify_name(name: str) -> str:
+    """Turn *name* into a lower-case, hyphen-separated slug.
+
+    Characters other than ``a-z``, ``0-9``, space and hyphen are removed,
+    whitespace runs become a single hyphen, and edge hyphens are trimmed.
+    """
+    kept = re.sub(r"[^a-z0-9 -]", "", name.lower())
+    hyphenated = re.sub(r"\s+", "-", kept)
+    return hyphenated.strip("-")
+
+
+def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
+    """Load a ``normalised postcode -> (lat, lon)`` mapping from a parquet file.
+
+    Only rows with ``ctry25cd == "E92000001"`` and a null ``doterm`` are kept.
+    NOTE(review): presumably that selects live postcodes in England - confirm
+    against the postcode dataset's documentation.
+    """
+    in_country = pl.col("ctry25cd") == "E92000001"
+    not_terminated = pl.col("doterm").is_null()
+    kept = pl.read_parquet(
+        postcodes_path,
+        columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
+    ).filter(in_country & not_terminated)
+
+    lookup: dict[str, tuple[float, float]] = {}
+    for pcds, lat, lon in kept.select(["pcds", "lat", "long"]).iter_rows():
+        lookup[_normalize_postcode(pcds)] = (float(lat), float(lon))
+    return lookup
+
+
+def _ofs_universities(
+    raw: pl.DataFrame, postcode_coords: dict[str, tuple[float, float]]
+) -> tuple[list[dict], int]:
+    """Build university place records from a header-less OfS register frame.
+
+    Rows up to and including the detected header row are ignored. Only rows
+    whose "right to use university" cell reads "yes" (case-insensitive) are
+    considered. Each is named via ``_select_university_name`` and located by
+    looking up the postcode extracted from its contact address.
+
+    Returns:
+        ``(universities, skipped)``, where ``skipped`` counts qualifying rows
+        dropped because the name was empty or no coordinates were found.
+
+    Raises:
+        ValueError: If the header row or a required column cannot be found.
+    """
+    table = raw.rows()
+    header_at = _find_header_row(table)
+    header_cells = list(table[header_at])
+    legal_col = _find_column(header_cells, "provider", "legal", "name")
+    trading_col = _find_column(header_cells, "trading", "name")
+    address_col = _find_column(header_cells, "contact", "address")
+    title_col = _find_column(header_cells, "right", "use", "university")
+
+    found: list[dict] = []
+    dropped = 0
+    for record in table[header_at + 1 :]:
+        has_title = _cell_text(record[title_col]).casefold() == "yes"
+        if not has_title:
+            continue
+
+        display_name = _select_university_name(
+            _cell_text(record[legal_col]), _cell_text(record[trading_col])
+        )
+        postcode = _extract_postcode(_cell_text(record[address_col]))
+        # A missing postcode is looked up as "" so it simply misses the mapping.
+        location = postcode_coords.get(postcode or "")
+        if display_name and location is not None:
+            lat, lon = location
+            found.append(
+                {
+                    "name": display_name,
+                    "place_type": "university",
+                    "lat": lat,
+                    "lon": lon,
+                    "population": 0,
+                    "travel_destination": True,
+                }
+            )
+        else:
+            dropped += 1
+
+    return found, dropped
+
+
+def _append_ofs_universities(
+    places: list[dict], register_path: Path, postcodes_path: Path
+) -> tuple[int, int]:
+    """Append OfS universities to *places*, skipping names already present.
+
+    Names are compared by slug, both against the existing places and among the
+    universities themselves, so a repeated name is added only once. *places*
+    is mutated in place.
+
+    Returns:
+        ``(added, skipped)``: the number of records appended and the skip count
+        reported by ``_ofs_universities``.
+    """
+    coords_by_postcode = _postcode_lookup(postcodes_path)
+    register = pl.read_excel(register_path, has_header=False)
+    candidates, skipped = _ofs_universities(register, coords_by_postcode)
+
+    seen_slugs = {_slugify_name(str(place["name"])) for place in places}
+    count_before = len(places)
+    for candidate in candidates:
+        slug = _slugify_name(candidate["name"])
+        if slug not in seen_slugs:
+            seen_slugs.add(slug)
+            places.append(candidate)
+    return len(places) - count_before, skipped
+
+
 def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
    """Extract station-level DLR destinations from NaPTAN access nodes."""
    df = pl.read_parquet(naptan_path)
@ -293,6 +471,16 @@ def main() -> None:
        type=Path,
        help="Optional NaPTAN parquet file used to add DLR station destinations",
    )
+    parser.add_argument(
+        "--university-register",
+        type=Path,
+        help="Optional OfS register spreadsheet used to add university destinations",
+    )
+    parser.add_argument(
+        "--postcodes",
+        type=Path,
+        help="Postcode parquet used to geocode OfS university contact postcodes",
+    )
    args = parser.parse_args()

    pbf_file = args.pbf
@ -313,6 +501,17 @@ def main() -> None:
    if args.naptan:
        added = _append_naptan_dlr_stations(handler.places, args.naptan)
        print(f"Added {added:,} DLR station destinations from NaPTAN")
+    if args.university_register:
+        if not args.postcodes:
+            raise ValueError("--postcodes is required with --university-register")
+        added, skipped = _append_ofs_universities(
+            handler.places, args.university_register, args.postcodes
+        )
+        print(
+            f"Added {added:,} university travel destinations from the OfS register"
+        )
+        if skipped:
+            print(f"Skipped {skipped:,} OfS university rows without usable coordinates")

    if handler.places:
        df = pl.DataFrame(handler.places)
--- a/pipeline/download/test_places.py
+++ b/pipeline/download/test_places.py
@ -4,6 +4,8 @@ from pipeline.download.places import (
    _is_dlr_station,
    _is_tram_station,
    _naptan_dlr_stations,
+    _ofs_universities,
+    _select_university_name,
    _station_display_name,
 )

@ -79,3 +81,68 @@ def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
    assert shadwell["lat"] == (51.51156 + 51.511693) / 2
    assert shadwell["place_type"] == "station"
    assert shadwell["travel_destination"] is True
+
+
+def test_select_university_name_prefers_public_trading_name_for_noisy_legal_name():
+    """Noisy legal names fall back to a trading name; clean ones are kept."""
+    cases = [
+        (
+            "The Chancellor, Masters and Scholars of the University of Oxford",
+            "Oxford University\nThe University of Oxford",
+            "Oxford University",
+        ),
+        (
+            "Bournemouth University Higher Education Corporation",
+            "Bournemouth University",
+            "Bournemouth University",
+        ),
+        ("The University of Surrey", "Not applicable", "University of Surrey"),
+    ]
+    for legal_name, trading_names, expected in cases:
+        assert _select_university_name(legal_name, trading_names) == expected
+
+
+def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
+    """Only the "Yes" row below the header is returned, with looked-up coords."""
+    header_row = [
+        "Provider's legal name",
+        "Provider's trading name(s)",
+        "Provider's contact address",
+        "Does the provider have the right to use university in its title?",
+    ]
+    oxford_row = [
+        "The Chancellor, Masters and Scholars of the University of Oxford",
+        "Oxford University\nThe University of Oxford",
+        "University Offices\nWellington Square\nOxford\nOX1 2JD\nUnited Kingdom",
+        "Yes",
+    ]
+    college_row = [
+        "Example College",
+        "Not applicable",
+        "Example Street\nLondon\nSW1A 1AA\nUnited Kingdom",
+        "No",
+    ]
+    raw_register = pl.DataFrame(
+        [
+            ["OfS Register", None, None, None],
+            ["Note row", None, None, None],
+            header_row,
+            oxford_row,
+            college_row,
+        ],
+        orient="row",
+    )
+    postcode_coords = {"OX12JD": (51.7585, -1.2643)}
+
+    universities, skipped = _ofs_universities(raw_register, postcode_coords)
+
+    expected_oxford = {
+        "name": "Oxford University",
+        "place_type": "university",
+        "lat": 51.7585,
+        "lon": -1.2643,
+        "population": 0,
+        "travel_destination": True,
+    }
+    assert skipped == 0
+    assert universities == [expected_oxford]
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -1,8 +1,6 @@
 import polars as pl

 from pipeline.transform.merge import (
-    _AREA_COLUMNS,
-    _STATIC_POI_DISTANCE_RENAMES,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
 )
@ -11,9 +9,11 @@ from pipeline.transform.merge import (
 def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    df = pl.DataFrame({"Income Score (rate)": [1.0, 2.0, 3.0, None]})

-    result = df.lazy().with_columns(
-        _less_deprived_percentile_expr("Income Score (rate)")
-    ).collect()
+    result = (
+        df.lazy()
+        .with_columns(_less_deprived_percentile_expr("Income Score (rate)"))
+        .collect()
+    )

    assert result["Income Score (rate)"].to_list() == [100.0, 50.0, 0.0, None]

@ -21,28 +21,18 @@ def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
 def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    df = pl.DataFrame({"Income Score (rate)": [1.0, 1.0, 2.0, 3.0, 3.0]})

-    result = df.lazy().with_columns(
-        _less_deprived_percentile_expr("Income Score (rate)")
-    ).collect()
+    result = (
+        df.lazy()
+        .with_columns(_less_deprived_percentile_expr("Income Score (rate)"))
+        .collect()
+    )

    assert result["Income Score (rate)"].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]


 def test_dynamic_poi_metric_columns_are_area_level() -> None:
+    # "amenity (...)" distance and count columns must be recognised as dynamic
+    # POI metrics; the plain "Number of restaurants within 2km" column must not.
     assert _is_dynamic_poi_metric_column("Distance to nearest amenity (Cafe) (km)")
+    assert _is_dynamic_poi_metric_column("Distance to nearest amenity (Park) (km)")
     assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 2km")
     assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 5km")
     assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
-
-
-def test_static_poi_distance_columns_are_renamed_to_configured_area_features() -> None:
-    expected = {
-        "parks_nearest_km": "Distance to nearest park (km)",
-        "grocery_store_nearest_km": "Distance to nearest grocery store (km)",
-        "cafe_nearest_km": "Distance to nearest cafe (km)",
-        "pub_nearest_km": "Distance to nearest pub (km)",
-        "restaurant_nearest_km": "Distance to nearest restaurant (km)",
-    }
-
-    assert _STATIC_POI_DISTANCE_RENAMES == expected
-    assert set(expected.values()).issubset(_AREA_COLUMNS)
--- a/pipeline/utils/test_fuzzy_join.py
+++ b/pipeline/utils/test_fuzzy_join.py
@ -2,45 +2,72 @@ import polars as pl

 from pipeline.utils import fuzzy_join_on_postcode

-POSTCODE = "E14 2DG"

-# Price paid: unique addresses for this postcode
-pp = (
-    pl.scan_parquet("data/price-paid-complete.parquet")
-    .filter(pl.col("postcode") == POSTCODE)
-    .select("paon", "saon", "street", "postcode")
-    .unique()
-    .sort("saon")
-    .with_columns(
-        pl.concat_str(
-            [pl.col("saon"), pl.col("paon"), pl.col("street")],
-            separator=" ",
-            ignore_nulls=True,
-        ).alias("pp_address"),
+def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
+    left = pl.LazyFrame(
+        {
+            "left_id": ["flat", "house", "unmatched"],
+            "left_address": [
+                "Flat 2, 10 High Street",
+                "12 High Street",
+                "99 Other Road",
+            ],
+            "left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_id": ["flat_epc", "house_epc", "other_postcode"],
+            "right_address": [
+                "10 HIGH STREET FLAT 2",
+                "12 High-Street",
+                "99 Other Road",
+            ],
+            "right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
+        }
    )
-)

-# EPC: latest inspection per address for this postcode
-epc = (
-    pl.scan_csv("data/epc/certificates.csv")
-    .select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
-    .filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
-    .sort("INSPECTION_DATE", descending=True)
-    .unique("ADDRESS")
-    .sort("ADDRESS")
-)
+    result = (
+        fuzzy_join_on_postcode(
+            left=left,
+            right=right,
+            left_address_col="left_address",
+            right_address_col="right_address",
+            left_postcode_col="left_postcode",
+            right_postcode_col="right_postcode",
+        )
+        .sort("left_id")
+        .collect()
+    )

-result = fuzzy_join_on_postcode(
-    left=pp,
-    right=epc,
-    left_address_col="pp_address",
-    right_address_col="ADDRESS",
-    left_postcode_col="postcode",
-    right_postcode_col="POSTCODE",
-).collect()
+    assert result.select("left_id", "right_id").to_dicts() == [
+        {"left_id": "flat", "right_id": "flat_epc"},
+        {"left_id": "house", "right_id": "house_epc"},
+        {"left_id": "unmatched", "right_id": None},
+    ]

-snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")

-print("Testing the matching between EPC and PP addresses")
-with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
-    print(snapshot)
+def test_fuzzy_join_on_postcode_requires_matching_numbers():
+    """"10 High Street" must not match "11 High Street" in the same postcode."""
+    left = pl.LazyFrame(
+        {
+            "left_address": ["10 High Street"],
+            "left_postcode": ["AB1 2CD"],
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "right_address": ["11 High Street"],
+            "right_postcode": ["AB1 2CD"],
+        }
+    )
+
+    joined = fuzzy_join_on_postcode(
+        left=left,
+        right=right,
+        left_address_col="left_address",
+        right_address_col="right_address",
+        left_postcode_col="left_postcode",
+        right_postcode_col="right_postcode",
+    )
+    matched_addresses = joined.collect()["right_address"].to_list()
+
+    assert matched_addresses == [None]