Improve data

2026-06-10 07:54:25 +01:00 · 2026-06-10 07:54:25 +01:00 · 85da1941aa
commit 85da1941aa
parent b4d66a28c1
31 changed files with 901 additions and 319 deletions
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -33,6 +33,14 @@ DROP_CATEGORIES = {
    "emergency/water_tank",
    "leisure/bleachers",
    "leisure/schoolyard",
+    # Park "furniture" / incidental features — not parks; they massively
+    # inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
+    "leisure/bandstand",
+    "leisure/bird_hide",
+    "leisure/firepit",
+    "leisure/outdoor_seating",
+    "leisure/picnic_table",
+    "leisure/wildlife_hide",
    "public_transport/pay_scale_area",
    "shop/taxi",
    "amenity/feeding_place",
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
    "tourism/village_sign",
    "tourism/wilderness_hut",
    "tourism/yes",
-    # Public transport (from NaPTAN instead)
+    # Public transport (from NaPTAN instead). public_transport/platform is the
+    # EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
+    # authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
+    # NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
+    # a single stop. stop_position is left dropped to avoid double-counting the
+    # same stop (platform + stop_position).
    "public_transport/entrance",
-    "public_transport/platform",
    "public_transport/station",
    "public_transport/stop_position",
    # Education amenities — schools come from GIAS instead. OSM coverage for
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🌳",
        [
            "leisure/park",
+            # leisure/garden is dominated by private residential gardens (98%+
+            # unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
+            # so only named (public/notable) gardens count as a Park.
            "leisure/garden",
            "leisure/common",
            "leisure/nature_reserve",
            "leisure/dog_park",
-            "leisure/bandstand",
-            "leisure/bird_hide",
-            "leisure/firepit",
-            "leisure/outdoor_seating",
-            "leisure/picnic_table",
-            "leisure/wildlife_hide",
        ],
    ),
    (
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "leisure/sports_centre",
            "leisure/sports_hall",
+            # leisure/pitch (73% of the old bucket) and leisure/swimming_pool
+            # (98% unnamed = private/garden pools) are name-gated in transform()
+            # via REQUIRE_NAME_CATEGORIES so only named public facilities count.
            "leisure/pitch",
            "leisure/track",
            "leisure/golf_course",
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "amenity/townhall",
        ],
    ),
+    # ── Public transport (OSM supplement to NaPTAN) ──────────
+    # OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
+    # / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
+    # one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
+    # transform() (osm_stops_near_naptan).
+    (
+        "Public Transport",
+        "Bus stop",
+        "🚏",
+        [
+            "public_transport/platform",
+        ],
+    ),
 ]

+# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
+# These tags are overwhelmingly private/incidental when unnamed: a nameless
+# `leisure/garden` is a private residential garden (not a public park), and a
+# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
+# Keeping only named instances stops them inflating Park / Sports Centre counts
+# while preserving genuinely public, notable facilities (which carry a name).
+REQUIRE_NAME_CATEGORIES = {
+    "leisure/garden",  # unnamed = private residential garden, not a public park
+    "leisure/pitch",  # unnamed = e.g. a school cage
+    "leisure/practice_pitch",  # presumably gated for the same reason as pitch — confirm
+    "leisure/swimming_pool",  # unnamed = back-garden pool
+    "leisure/paddling_pool",  # presumably gated like swimming_pool — confirm
+}
+
+
 # Build flat lookup: OSM category → (group, friendly_name, emoji)
 CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
    osm_key: (group, name, emoji)
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    )


-def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
+def transform_gias_schools(
+    gias_path: Path, ofsted_path: Path, boundary_path: Path
+) -> pl.LazyFrame:
    """Convert the GIAS register parquet into POI rows with school metadata.
    Ofsted ratings are joined by URN so each school carries its latest OEIF
    overall effectiveness grade (Outstanding/Good/Requires improvement/
-    Inadequate/Not judged), surfaced in the map popup."""
+    Inadequate/Not judged), surfaced in the map popup.
+
+    Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
+    GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
+    England-only Education layer (and depress apparent Ofsted coverage, since
+    Wales is inspected by Estyn, not Ofsted)."""
    icon_category_expr = _school_icon_category_expr()
    emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
    ofsted = _load_ofsted_ratings(ofsted_path)
    # category mirrors icon_category so the dashboard renders one toggle per
    # school type (Nursery / Primary / Secondary / Sixth form / University /…)
    # instead of bundling every GIAS row under a single "School" pill.
-    return (
+    schools = (
        pl.scan_parquet(gias_path)
        .join(ofsted, on="urn", how="left")
        .select(
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
            pl.col("head_name").alias("school_head_name"),
            pl.col("ofsted_rating").alias("school_ofsted_rating"),
        )
+        .collect()
    )
+    mask = in_england_mask(
+        boundary_path,
+        schools["lat"].to_numpy(),
+        schools["lng"].to_numpy(),
+    )
+    return schools.filter(pl.Series(mask)).lazy()


 # OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
    return tokens


+# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
+# gaps. Where NaPTAN already has a stop within this radius the area is covered,
+# so the colocated OSM platform is dropped to avoid double-counting; OSM
+# platforms with no nearby NaPTAN stop (the gaps) are kept.
+BUS_STOP_DEDUP_RADIUS_M = 50.0
+
+
+def osm_stops_near_naptan(
+    osm_stops: pl.DataFrame,
+    naptan_stops: pl.DataFrame,
+    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
+) -> list[str]:
+    """Return OSM bus-stop ids within ``radius_m`` of any NaPTAN bus stop.
+
+    Purely spatial (no name match): in NaPTAN-covered areas the OSM platform is
+    a duplicate and is dropped; only OSM platforms that fill a NaPTAN gap (no
+    NaPTAN stop within the radius) survive. Both frames need ``id``/``lat``/``lng``;
+    an empty frame on either side yields ``[]``.
+    """
+    if osm_stops.is_empty() or naptan_stops.is_empty():
+        return []
+
+    from scipy.spatial import cKDTree
+
+    n_lat = naptan_stops["lat"].to_numpy().astype(float)
+    n_lng = naptan_stops["lng"].to_numpy().astype(float)
+    o_lat = osm_stops["lat"].to_numpy().astype(float)
+    o_lng = osm_stops["lng"].to_numpy().astype(float)
+    o_ids = osm_stops["id"].to_list()
+
+    # Equirectangular projection to metres (lng scaled by cos of the mean latitude).
+    mean_lat = float(np.mean(np.concatenate([n_lat, o_lat])))
+    cos_lat = float(np.cos(np.radians(mean_lat)))
+    n_xy = np.column_stack([n_lng * cos_lat * 111_320.0, n_lat * 110_540.0])
+    o_xy = np.column_stack([o_lng * cos_lat * 111_320.0, o_lat * 110_540.0])
+    # Nearest NaPTAN stop per OSM stop; pair ids with distances instead of indexing.
+    dist, _ = cKDTree(n_xy).query(o_xy, k=1)
+    return [osm_id for osm_id, d in zip(o_ids, dist) if d <= radius_m]
+
+
 def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
@ -1601,6 +1694,19 @@ def transform(
    # Drop unwanted categories
    lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))

+    # Drop UNNAMED instances of private-dominated tags (gardens, pitches,
+    # pools) so they don't inflate Park / Sports Centre proximity counts. Done
+    # while `category` still holds the raw OSM key, before the friendly mapping.
+    lf = lf.filter(
+        ~(
+            pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
+            & (
+                pl.col("name").is_null()
+                | (pl.col("name").cast(pl.String).str.strip_chars() == "")
+            )
+        )
+    )
+
    # Build lookup expressions from the 3-tuple mapping
    group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
    name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
@ -1665,11 +1771,37 @@ def transform(
            ~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
        )

+    # Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
+    # supplement only adds stops in NaPTAN's coverage gaps (no double-count in
+    # covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
+    # with NaPTAN ATCO ids.
+    osm_bus_stops = (
+        lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
+        .select("id", "lat", "lng")
+        .collect(engine="streaming")
+    )
+    naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
+    covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
+    kept_osm = osm_bus_stops.height - len(covered_bus_ids)
+    print(
+        f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
+        f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
+        f"{kept_osm:,} to fill NaPTAN gaps"
+    )
+    if covered_bus_ids:
+        lf = lf.filter(
+            ~(
+                (pl.col("group") == "Public Transport")
+                & (pl.col("category") == "Bus stop")
+                & pl.col("id").is_in(covered_bus_ids)
+            )
+        )
+
    frames = [
        lf,
        naptan,
        grocery_pois.lazy(),
-        transform_gias_schools(gias_path, ofsted_path),
+        transform_gias_schools(gias_path, ofsted_path, boundary_path),
    ]

    return pl.concat(frames, how="diagonal_relaxed")