Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@ -33,6 +33,14 @@ DROP_CATEGORIES = {
"emergency/water_tank",
"leisure/bleachers",
"leisure/schoolyard",
# Park "furniture" / incidental features — not parks; they massively
# inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
"leisure/bandstand",
"leisure/bird_hide",
"leisure/firepit",
"leisure/outdoor_seating",
"leisure/picnic_table",
"leisure/wildlife_hide",
"public_transport/pay_scale_area",
"shop/taxi",
"amenity/feeding_place",
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
"tourism/village_sign",
"tourism/wilderness_hut",
"tourism/yes",
# Public transport (from NaPTAN instead)
# Public transport (from NaPTAN instead). public_transport/platform is the
# EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
# authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
# NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
# a single stop. stop_position is left dropped to avoid double-counting the
# same stop (platform + stop_position).
"public_transport/entrance",
"public_transport/platform",
"public_transport/station",
"public_transport/stop_position",
# Education amenities — schools come from GIAS instead. OSM coverage for
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🌳",
[
"leisure/park",
# leisure/garden is dominated by private residential gardens (98%+
# unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
# so only named (public/notable) gardens count as a Park.
"leisure/garden",
"leisure/common",
"leisure/nature_reserve",
"leisure/dog_park",
"leisure/bandstand",
"leisure/bird_hide",
"leisure/firepit",
"leisure/outdoor_seating",
"leisure/picnic_table",
"leisure/wildlife_hide",
],
),
(
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"leisure/sports_centre",
"leisure/sports_hall",
# leisure/pitch (73% of the old bucket) and leisure/swimming_pool
# (98% unnamed = private/garden pools) are name-gated in transform()
# via REQUIRE_NAME_CATEGORIES so only named public facilities count.
"leisure/pitch",
"leisure/track",
"leisure/golf_course",
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"amenity/townhall",
],
),
# ── Public transport (OSM supplement to NaPTAN) ──────────
# OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
# / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
# one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
# transform() (osm_stops_near_naptan).
(
"Public Transport",
"Bus stop",
"🚏",
[
"public_transport/platform",
],
),
]
# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
# These tags are overwhelmingly private/incidental when unnamed: a nameless
# `leisure/garden` is a private residential garden (not a public park), and a
# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
# `leisure/practice_pitch` and `leisure/paddling_pool` are gated the same way
# as their pitch / pool counterparts.
# Keeping only named instances stops them inflating Park / Sports Centre counts
# while preserving genuinely public, notable facilities (which carry a name).
# Entries are raw OSM "key/value" category strings, so the gate has to be
# applied while `category` still holds the raw key (i.e. before the friendly
# name mapping replaces it).
REQUIRE_NAME_CATEGORIES = {
"leisure/garden",
"leisure/pitch",
"leisure/practice_pitch",
"leisure/swimming_pool",
"leisure/paddling_pool",
}
# Build flat lookup: OSM category → (group, friendly_name, emoji)
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
osm_key: (group, name, emoji)
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
)
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
def transform_gias_schools(
gias_path: Path, ofsted_path: Path, boundary_path: Path
) -> pl.LazyFrame:
"""Convert the GIAS register parquet into POI rows with school metadata.
Ofsted ratings are joined by URN so each school carries its latest OEIF
overall effectiveness grade (Outstanding/Good/Requires improvement/
Inadequate/Not judged), surfaced in the map popup."""
Inadequate/Not judged), surfaced in the map popup.
Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
England-only Education layer (and depress apparent Ofsted coverage, since
Wales is inspected by Estyn, not Ofsted)."""
icon_category_expr = _school_icon_category_expr()
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
ofsted = _load_ofsted_ratings(ofsted_path)
# category mirrors icon_category so the dashboard renders one toggle per
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
# instead of bundling every GIAS row under a single "School" pill.
return (
schools = (
pl.scan_parquet(gias_path)
.join(ofsted, on="urn", how="left")
.select(
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
pl.col("head_name").alias("school_head_name"),
pl.col("ofsted_rating").alias("school_ofsted_rating"),
)
.collect()
)
mask = in_england_mask(
boundary_path,
schools["lat"].to_numpy(),
schools["lng"].to_numpy(),
)
return schools.filter(pl.Series(mask)).lazy()
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
return tokens
# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
# gaps. Where NaPTAN already has a stop within this radius the area is covered,
# so the colocated OSM platform is dropped to avoid double-counting; OSM
# platforms with no nearby NaPTAN stop (the gaps) are kept.
# Value is in metres and is the default `radius_m` of osm_stops_near_naptan();
# the comparison there is inclusive (distance <= radius).
BUS_STOP_DEDUP_RADIUS_M = 50.0
def osm_stops_near_naptan(
    osm_stops: pl.DataFrame,
    naptan_stops: pl.DataFrame,
    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
) -> list[str]:
    """Return OSM bus-stop ids within ``radius_m`` of any NaPTAN bus stop.

    Purely spatial (no name match): in NaPTAN-covered areas the OSM platform is
    a duplicate and is dropped; only OSM platforms that fill a NaPTAN gap (no
    NaPTAN stop within the radius) survive.

    Args:
        osm_stops: OSM candidates; must have ``id``, ``lat`` and ``lng``.
        naptan_stops: NaPTAN reference stops; only ``lat`` and ``lng`` are read.
        radius_m: Match radius in metres (inclusive).

    Returns:
        Ids of the OSM stops whose nearest NaPTAN stop lies within
        ``radius_m``, in ``osm_stops`` row order. Empty when either frame is
        empty (with no NaPTAN stops nothing can be a duplicate).
    """
    if osm_stops.is_empty() or naptan_stops.is_empty():
        return []

    # Imported lazily so scipy is only needed when there is work to do.
    from scipy.spatial import cKDTree

    # Approximate metres per degree used for the flat-earth projection below.
    metres_per_deg_lng = 111_320.0
    metres_per_deg_lat = 110_540.0

    n_lat = naptan_stops["lat"].to_numpy().astype(float)
    n_lng = naptan_stops["lng"].to_numpy().astype(float)
    o_lat = osm_stops["lat"].to_numpy().astype(float)
    o_lng = osm_stops["lng"].to_numpy().astype(float)
    o_ids = osm_stops["id"].to_list()

    # One shared longitude scale (cosine of the mean latitude over both point
    # sets) keeps both frames in the same planar metre coordinates, so plain
    # Euclidean distance in the KD-tree can be compared against radius_m.
    mean_lat = float(np.mean(np.concatenate([n_lat, o_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))
    n_xy = np.column_stack(
        [n_lng * cos_lat * metres_per_deg_lng, n_lat * metres_per_deg_lat]
    )
    o_xy = np.column_stack(
        [o_lng * cos_lat * metres_per_deg_lng, o_lat * metres_per_deg_lat]
    )

    # Distance from every OSM stop to its single nearest NaPTAN stop.
    nearest_m, _ = cKDTree(n_xy).query(o_xy, k=1)
    return [
        osm_id
        for osm_id, distance in zip(o_ids, nearest_m)
        if distance <= radius_m
    ]
def osm_groceries_colocated_with_geolytix(
osm_groceries: pl.DataFrame,
geolytix: pl.DataFrame,
@ -1601,6 +1694,19 @@ def transform(
# Drop unwanted categories
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
# Drop UNNAMED instances of private-dominated tags (gardens, pitches,
# pools) so they don't inflate Park / Sports Centre proximity counts. Done
# while `category` still holds the raw OSM key, before the friendly mapping.
lf = lf.filter(
~(
pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
& (
pl.col("name").is_null()
| (pl.col("name").cast(pl.String).str.strip_chars() == "")
)
)
)
# Build lookup expressions from the 3-tuple mapping
group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
@ -1665,11 +1771,37 @@ def transform(
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
)
# Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
# supplement only adds stops in NaPTAN's coverage gaps (no double-count in
# covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
# with NaPTAN ATCO ids.
osm_bus_stops = (
lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
.select("id", "lat", "lng")
.collect(engine="streaming")
)
naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
kept_osm = osm_bus_stops.height - len(covered_bus_ids)
print(
f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
f"{kept_osm:,} to fill NaPTAN gaps"
)
if covered_bus_ids:
lf = lf.filter(
~(
(pl.col("group") == "Public Transport")
& (pl.col("category") == "Bus stop")
& pl.col("id").is_in(covered_bus_ids)
)
)
frames = [
lf,
naptan,
grocery_pois.lazy(),
transform_gias_schools(gias_path, ofsted_path),
transform_gias_schools(gias_path, ofsted_path, boundary_path),
]
return pl.concat(frames, how="diagonal_relaxed")