Improve data
This commit is contained in:
parent
b4d66a28c1
commit
85da1941aa
31 changed files with 901 additions and 319 deletions
|
|
@ -33,6 +33,14 @@ DROP_CATEGORIES = {
|
|||
"emergency/water_tank",
|
||||
"leisure/bleachers",
|
||||
"leisure/schoolyard",
|
||||
# Park "furniture" / incidental features — not parks; they massively
|
||||
# inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
|
||||
"leisure/bandstand",
|
||||
"leisure/bird_hide",
|
||||
"leisure/firepit",
|
||||
"leisure/outdoor_seating",
|
||||
"leisure/picnic_table",
|
||||
"leisure/wildlife_hide",
|
||||
"public_transport/pay_scale_area",
|
||||
"shop/taxi",
|
||||
"amenity/feeding_place",
|
||||
|
|
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
|
|||
"tourism/village_sign",
|
||||
"tourism/wilderness_hut",
|
||||
"tourism/yes",
|
||||
# Public transport (from NaPTAN instead)
|
||||
# Public transport (from NaPTAN instead). public_transport/platform is the
|
||||
# EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
|
||||
# authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
|
||||
# NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
|
||||
# a single stop. stop_position is left dropped to avoid double-counting the
|
||||
# same stop (platform + stop_position).
|
||||
"public_transport/entrance",
|
||||
"public_transport/platform",
|
||||
"public_transport/station",
|
||||
"public_transport/stop_position",
|
||||
# Education amenities — schools come from GIAS instead. OSM coverage for
|
||||
|
|
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"🌳",
|
||||
[
|
||||
"leisure/park",
|
||||
# leisure/garden is dominated by private residential gardens (98%+
|
||||
# unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
|
||||
# so only named (public/notable) gardens count as a Park.
|
||||
"leisure/garden",
|
||||
"leisure/common",
|
||||
"leisure/nature_reserve",
|
||||
"leisure/dog_park",
|
||||
"leisure/bandstand",
|
||||
"leisure/bird_hide",
|
||||
"leisure/firepit",
|
||||
"leisure/outdoor_seating",
|
||||
"leisure/picnic_table",
|
||||
"leisure/wildlife_hide",
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
[
|
||||
"leisure/sports_centre",
|
||||
"leisure/sports_hall",
|
||||
# leisure/pitch (73% of the old bucket) and leisure/swimming_pool
|
||||
# (98% unnamed = private/garden pools) are name-gated in transform()
|
||||
# via REQUIRE_NAME_CATEGORIES so only named public facilities count.
|
||||
"leisure/pitch",
|
||||
"leisure/track",
|
||||
"leisure/golf_course",
|
||||
|
|
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"amenity/townhall",
|
||||
],
|
||||
),
|
||||
# ── Public transport (OSM supplement to NaPTAN) ──────────
|
||||
# OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
|
||||
# / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
|
||||
# one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
|
||||
# transform() (osm_stops_near_naptan).
|
||||
(
|
||||
"Public Transport",
|
||||
"Bus stop",
|
||||
"🚏",
|
||||
[
|
||||
"public_transport/platform",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
# These tags are overwhelmingly private/incidental when unnamed: a nameless
# `leisure/garden` is a private residential garden (not a public park), and a
# nameless `leisure/pitch`/`practice_pitch` or `swimming_pool`/`paddling_pool`
# is a school cage or back-garden pool. Keeping only named instances stops them
# inflating Park / Sports Centre counts while preserving genuinely public,
# notable facilities (which carry a name).
# Entries are raw OSM "key/value" category strings, matched against the
# `category` column before it is rewritten to the friendly name.
REQUIRE_NAME_CATEGORIES: set[str] = {
    "leisure/garden",
    "leisure/pitch",
    "leisure/practice_pitch",
    "leisure/swimming_pool",
    "leisure/paddling_pool",
}
|
||||
|
||||
|
||||
# Build flat lookup: OSM category → (group, friendly_name, emoji)
|
||||
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
|
||||
osm_key: (group, name, emoji)
|
||||
|
|
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
|||
)
|
||||
|
||||
|
||||
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
||||
def transform_gias_schools(
|
||||
gias_path: Path, ofsted_path: Path, boundary_path: Path
|
||||
) -> pl.LazyFrame:
|
||||
"""Convert the GIAS register parquet into POI rows with school metadata.
|
||||
Ofsted ratings are joined by URN so each school carries its latest OEIF
|
||||
overall effectiveness grade (Outstanding/Good/Requires improvement/
|
||||
Inadequate/Not judged), surfaced in the map popup."""
|
||||
Inadequate/Not judged), surfaced in the map popup.
|
||||
|
||||
Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
|
||||
GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
|
||||
England-only Education layer (and depress apparent Ofsted coverage, since
|
||||
Wales is inspected by Estyn, not Ofsted)."""
|
||||
icon_category_expr = _school_icon_category_expr()
|
||||
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
|
||||
ofsted = _load_ofsted_ratings(ofsted_path)
|
||||
# category mirrors icon_category so the dashboard renders one toggle per
|
||||
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
|
||||
# instead of bundling every GIAS row under a single "School" pill.
|
||||
return (
|
||||
schools = (
|
||||
pl.scan_parquet(gias_path)
|
||||
.join(ofsted, on="urn", how="left")
|
||||
.select(
|
||||
|
|
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
|||
pl.col("head_name").alias("school_head_name"),
|
||||
pl.col("ofsted_rating").alias("school_ofsted_rating"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
schools["lat"].to_numpy(),
|
||||
schools["lng"].to_numpy(),
|
||||
)
|
||||
return schools.filter(pl.Series(mask)).lazy()
|
||||
|
||||
|
||||
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
|
||||
|
|
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
|
|||
return tokens
|
||||
|
||||
|
||||
# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
# gaps. Where NaPTAN already has a stop within this radius the area is covered,
# so the colocated OSM platform is dropped to avoid double-counting; OSM
# platforms with no nearby NaPTAN stop (the gaps) are kept.
BUS_STOP_DEDUP_RADIUS_M = 50.0


def osm_stops_near_naptan(
    osm_stops: pl.DataFrame,
    naptan_stops: pl.DataFrame,
    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
) -> list[str]:
    """Return OSM bus-stop ids within ``radius_m`` of any NaPTAN bus stop.

    Purely spatial (no name match): in NaPTAN-covered areas the OSM platform is
    a duplicate and is dropped; only OSM platforms that fill a NaPTAN gap (no
    NaPTAN stop within the radius) survive.

    Args:
        osm_stops: OSM bus stops; needs ``id``/``lat``/``lng`` columns.
        naptan_stops: NaPTAN bus stops; needs ``lat``/``lng`` columns.
        radius_m: Match radius in metres (inclusive).

    Returns:
        The ``id`` values of ``osm_stops`` rows whose nearest NaPTAN stop is at
        most ``radius_m`` away, in input order. Empty when either frame is
        empty or has no row with finite coordinates. Rows whose ``lat``/``lng``
        are not finite are never reported as near.
    """
    if osm_stops.is_empty() or naptan_stops.is_empty():
        return []

    from scipy.spatial import cKDTree

    n_lat = naptan_stops["lat"].to_numpy().astype(float)
    n_lng = naptan_stops["lng"].to_numpy().astype(float)
    o_lat = osm_stops["lat"].to_numpy().astype(float)
    o_lng = osm_stops["lng"].to_numpy().astype(float)
    o_ids = osm_stops["id"].to_list()

    # A single NaN/inf coordinate would turn the mean latitude (and so every
    # projected distance) into NaN, so such rows are excluded up front.
    n_ok = np.isfinite(n_lat) & np.isfinite(n_lng)
    o_ok = np.isfinite(o_lat) & np.isfinite(o_lng)
    if not n_ok.any() or not o_ok.any():
        return []
    n_lat, n_lng = n_lat[n_ok], n_lng[n_ok]
    o_lat_ok, o_lng_ok = o_lat[o_ok], o_lng[o_ok]

    # Local flat projection: degrees are scaled to approximate metres, with
    # longitude shrunk by cos(mean latitude) so both axes share one unit and
    # Euclidean KD-tree distances can be compared against ``radius_m``.
    m_per_deg_lng = 111_320.0
    m_per_deg_lat = 110_540.0
    mean_lat = float(np.mean(np.concatenate([n_lat, o_lat_ok])))
    cos_lat = float(np.cos(np.radians(mean_lat)))
    n_xy = np.column_stack([n_lng * cos_lat * m_per_deg_lng, n_lat * m_per_deg_lat])
    o_xy = np.column_stack([o_lng_ok * cos_lat * m_per_deg_lng, o_lat_ok * m_per_deg_lat])

    # Nearest NaPTAN stop for every usable OSM stop.
    tree = cKDTree(n_xy)
    dist, _ = tree.query(o_xy, k=1)

    # Scatter the per-usable-row result back onto the full OSM row order.
    near = np.zeros(len(o_ids), dtype=bool)
    near[o_ok] = dist <= radius_m
    return [stop_id for stop_id, is_near in zip(o_ids, near.tolist()) if is_near]
|
||||
|
||||
|
||||
def osm_groceries_colocated_with_geolytix(
|
||||
osm_groceries: pl.DataFrame,
|
||||
geolytix: pl.DataFrame,
|
||||
|
|
@ -1601,6 +1694,19 @@ def transform(
|
|||
# Drop unwanted categories
|
||||
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
||||
|
||||
# Drop UNNAMED instances of private-dominated tags (gardens, pitches,
|
||||
# pools) so they don't inflate Park / Sports Centre proximity counts. Done
|
||||
# while `category` still holds the raw OSM key, before the friendly mapping.
|
||||
lf = lf.filter(
|
||||
~(
|
||||
pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
|
||||
& (
|
||||
pl.col("name").is_null()
|
||||
| (pl.col("name").cast(pl.String).str.strip_chars() == "")
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Build lookup expressions from the 3-tuple mapping
|
||||
group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
|
||||
name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
|
||||
|
|
@ -1665,11 +1771,37 @@ def transform(
|
|||
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
|
||||
)
|
||||
|
||||
# Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
|
||||
# supplement only adds stops in NaPTAN's coverage gaps (no double-count in
|
||||
# covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
|
||||
# with NaPTAN ATCO ids.
|
||||
osm_bus_stops = (
|
||||
lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
|
||||
.select("id", "lat", "lng")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
|
||||
covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
|
||||
kept_osm = osm_bus_stops.height - len(covered_bus_ids)
|
||||
print(
|
||||
f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
|
||||
f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
|
||||
f"{kept_osm:,} to fill NaPTAN gaps"
|
||||
)
|
||||
if covered_bus_ids:
|
||||
lf = lf.filter(
|
||||
~(
|
||||
(pl.col("group") == "Public Transport")
|
||||
& (pl.col("category") == "Bus stop")
|
||||
& pl.col("id").is_in(covered_bus_ids)
|
||||
)
|
||||
)
|
||||
|
||||
frames = [
|
||||
lf,
|
||||
naptan,
|
||||
grocery_pois.lazy(),
|
||||
transform_gias_schools(gias_path, ofsted_path),
|
||||
transform_gias_schools(gias_path, ofsted_path, boundary_path),
|
||||
]
|
||||
|
||||
return pl.concat(frames, how="diagonal_relaxed")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue