Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -86,6 +86,28 @@ DROP_CATEGORIES = {
"amenity/water_point",
"amenity/watering_place",
"amenity/weighbridge",
# Boating/cycle-hire infrastructure formerly miscategorised as
# "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
# ramps and moorings are not entertainment venues.
"amenity/bicycle_rental",
"amenity/boat_rental",
"leisure/marina",
"leisure/slipway",
# Public art (statues, murals, village signs) formerly made up 93% of "Gallery".
"tourism/artwork",
# Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
# "Gym & Fitness".
"leisure/fitness_station",
# Untyped healthcare rows and non-pharmacy health shops formerly bucketed
# under "Hospital & Clinic" / "Pharmacy".
"healthcare/yes",
"healthcare/alternative",
"shop/herbalist",
"shop/health",
# Street fountains and courthouses formerly bucketed as
# "Tourist Attraction".
"amenity/fountain",
"amenity/courthouse",
# Niche amenities not useful for home buyers
"amenity/animal_boarding",
"amenity/animal_breeding",
@ -373,10 +395,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/tanning_salon",
"shop/amusements",
"tourism/theme_park",
"amenity/bicycle_rental",
"amenity/boat_rental",
"leisure/marina",
"leisure/slipway",
# bicycle_rental/boat_rental/marina/slipway used to live here and
# made up ~46% of the bucket (cycle-hire docks, boat ramps); they
# are infrastructure, not entertainment venues — see DROP_CATEGORIES.
"leisure/hackerspace",
"leisure/yes",
],
@ -699,7 +720,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🏋️",
[
"leisure/fitness_centre",
"leisure/fitness_station",
# leisure/fitness_station (outdoor pull-up bars / trim-trail
# apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
"amenity/dojo",
"amenity/dancing_school",
],
@ -825,28 +847,37 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"amenity/pharmacy",
"healthcare/pharmacy",
"shop/chemist",
"shop/herbalist",
"shop/health",
"healthcare/alternative",
# healthcare/alternative, shop/herbalist and shop/health (homeopaths,
# herbalists, generic "health" shops) are not dispensing pharmacies
# — see DROP_CATEGORIES.
],
),
# "Hospital & Clinic" used to be one bucket; an actual hospital and a small
# clinic are very different amenities for a homebuyer, so they are split.
(
"Health",
"Hospital",
"🏥",
[
"amenity/hospital",
"healthcare/hospital",
],
),
(
"Health",
"Hospital & Clinic",
"🏥",
"Clinic",
"🩺",
[
"amenity/hospital",
"amenity/clinic",
"amenity/health_centre",
"healthcare/blood_donation",
"healthcare/hospital",
"healthcare/centre",
"healthcare/clinic",
"office/healthcare",
"healthcare/laboratory",
"healthcare/rehabilitation",
"healthcare/vaccination_centre",
"healthcare/yes",
# healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
],
),
(
@ -917,7 +948,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🖼️",
[
"tourism/gallery",
"tourism/artwork",
# tourism/artwork (statues, murals, village signs) was 93% of this
# bucket and is not a visitable gallery — see DROP_CATEGORIES.
],
),
(
@ -961,9 +993,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"tourism/attraction",
"tourism/aquarium",
"amenity/fountain",
"amenity/courthouse",
"tourism/chalet",
# amenity/fountain (street furniture) and amenity/courthouse are
# dropped; tourism/chalet (holiday lets) moved to "Hotel".
],
),
# Note: schools come from the GIAS register (see transform_gias_schools).
@ -982,6 +1013,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/resort",
"tourism/holiday_park",
"tourism/self_catering",
# Holiday-let chalets are accommodation, not tourist attractions
# (where they previously sat).
"tourism/chalet",
],
),
(
@ -1162,6 +1196,11 @@ REQUIRE_NAME_CATEGORIES = {
"leisure/practice_pitch",
"leisure/swimming_pool",
"leisure/paddling_pool",
# Features in these categories are 83-84% unnamed (anonymous running tracks,
# private gallops/paddocks and fishing spots); only named public facilities
# count as a Sports Centre.
"leisure/track",
"leisure/horse_riding",
"leisure/fishing",
}
@ -1181,6 +1220,7 @@ NAPTAN_EMOJIS: dict[str, str] = {
"Bus station": "🚌",
"Taxi rank": "🚕",
"Tube station": "🚇",
"Tram & Metro stop": "🚊",
}
@ -1438,9 +1478,9 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
report-card framework) we fall back to "Ungraded inspection overall outcome"
so genuinely good/outstanding schools aren't dropped — mirroring
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
school_catchments.classify_good_plus_schools. Remaining nulls drop out."""
grade_col = pl.col("Latest OEIF overall effectiveness")
# See school_proximity: the ungraded outcome carries "School remains Good"/
# See school_catchments: the ungraded outcome carries "School remains Good"/
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
# suffixes) when the graded column is null/"Not judged".
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)