Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -86,6 +86,28 @@ DROP_CATEGORIES = {
|
|||
"amenity/water_point",
|
||||
"amenity/watering_place",
|
||||
"amenity/weighbridge",
|
||||
# Boating/cycle-hire infrastructure formerly miscategorised as
|
||||
# "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
|
||||
# ramps and moorings are not entertainment venues.
|
||||
"amenity/bicycle_rental",
|
||||
"amenity/boat_rental",
|
||||
"leisure/marina",
|
||||
"leisure/slipway",
|
||||
# Public art (statues, murals, village signs) formerly 93% of "Gallery".
|
||||
"tourism/artwork",
|
||||
# Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
|
||||
# "Gym & Fitness".
|
||||
"leisure/fitness_station",
|
||||
# Untyped healthcare rows and non-pharmacy health shops formerly bucketed
|
||||
# under "Hospital & Clinic" / "Pharmacy".
|
||||
"healthcare/yes",
|
||||
"healthcare/alternative",
|
||||
"shop/herbalist",
|
||||
"shop/health",
|
||||
# Street fountains and courthouses formerly bucketed as
|
||||
# "Tourist Attraction".
|
||||
"amenity/fountain",
|
||||
"amenity/courthouse",
|
||||
# Niche amenities not useful for home buyers
|
||||
"amenity/animal_boarding",
|
||||
"amenity/animal_breeding",
|
||||
|
|
@ -373,10 +395,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"leisure/tanning_salon",
|
||||
"shop/amusements",
|
||||
"tourism/theme_park",
|
||||
"amenity/bicycle_rental",
|
||||
"amenity/boat_rental",
|
||||
"leisure/marina",
|
||||
"leisure/slipway",
|
||||
# bicycle_rental/boat_rental/marina/slipway used to live here and
|
||||
# made up ~46% of the bucket (cycle-hire docks, boat ramps); they
|
||||
# are infrastructure, not entertainment venues — see DROP_CATEGORIES.
|
||||
"leisure/hackerspace",
|
||||
"leisure/yes",
|
||||
],
|
||||
|
|
@ -699,7 +720,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"🏋️",
|
||||
[
|
||||
"leisure/fitness_centre",
|
||||
"leisure/fitness_station",
|
||||
# leisure/fitness_station (outdoor pull-up bars / trim-trail
|
||||
# apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
|
||||
"amenity/dojo",
|
||||
"amenity/dancing_school",
|
||||
],
|
||||
|
|
@ -825,28 +847,37 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"amenity/pharmacy",
|
||||
"healthcare/pharmacy",
|
||||
"shop/chemist",
|
||||
"shop/herbalist",
|
||||
"shop/health",
|
||||
"healthcare/alternative",
|
||||
# healthcare/alternative, shop/herbalist and shop/health (homeopaths,
|
||||
# herbalists, generic "health" shops) are not dispensing pharmacies
|
||||
# — see DROP_CATEGORIES.
|
||||
],
|
||||
),
|
||||
# "Hospital & Clinic" used to be one bucket; an actual hospital and a small
|
||||
# clinic are very different amenities for a homebuyer, so they are split.
|
||||
(
|
||||
"Health",
|
||||
"Hospital",
|
||||
"🏥",
|
||||
[
|
||||
"amenity/hospital",
|
||||
"healthcare/hospital",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Health",
|
||||
"Hospital & Clinic",
|
||||
"🏥",
|
||||
"Clinic",
|
||||
"🩺",
|
||||
[
|
||||
"amenity/hospital",
|
||||
"amenity/clinic",
|
||||
"amenity/health_centre",
|
||||
"healthcare/blood_donation",
|
||||
"healthcare/hospital",
|
||||
"healthcare/centre",
|
||||
"healthcare/clinic",
|
||||
"office/healthcare",
|
||||
"healthcare/laboratory",
|
||||
"healthcare/rehabilitation",
|
||||
"healthcare/vaccination_centre",
|
||||
"healthcare/yes",
|
||||
# healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -917,7 +948,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"🖼️",
|
||||
[
|
||||
"tourism/gallery",
|
||||
"tourism/artwork",
|
||||
# tourism/artwork (statues, murals, village signs) was 93% of this
|
||||
# bucket and is not a visitable gallery — see DROP_CATEGORIES.
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -961,9 +993,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
[
|
||||
"tourism/attraction",
|
||||
"tourism/aquarium",
|
||||
"amenity/fountain",
|
||||
"amenity/courthouse",
|
||||
"tourism/chalet",
|
||||
# amenity/fountain (street furniture) and amenity/courthouse are
|
||||
# dropped; tourism/chalet (holiday lets) moved to "Hotel".
|
||||
],
|
||||
),
|
||||
# Note: schools come from the GIAS register (see transform_gias_schools).
|
||||
|
|
@ -982,6 +1013,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"leisure/resort",
|
||||
"tourism/holiday_park",
|
||||
"tourism/self_catering",
|
||||
# Holiday-let chalets are accommodation, not tourist attractions
|
||||
# (where they previously sat).
|
||||
"tourism/chalet",
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -1162,6 +1196,11 @@ REQUIRE_NAME_CATEGORIES = {
|
|||
"leisure/practice_pitch",
|
||||
"leisure/swimming_pool",
|
||||
"leisure/paddling_pool",
|
||||
# 83-84% unnamed: anonymous running tracks, private gallops/paddocks and
|
||||
# fishing spots; only named public facilities count as a Sports Centre.
|
||||
"leisure/track",
|
||||
"leisure/horse_riding",
|
||||
"leisure/fishing",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1181,6 +1220,7 @@ NAPTAN_EMOJIS: dict[str, str] = {
|
|||
"Bus station": "🚌",
|
||||
"Taxi rank": "🚕",
|
||||
"Tube station": "🚇",
|
||||
"Tram & Metro stop": "🚊",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1438,9 +1478,9 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
|||
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
|
||||
report-card framework) we fall back to "Ungraded inspection overall outcome"
|
||||
so genuinely good/outstanding schools aren't dropped — mirroring
|
||||
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
|
||||
school_catchments.classify_good_plus_schools. Remaining nulls drop out."""
|
||||
grade_col = pl.col("Latest OEIF overall effectiveness")
|
||||
# See school_proximity: the ungraded outcome carries "School remains Good"/
|
||||
# See school_catchments: the ungraded outcome carries "School remains Good"/
|
||||
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
|
||||
# suffixes) when the graded column is null/"Not judged".
|
||||
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue