Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -544,6 +544,142 @@ def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
assert n2_grocery.height == 1
def test_transform_drops_miscategorised_tags(tmp_path):
    # Per the 2026-06-10 audit, none of the tags below may reach the output.
    # They had been polluting:
    #   - Entertainment: cycle-hire docks, slipways, marinas
    #   - Gallery: public artwork
    #   - Pharmacy: herbalists, alternative medicine
    #   - Hospital & Clinic: untyped healthcare/yes
    #   - Tourist Attraction: fountains, courthouses
    #   - Gym & Fitness: outdoor apparatus
    dropped = [
        "amenity/bicycle_rental",
        "amenity/boat_rental",
        "leisure/marina",
        "leisure/slipway",
        "tourism/artwork",
        "healthcare/yes",
        "healthcare/alternative",
        "shop/herbalist",
        "shop/health",
        "amenity/fountain",
        "amenity/courthouse",
        "leisure/fitness_station",
    ]
    count = len(dropped)

    # One synthetic POI per tag, all at the same coordinates.
    raw = pl.DataFrame(
        {
            "id": [f"n{i}" for i in range(count)],
            "name": [f"POI {i}" for i in range(count)],
            "category": dropped,
            "lat": [51.50] * count,
            "lng": [-0.10] * count,
        }
    )

    inputs = _write_transform_inputs(tmp_path, raw)
    out = transform(**inputs).collect()

    # No input id may survive, whatever category it would have been given.
    input_ids = raw["id"].to_list()
    survivors = out.filter(pl.col("id").is_in(input_ids))
    assert survivors.height == 0
def test_transform_splits_hospital_and_clinic(tmp_path):
    # amenity/hospital must come out as "Hospital", while amenity/clinic and
    # healthcare/clinic must both come out as "Clinic"; the combined
    # "Hospital & Clinic" label must not appear anywhere in the output.
    fixtures = [
        ("n1", "St Thomas'", "amenity/hospital", 51.50, -0.10),
        ("n2", "Vale Surgery Annexe", "amenity/clinic", 51.51, -0.11),
        ("n3", "Drop-in Centre", "healthcare/clinic", 51.52, -0.12),
    ]
    # Transpose the row-wise fixtures into the column lists polars expects.
    ids, names, tags, lats, lngs = (list(column) for column in zip(*fixtures))
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": tags,
            "lat": lats,
            "lng": lngs,
        }
    )

    inputs = _write_transform_inputs(tmp_path, raw)
    out = transform(**inputs).collect()

    expected_by_id = {"n1": "Hospital", "n2": "Clinic", "n3": "Clinic"}
    for poi_id, expected_category in expected_by_id.items():
        categories = out.filter(pl.col("id") == poi_id)["category"].to_list()
        assert categories == [expected_category]

    all_categories = out["category"].to_list()
    assert "Hospital & Clinic" not in all_categories
def test_transform_maps_chalet_to_hotel(tmp_path):
    # A holiday-let chalet is accommodation, so tourism/chalet has to land in
    # Hotel rather than Tourist Attraction.
    chalet = pl.DataFrame(
        {
            "id": ["n1"],
            "name": ["Seaview Chalet"],
            "category": ["tourism/chalet"],
            "lat": [51.50],
            "lng": [-0.10],
        }
    )

    transform_inputs = _write_transform_inputs(tmp_path, chalet)
    result = transform(**transform_inputs).collect()

    chalet_rows = result.filter(pl.col("id") == "n1")
    assert chalet_rows["category"].to_list() == ["Hotel"]
def test_transform_name_gates_track_horse_riding_fishing(tmp_path):
    # 83-84% of leisure/track, leisure/horse_riding and leisure/fishing
    # features carry no name (anonymous tracks, gallops, fishing spots).
    # Only named public facilities are kept, as a Sports Centre; a null or
    # empty name means the row is dropped.
    fixtures = [
        ("n1", None, "leisure/track", 51.50, -0.10),
        ("n2", "", "leisure/fishing", 51.51, -0.11),
        ("n3", "Herne Hill Velodrome", "leisure/track", 51.52, -0.12),
        ("n4", "Royal Mews Riding School", "leisure/horse_riding", 51.53, -0.13),
    ]
    # Transpose the row-wise fixtures into the column lists polars expects.
    ids, names, tags, lats, lngs = (list(column) for column in zip(*fixtures))
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": tags,
            "lat": lats,
            "lng": lngs,
        }
    )

    inputs = _write_transform_inputs(tmp_path, raw)
    out = transform(**inputs).collect()

    # n1 (null name) and n2 (empty name) must be gone.
    unnamed = out.filter(pl.col("id").is_in(["n1", "n2"]))
    assert unnamed.height == 0

    # n3 and n4 are named, so each survives as a Sports Centre.
    named = out.filter(pl.col("id").is_in(["n3", "n4"]))
    named_categories = named["category"].to_list()
    assert named_categories == ["Sports Centre", "Sports Centre"]
def test_transform_passes_through_tram_metro_naptan_category(tmp_path):
    # NaPTAN now produces a "Tram & Metro stop" category (non-LU TMU/MET
    # networks). It has to survive the transform unchanged, be grouped under
    # Public Transport, and carry its own emoji.
    placeholder_poi = pl.DataFrame(
        {
            "id": ["n1"],
            "name": ["A Cafe"],
            "category": ["amenity/cafe"],
            "lat": [51.50],
            "lng": [-0.10],
        }
    )
    inputs = _write_transform_inputs(tmp_path, placeholder_poi)

    # Write the NaPTAN fixture to the path that is handed to transform():
    # one rail station alongside the tram/metro stop under test.
    naptan_stops = pl.DataFrame(
        {
            "id": ["naptan-1", "naptan-2"],
            "name": ["Test Rail Station", "Weaste"],
            "category": ["Rail station", "Tram & Metro stop"],
            "lat": [51.51, 51.52],
            "lng": [-0.13, -0.14],
        }
    )
    naptan_stops.write_parquet(inputs["naptan_path"])

    out = transform(**inputs).collect()

    is_tram_metro = pl.col("category") == "Tram & Metro stop"
    tram = out.filter(is_tram_metro)
    assert tram.height == 1
    assert tram["group"].to_list() == ["Public Transport"]
    assert tram["emoji"].to_list() == ["🚊"]
def test_transform_output_unique_per_id_category(tmp_path):
# Soundness: the full transform() output has at most one row per
# (id, category) overall, across every source.