Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -544,6 +544,142 @@ def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
|
|||
assert n2_grocery.height == 1
|
||||
|
||||
|
||||
def test_transform_drops_miscategorised_tags(tmp_path):
    # Audit 2026-06-10 flagged these raw tags as polluting curated categories:
    #   - Entertainment: cycle-hire docks, slipways, marinas
    #   - Gallery: public artwork
    #   - Pharmacy: herbalists, alternative medicine
    #   - Hospital & Clinic: untyped healthcare/yes
    #   - Tourist Attraction: fountains, courthouses
    #   - Gym & Fitness: outdoor apparatus
    # None of them may appear in the transform output.
    dropped_tags = [
        "amenity/bicycle_rental",
        "amenity/boat_rental",
        "leisure/marina",
        "leisure/slipway",
        "tourism/artwork",
        "healthcare/yes",
        "healthcare/alternative",
        "shop/herbalist",
        "shop/health",
        "amenity/fountain",
        "amenity/courthouse",
        "leisure/fitness_station",
    ]
    count = len(dropped_tags)
    # One synthetic POI per dropped tag, all at the same coordinates.
    poi_ids = [f"n{idx}" for idx in range(count)]
    raw = pl.DataFrame(
        {
            "id": poi_ids,
            "name": [f"POI {idx}" for idx in range(count)],
            "category": dropped_tags,
            "lat": [51.50] * count,
            "lng": [-0.10] * count,
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    # Every input id must be absent from the output.
    survivors = out.filter(pl.col("id").is_in(poi_ids))
    assert survivors.height == 0
|
||||
|
||||
|
||||
def test_transform_splits_hospital_and_clinic(tmp_path):
    # Hospitals and clinics are expected as two separate output categories;
    # the combined "Hospital & Clinic" label must not be emitted at all.
    expected_by_id = {
        "n1": "Hospital",
        "n2": "Clinic",
        "n3": "Clinic",
    }
    raw = pl.DataFrame(
        {
            "id": ["n1", "n2", "n3"],
            "name": ["St Thomas'", "Vale Surgery Annexe", "Drop-in Centre"],
            "category": [
                "amenity/hospital",
                "amenity/clinic",
                "healthcare/clinic",
            ],
            "lat": [51.50, 51.51, 51.52],
            "lng": [-0.10, -0.11, -0.12],
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    # Each POI must map to exactly one row carrying the expected category.
    for poi_id, category in expected_by_id.items():
        assert out.filter(pl.col("id") == poi_id)["category"].to_list() == [category]
    assert "Hospital & Clinic" not in out["category"].to_list()
|
||||
|
||||
|
||||
def test_transform_maps_chalet_to_hotel(tmp_path):
    # A holiday-let chalet is accommodation, so tourism/chalet must come out
    # as Hotel rather than Tourist Attraction.
    chalet = {
        "id": ["n1"],
        "name": ["Seaview Chalet"],
        "category": ["tourism/chalet"],
        "lat": [51.50],
        "lng": [-0.10],
    }
    raw = pl.DataFrame(chalet)
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    chalet_rows = out.filter(pl.col("id") == "n1")
    assert chalet_rows["category"].to_list() == ["Hotel"]
|
||||
|
||||
|
||||
def test_transform_name_gates_track_horse_riding_fishing(tmp_path):
    # leisure/track, leisure/horse_riding and leisure/fishing are 83-84%
    # unnamed (anonymous tracks, gallops, fishing spots). Rows with a null or
    # empty name must be dropped; named facilities are kept as Sports Centre.
    raw = pl.DataFrame(
        {
            "id": ["n1", "n2", "n3", "n4"],
            "name": [None, "", "Herne Hill Velodrome", "Royal Mews Riding School"],
            "category": [
                "leisure/track",
                "leisure/fishing",
                "leisure/track",
                "leisure/horse_riding",
            ],
            "lat": [51.50, 51.51, 51.52, 51.53],
            "lng": [-0.10, -0.11, -0.12, -0.13],
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    # n1 (null name) and n2 (empty name) must not survive.
    unnamed = out.filter(pl.col("id").is_in(["n1", "n2"]))
    assert unnamed.height == 0

    # n3 and n4 carry real names and must both be categorised.
    named = out.filter(pl.col("id").is_in(["n3", "n4"]))
    assert named["category"].to_list() == ["Sports Centre", "Sports Centre"]
|
||||
|
||||
|
||||
def test_transform_passes_through_tram_metro_naptan_category(tmp_path):
    # NaPTAN now emits a "Tram & Metro stop" category (non-LU TMU/MET
    # networks). It must reach the output unchanged, grouped under Public
    # Transport and carrying its own emoji.
    raw = pl.DataFrame(
        {
            "id": ["n1"],
            "name": ["A Cafe"],
            "category": ["amenity/cafe"],
            "lat": [51.50],
            "lng": [-0.10],
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    # Overwrite the NaPTAN input with one rail station and one tram/metro stop.
    naptan = pl.DataFrame(
        {
            "id": ["naptan-1", "naptan-2"],
            "name": ["Test Rail Station", "Weaste"],
            "category": ["Rail station", "Tram & Metro stop"],
            "lat": [51.51, 51.52],
            "lng": [-0.13, -0.14],
        }
    )
    naptan.write_parquet(inputs["naptan_path"])

    out = transform(**inputs).collect()

    tram_rows = out.filter(pl.col("category") == "Tram & Metro stop")
    assert tram_rows.height == 1
    assert tram_rows["group"].to_list() == ["Public Transport"]
    assert tram_rows["emoji"].to_list() == ["🚊"]
|
||||
|
||||
|
||||
def test_transform_output_unique_per_id_category(tmp_path):
|
||||
# Soundness: the full transform() output has at most one row per
|
||||
# (id, category) overall, across every source.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue