try
Some checks failed
CI / Check (push) Failing after 3m22s
Build and publish Docker image / build-and-push (push) Successful in 7m25s

This commit is contained in:
Andras Schmelczer 2026-06-04 22:34:26 +01:00
parent 843d14b7ba
commit c938b71904
13 changed files with 698 additions and 109 deletions

View file

@ -5,11 +5,72 @@ import polars as pl
from pipeline.transform.transform_poi import (
_load_ofsted_ratings,
_school_icon_category_expr,
osm_groceries_colocated_with_geolytix,
transform,
transform_grocery_retail_points,
)
def test_osm_groceries_colocated_with_geolytix_drops_only_brand_matched_duplicates():
    """Only a colocated AND brand-matched OSM grocery is reported as a duplicate.

    GEOLYTIX is treated as authoritative for its own chains. Three OSM rows
    are checked against a single GEOLYTIX "Tesco" point:

    * ``dup-brand``   - a metre or so away and named after the brand: the
      same physical store, so its id must be returned.
    * ``independent`` - a couple of metres away but an unrelated shop: kept.
    * ``far-brand``   - same brand name but tens of kilometres away, well
      outside the 50 m radius: kept.
    """
    geolytix_points = pl.DataFrame(
        {"category": ["Tesco"], "lat": [51.5000], "lng": [-0.1000]}
    )

    # Columns are listed row-aligned: dup-brand, independent, far-brand.
    osm_points = pl.DataFrame(
        {
            "id": ["dup-brand", "independent", "far-brand"],
            "name": ["Tesco Express", "Bob's Corner Shop", "Tesco Express"],
            "lat": [51.50001, 51.50002, 52.0],
            "lng": [-0.10001, -0.1000, -1.0],
        }
    )

    expected_drops = ["dup-brand"]
    assert (
        osm_groceries_colocated_with_geolytix(
            osm_points, geolytix_points, radius_m=50.0
        )
        == expected_drops
    )
def test_osm_groceries_colocated_with_geolytix_dedupes_cooperative_spelling():
    """A "Co-op" GEOLYTIX point still matches OSM's "The Co-operative Food".

    The GEOLYTIX brand "Co-op" tokenises to "coop", whereas the common OSM
    spelling yields "cooperative". The brand alias is expected to fold the
    two together so the genuine duplicate is still dropped.
    """
    geolytix_points = pl.DataFrame(
        {
            "category": ["Co-op"],
            "lat": [53.0],
            "lng": [-1.5],
        }
    )
    osm_points = pl.DataFrame(
        {
            "id": ["coop-dup"],
            "name": ["The Co-operative Food"],
            "lat": [53.00001],
            "lng": [-1.5],
        }
    )

    dropped = osm_groceries_colocated_with_geolytix(
        osm_points, geolytix_points, radius_m=50.0
    )

    assert dropped == ["coop-dup"]
def test_osm_groceries_colocated_with_geolytix_handles_empty_inputs():
    """An empty frame on either side yields an empty list of ids to drop."""
    # Case 1: no OSM rows at all, one GEOLYTIX point.
    populated_geolytix = pl.DataFrame(
        {"category": ["Tesco"], "lat": [51.5], "lng": [-0.1]}
    )
    osm_schema = {
        "id": pl.Utf8,
        "name": pl.Utf8,
        "lat": pl.Float64,
        "lng": pl.Float64,
    }
    no_osm_rows = pl.DataFrame(schema=osm_schema)
    assert osm_groceries_colocated_with_geolytix(no_osm_rows, populated_geolytix) == []

    # Case 2: one OSM row that would otherwise match, but no GEOLYTIX points.
    populated_osm = pl.DataFrame(
        {"id": ["x"], "name": ["Tesco Express"], "lat": [51.5], "lng": [-0.1]}
    )
    geolytix_schema = {
        "category": pl.Utf8,
        "lat": pl.Float64,
        "lng": pl.Float64,
    }
    no_geolytix_rows = pl.DataFrame(schema=geolytix_schema)
    assert osm_groceries_colocated_with_geolytix(populated_osm, no_geolytix_rows) == []
def _write_boundary(tmp_path):
"""A FeatureCollection whose single feature covers the London-area test
coords used by the transform() fixtures, so in_england_mask keeps them."""
@ -345,12 +406,7 @@ def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
}
).write_parquet(ofsted_path)
ratings = (
_load_ofsted_ratings(ofsted_path)
.collect()
.sort("urn")
.to_dicts()
)
ratings = _load_ofsted_ratings(ofsted_path).collect().sort("urn").to_dicts()
assert ratings == [
{"urn": 1, "ofsted_rating": "Outstanding"},
@ -384,9 +440,9 @@ def test_school_icon_category_handles_one_sided_age_ranges():
},
)
categories = df.select(
_school_icon_category_expr().alias("category")
)["category"].to_list()
categories = df.select(_school_icon_category_expr().alias("category"))[
"category"
].to_list()
assert categories == [
"Nursery school",
@ -449,6 +505,45 @@ def test_osm_supermarkets_dropped(tmp_path):
assert convenience.height == 1
def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
    """Grocery de-duplication removes only the grocery row of a matched OSM id.

    Per the fixture's contract, _write_transform_inputs seeds GEOLYTIX "Tesco"
    points at (51.52, -0.14). OSM object ``n1`` sits on that spot with "Tesco"
    in its name, so its Convenience Store (Groceries) row duplicates the
    GEOLYTIX store and must be dropped. ``n1`` also has a Post Office aspect
    under the same id, which is not a grocery and must survive. ``n2`` is an
    independent shop away from the GEOLYTIX point and keeps its grocery row.
    """
    osm_rows = pl.DataFrame(
        {
            "id": ["n1", "n1", "n2"],
            "name": ["Tesco Express", "Tesco Express", "Corner Shop"],
            "category": [
                "shop/convenience",
                "amenity/post_office",
                "shop/convenience",
            ],
            "lat": [51.52, 51.52, 51.40],
            "lng": [-0.14, -0.14, -0.05],
        }
    )
    transform_kwargs = _write_transform_inputs(tmp_path, osm_rows)
    result = transform(**transform_kwargs).collect()

    is_n1 = pl.col("id") == "n1"
    is_n2 = pl.col("id") == "n2"

    # The colocated, brand-matched grocery row is gone.
    assert result.filter(is_n1 & (pl.col("group") == "Groceries")).height == 0

    # The Post Office aspect sharing that OSM id is untouched.
    assert result.filter(is_n1 & (pl.col("category") == "Post Office")).height == 1

    # The unbranded, far-away corner shop still has its grocery row.
    assert (
        result.filter(is_n2 & (pl.col("category") == "Convenience Store")).height == 1
    )
def test_transform_output_unique_per_id_category(tmp_path):
# Soundness: the full transform() output has at most one row per
# (id, category) overall, across every source.