try
This commit is contained in:
parent
843d14b7ba
commit
c938b71904
13 changed files with 698 additions and 109 deletions
|
|
@ -5,11 +5,72 @@ import polars as pl
|
|||
from pipeline.transform.transform_poi import (
|
||||
_load_ofsted_ratings,
|
||||
_school_icon_category_expr,
|
||||
osm_groceries_colocated_with_geolytix,
|
||||
transform,
|
||||
transform_grocery_retail_points,
|
||||
)
|
||||
|
||||
|
||||
def test_osm_groceries_colocated_with_geolytix_drops_only_brand_matched_duplicates():
    """Only a colocated AND brand-matched OSM grocery is reported for dropping.

    GEOLYTIX is treated as authoritative for its own chains, so an OSM grocery
    sitting on a GEOLYTIX store and carrying the same brand is the same
    physical shop. Two look-alike cases must be left untouched: an independent
    shop at the same spot, and a same-brand store far from every GEOLYTIX
    point.
    """
    geolytix_stores = pl.DataFrame(
        {"category": ["Tesco"], "lat": [51.5000], "lng": [-0.1000]}
    )

    # One tuple per OSM grocery: (id, name, lat, lng). The first two sit within
    # a couple of metres of the GEOLYTIX Tesco; the third is tens of kilometres
    # away, far outside the 50 m radius used below.
    osm_rows = [
        ("dup-brand", "Tesco Express", 51.50001, -0.10001),
        ("independent", "Bob's Corner Shop", 51.50002, -0.1000),
        ("far-brand", "Tesco Express", 52.0, -1.0),
    ]
    ids, names, lats, lngs = (list(column) for column in zip(*osm_rows))
    osm_groceries = pl.DataFrame(
        {"id": ids, "name": names, "lat": lats, "lng": lngs}
    )

    dropped = osm_groceries_colocated_with_geolytix(
        osm_groceries, geolytix_stores, radius_m=50.0
    )

    assert dropped == ["dup-brand"]
|
||||
|
||||
|
||||
def test_osm_groceries_colocated_with_geolytix_dedupes_cooperative_spelling():
    """A "Co-op" / "The Co-operative Food" pair is still seen as one store.

    The GEOLYTIX brand "Co-op" tokenises to "coop", while the common OSM
    spelling "The Co-operative Food" yields "cooperative". The alias is
    expected to fold the two spellings together so the genuine duplicate is
    dropped.
    """
    geolytix_stores = pl.DataFrame(
        {"category": ["Co-op"], "lat": [53.0], "lng": [-1.5]}
    )
    osm_groceries = pl.DataFrame(
        {
            "id": ["coop-dup"],
            "name": ["The Co-operative Food"],
            "lat": [53.00001],
            "lng": [-1.5],
        }
    )

    dropped = osm_groceries_colocated_with_geolytix(
        osm_groceries, geolytix_stores, radius_m=50.0
    )

    assert dropped == ["coop-dup"]
|
||||
|
||||
|
||||
def test_osm_groceries_colocated_with_geolytix_handles_empty_inputs():
    """An empty frame on either side yields an empty drop list.

    Both calls rely on the function's default radius.
    """
    # Case 1: there are GEOLYTIX stores but no OSM groceries to compare.
    populated_geolytix = pl.DataFrame(
        {"category": ["Tesco"], "lat": [51.5], "lng": [-0.1]}
    )
    osm_without_rows = pl.DataFrame(
        schema={
            "id": pl.Utf8,
            "name": pl.Utf8,
            "lat": pl.Float64,
            "lng": pl.Float64,
        }
    )
    dropped_when_osm_empty = osm_groceries_colocated_with_geolytix(
        osm_without_rows, populated_geolytix
    )
    assert dropped_when_osm_empty == []

    # Case 2: there is an OSM grocery but no GEOLYTIX stores to match against.
    populated_osm = pl.DataFrame(
        {"id": ["x"], "name": ["Tesco Express"], "lat": [51.5], "lng": [-0.1]}
    )
    geolytix_without_rows = pl.DataFrame(
        schema={"category": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
    )
    dropped_when_geolytix_empty = osm_groceries_colocated_with_geolytix(
        populated_osm, geolytix_without_rows
    )
    assert dropped_when_geolytix_empty == []
|
||||
|
||||
|
||||
def _write_boundary(tmp_path):
|
||||
"""A FeatureCollection whose single feature covers the London-area test
|
||||
coords used by the transform() fixtures, so in_england_mask keeps them."""
|
||||
|
|
@ -345,12 +406,7 @@ def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
|
|||
}
|
||||
).write_parquet(ofsted_path)
|
||||
|
||||
ratings = (
|
||||
_load_ofsted_ratings(ofsted_path)
|
||||
.collect()
|
||||
.sort("urn")
|
||||
.to_dicts()
|
||||
)
|
||||
ratings = _load_ofsted_ratings(ofsted_path).collect().sort("urn").to_dicts()
|
||||
|
||||
assert ratings == [
|
||||
{"urn": 1, "ofsted_rating": "Outstanding"},
|
||||
|
|
@ -384,9 +440,9 @@ def test_school_icon_category_handles_one_sided_age_ranges():
|
|||
},
|
||||
)
|
||||
|
||||
categories = df.select(
|
||||
_school_icon_category_expr().alias("category")
|
||||
)["category"].to_list()
|
||||
categories = df.select(_school_icon_category_expr().alias("category"))[
|
||||
"category"
|
||||
].to_list()
|
||||
|
||||
assert categories == [
|
||||
"Nursery school",
|
||||
|
|
@ -449,6 +505,45 @@ def test_osm_supermarkets_dropped(tmp_path):
|
|||
assert convenience.height == 1
|
||||
|
||||
|
||||
def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
    """Grocery de-duplication removes only the grocery row of a matched object.

    Per the original author's note, the _write_transform_inputs fixture seeds
    5 GEOLYTIX "Tesco" points at (51.52, -0.14). OSM object "n1" sits there
    with "Tesco" in its name, so its Convenience Store (Groceries) row
    duplicates the GEOLYTIX store and must disappear, while the Post Office
    row sharing the same OSM id must remain. "n2" is an independent shop away
    from the GEOLYTIX point and keeps its grocery row.
    """
    raw_osm = pl.DataFrame(
        {
            "id": ["n1", "n1", "n2"],
            "name": ["Tesco Express", "Tesco Express", "Corner Shop"],
            "category": [
                "shop/convenience",
                "amenity/post_office",
                "shop/convenience",
            ],
            "lat": [51.52, 51.52, 51.40],
            "lng": [-0.14, -0.14, -0.05],
        }
    )
    transform_kwargs = _write_transform_inputs(tmp_path, raw_osm)

    result = transform(**transform_kwargs).collect()

    is_n1 = pl.col("id") == "n1"
    is_n2 = pl.col("id") == "n2"

    # The colocated, brand-matched grocery row is gone.
    n1_grocery_rows = result.filter(is_n1 & (pl.col("group") == "Groceries"))
    assert n1_grocery_rows.height == 0

    # The same object's non-grocery aspect (Post Office) is still present.
    n1_post_office_rows = result.filter(
        is_n1 & (pl.col("category") == "Post Office")
    )
    assert n1_post_office_rows.height == 1

    # The unbranded, distant corner shop retains its grocery row.
    n2_grocery_rows = result.filter(
        is_n2 & (pl.col("category") == "Convenience Store")
    )
    assert n2_grocery_rows.height == 1
|
||||
|
||||
|
||||
def test_transform_output_unique_per_id_category(tmp_path):
|
||||
# Soundness: the full transform() output has at most one row per
|
||||
# (id, category) overall, across every source.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue