This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -1,12 +1,115 @@
import json
import polars as pl
from pipeline.transform.transform_poi import (
_load_ofsted_ratings,
_school_icon_category_expr,
transform,
transform_grocery_retail_points,
)
def _write_boundary(tmp_path):
    """Write a single-feature boundary GeoJSON under ``tmp_path`` and return its path.

    The one polygon is a box spanning -1.0..1.0 on the first coordinate axis
    and 51.0..52.0 on the second, which covers the London-area test coords
    used by the transform() fixtures so that in_england_mask keeps them.
    """
    low_x, high_x = -1.0, 1.0
    low_y, high_y = 51.0, 52.0
    # Closed ring: the first vertex is repeated as the last one.
    ring = [
        [low_x, low_y],
        [high_x, low_y],
        [high_x, high_y],
        [low_x, high_y],
        [low_x, low_y],
    ]
    feature = {
        "type": "Feature",
        "properties": {},
        "geometry": {"type": "Polygon", "coordinates": [ring]},
    }
    collection = {"type": "FeatureCollection", "features": [feature]}

    target = tmp_path / "england.geojson"
    target.write_text(json.dumps(collection))
    return target
def _write_transform_inputs(tmp_path, raw_pois: pl.DataFrame):
    """Write every input file transform() requires and return its keyword args.

    ``raw_pois`` (the raw OSM POIs frame under test) is written as-is to
    ``pois.parquet``. The NaPTAN, grocery, GIAS and Ofsted parquet files are
    minimal but valid fixtures: one rail station, five Tesco stores, one
    primary school and that school's Ofsted row. The boundary GeoJSON comes
    from ``_write_boundary``.

    Returns a dict keyed by transform()'s keyword-argument names, so callers
    can splat it: ``transform(**inputs)``.
    """
    paths = {
        "input_path": tmp_path / "pois.parquet",
        "naptan_path": tmp_path / "naptan.parquet",
        "boundary_path": _write_boundary(tmp_path),
        "grocery_retail_points_path": tmp_path / "grocery.parquet",
        "gias_path": tmp_path / "gias.parquet",
        "ofsted_path": tmp_path / "ofsted.parquet",
    }

    # Raw OSM POIs: exactly what the calling test supplied.
    raw_pois.write_parquet(paths["input_path"])

    # NaPTAN: a single rail station.
    station = {
        "id": "naptan-1",
        "name": "Test Rail Station",
        "category": "Rail station",
        "lat": 51.51,
        "lng": -0.13,
    }
    naptan = pl.DataFrame({column: [value] for column, value in station.items()})
    naptan.write_parquet(paths["naptan_path"])

    # Grocery retail points: five Tesco stores sharing one location.
    store_count = 5
    store_ids = list(range(1, store_count + 1))
    grocery = pl.DataFrame(
        {
            "id": store_ids,
            "retailer": ["Tesco"] * store_count,
            "fascia": ["Tesco"] * store_count,
            "store_name": [f"Tesco Test {i}" for i in store_ids],
            "long_wgs": [-0.14] * store_count,
            "lat_wgs": [51.52] * store_count,
        }
    )
    grocery.write_parquet(paths["grocery_retail_points_path"])

    # GIAS: one school record, expanded into single-element columns below.
    school = {
        "urn": 1001,
        "name": "Test Primary School",
        "phase": "Primary",
        "type": "Community school",
        "type_group": "Local authority maintained schools",
        # NOTE(review): "411" may be a garbled "4-11" (dash lost?) — confirm
        # against the real fixture before changing; kept verbatim here.
        "age_range": "411",
        "gender": "Mixed",
        "religious_character": None,
        "admissions_policy": "Comprehensive",
        "nursery_provision": "No",
        "sixth_form": "No",
        "capacity": 200,
        "pupils": 180,
        "fsm_percent": 12.5,
        "trust": None,
        "address": "1 Test Street",
        "postcode": "E1 1AA",
        "local_authority": "Test LA",
        "website": None,
        "telephone": "02012345678",
        "head_name": "Jane Doe",
        "lat": 51.53,
        "lng": -0.12,
    }
    gias = pl.DataFrame({column: [value] for column, value in school.items()})
    gias.write_parquet(paths["gias_path"])

    # Ofsted: one row whose URN matches the GIAS school above.
    inspection = {
        "URN": 1001,
        "Latest OEIF overall effectiveness": "2",
        "Ungraded inspection overall outcome": None,
    }
    ofsted = pl.DataFrame({column: [value] for column, value in inspection.items()})
    ofsted.write_parquet(paths["ofsted_path"])

    return paths
def test_transform_grocery_retail_points_outputs_chain_categories():
raw = pl.DataFrame(
{
@ -292,3 +395,79 @@ def test_school_icon_category_handles_one_sided_age_ranges():
"Primary school",
"School",
]
def test_transform_dedupes_multi_tag_pois(tmp_path):
    """A multi-tagged OSM object yields one output row per friendly category.

    One OSM object can carry several tag keys that map to the SAME friendly
    category, in which case pois.py emits one raw row per key with the SAME
    id. Here "amenity/pharmacy" and "shop/chemist" both map to "Pharmacy".
    """
    poi_id = "n42"
    raw = pl.DataFrame(
        {
            "id": [poi_id] * 2,
            "name": ["Boots"] * 2,
            "category": ["amenity/pharmacy", "shop/chemist"],
            "lat": [51.50] * 2,
            "lng": [-0.10] * 2,
        }
    )

    transform_kwargs = _write_transform_inputs(tmp_path, raw)
    result = transform(**transform_kwargs).collect()

    # No (id, category) pair may occur more than once in the output.
    pair_counts = result.group_by("id", "category").len()
    assert pair_counts["len"].max() == 1

    # The one physical pharmacy shows up exactly once.
    is_same_object = pl.col("id") == poi_id
    is_pharmacy = pl.col("category") == "Pharmacy"
    assert result.filter(is_same_object & is_pharmacy).height == 1
def test_osm_supermarkets_dropped(tmp_path):
    """OSM supermarkets are dropped while other OSM grocery categories remain.

    GEOLYTIX is authoritative for supermarkets, so an OSM "shop/supermarket"
    row must not come through as a second Groceries/Supermarket pin. A
    complementary grocery category (Convenience Store) must still survive.
    """
    rows = [
        ("n1", "Some Supermarket", "shop/supermarket", 51.50, -0.10),
        ("n2", "Corner Shop", "shop/convenience", 51.51, -0.11),
    ]
    ids, names, categories, lats, lngs = (list(column) for column in zip(*rows))
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": categories,
            "lat": lats,
            "lng": lngs,
        }
    )

    result = transform(**_write_transform_inputs(tmp_path, raw)).collect()

    # Nothing may be labelled Groceries/Supermarket in the output.
    in_groceries = pl.col("group") == "Groceries"
    is_supermarket = pl.col("category") == "Supermarket"
    assert result.filter(in_groceries & is_supermarket).height == 0

    # The complementary OSM grocery category is kept, exactly once.
    is_convenience = pl.col("category") == "Convenience Store"
    assert result.filter(is_convenience).height == 1
def test_transform_output_unique_per_id_category(tmp_path):
    """Soundness check: transform() output is unique on (id, category).

    Across every source feeding the full transform() output, there must be at
    most one row per (id, category) pair.
    """
    rows = [
        ("n42", "Boots", "amenity/pharmacy", 51.50, -0.10),
        ("n42", "Boots", "shop/chemist", 51.50, -0.10),
        ("n7", "St Mary's", "amenity/place_of_worship", 51.55, -0.15),
        ("n8", "St Mary's", "building/church", 51.55, -0.15),
    ]
    ids, names, categories, lats, lngs = (list(column) for column in zip(*rows))
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": categories,
            "lat": lats,
            "lng": lngs,
        }
    )

    transform_kwargs = _write_transform_inputs(tmp_path, raw)
    result = transform(**transform_kwargs).collect()

    # The largest group size over (id, category) must be exactly one.
    pair_counts = result.group_by("id", "category").len()
    assert pair_counts["len"].max() == 1