idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -1,12 +1,115 @@
|
|||
import json
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.transform_poi import (
|
||||
_load_ofsted_ratings,
|
||||
_school_icon_category_expr,
|
||||
transform,
|
||||
transform_grocery_retail_points,
|
||||
)
|
||||
|
||||
|
||||
def _write_boundary(tmp_path):
    """Write a one-feature England boundary GeoJSON and return its path.

    The FeatureCollection holds a single rectangular polygon spanning
    longitude -1.0..1.0 and latitude 51.0..52.0. That rectangle covers the
    London-area test coords used by the transform() fixtures, so
    in_england_mask keeps them.

    Args:
        tmp_path: Directory (a path object supporting ``/``, e.g. pytest's
            ``tmp_path``) in which ``england.geojson`` is created.

    Returns:
        The path of the written ``england.geojson`` file.
    """
    boundary_path = tmp_path / "england.geojson"
    # Closed ring in [lng, lat] order: the last vertex repeats the first.
    coords = [[-1.0, 51.0], [1.0, 51.0], [1.0, 52.0], [-1.0, 52.0], [-1.0, 51.0]]
    feature_collection = {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "properties": {},
                "geometry": {"type": "Polygon", "coordinates": [coords]},
            }
        ],
    }
    # Explicit encoding so the bytes on disk never depend on the platform's
    # default locale encoding.
    boundary_path.write_text(json.dumps(feature_collection), encoding="utf-8")
    return boundary_path
|
||||
|
||||
|
||||
def _write_transform_inputs(tmp_path, raw_pois: pl.DataFrame):
    """Write every input file transform() needs and return them as kwargs.

    ``raw_pois`` (the raw OSM POIs frame under test) is written as-is. The
    NaPTAN, grocery, GIAS and Ofsted parquet files are minimal but valid
    fixtures, and the boundary GeoJSON comes from ``_write_boundary``.

    Args:
        tmp_path: Directory in which all fixture files are created.
        raw_pois: Raw OSM POIs frame to write to ``pois.parquet``.

    Returns:
        A dict mapping keyword names (``input_path``, ``naptan_path``,
        ``boundary_path``, ``grocery_retail_points_path``, ``gias_path``,
        ``ofsted_path``) to the written file paths; the tests splat it into
        ``transform(**inputs)``.
    """
    pois_file = tmp_path / "pois.parquet"
    raw_pois.write_parquet(pois_file)

    # A single NaPTAN rail station.
    naptan_file = tmp_path / "naptan.parquet"
    naptan_frame = pl.DataFrame(
        {
            "id": ["naptan-1"],
            "name": ["Test Rail Station"],
            "category": ["Rail station"],
            "lat": [51.51],
            "lng": [-0.13],
        }
    )
    naptan_frame.write_parquet(naptan_file)

    # Five Tesco stores (ids 1..5) that all share one coordinate.
    store_ids = list(range(1, 6))
    store_count = len(store_ids)
    grocery_file = tmp_path / "grocery.parquet"
    grocery_frame = pl.DataFrame(
        {
            "id": store_ids,
            "retailer": ["Tesco"] * store_count,
            "fascia": ["Tesco"] * store_count,
            "store_name": [f"Tesco Test {store_id}" for store_id in store_ids],
            "long_wgs": [-0.14] * store_count,
            "lat_wgs": [51.52] * store_count,
        }
    )
    grocery_frame.write_parquet(grocery_file)

    # One GIAS school record, URN 1001.
    gias_file = tmp_path / "gias.parquet"
    gias_frame = pl.DataFrame(
        {
            "urn": [1001],
            "name": ["Test Primary School"],
            "phase": ["Primary"],
            "type": ["Community school"],
            "type_group": ["Local authority maintained schools"],
            "age_range": ["4–11"],
            "gender": ["Mixed"],
            "religious_character": [None],
            "admissions_policy": ["Comprehensive"],
            "nursery_provision": ["No"],
            "sixth_form": ["No"],
            "capacity": [200],
            "pupils": [180],
            "fsm_percent": [12.5],
            "trust": [None],
            "address": ["1 Test Street"],
            "postcode": ["E1 1AA"],
            "local_authority": ["Test LA"],
            "website": [None],
            "telephone": ["02012345678"],
            "head_name": ["Jane Doe"],
            "lat": [51.53],
            "lng": [-0.12],
        }
    )
    gias_frame.write_parquet(gias_file)

    # One Ofsted row carrying the same URN (1001) as the GIAS school above.
    ofsted_file = tmp_path / "ofsted.parquet"
    ofsted_frame = pl.DataFrame(
        {
            "URN": [1001],
            "Latest OEIF overall effectiveness": ["2"],
            "Ungraded inspection overall outcome": [None],
        }
    )
    ofsted_frame.write_parquet(ofsted_file)

    boundary_file = _write_boundary(tmp_path)

    return dict(
        input_path=pois_file,
        naptan_path=naptan_file,
        boundary_path=boundary_file,
        grocery_retail_points_path=grocery_file,
        gias_path=gias_file,
        ofsted_path=ofsted_file,
    )
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_outputs_chain_categories():
|
||||
raw = pl.DataFrame(
|
||||
{
|
||||
|
|
@ -292,3 +395,79 @@ def test_school_icon_category_handles_one_sided_age_ranges():
|
|||
"Primary school",
|
||||
"School",
|
||||
]
|
||||
|
||||
|
||||
def test_transform_dedupes_multi_tag_pois(tmp_path):
    # A single OSM object may carry several tag keys that resolve to the SAME
    # friendly category; pois.py then emits one raw row per key, all sharing
    # the SAME id. Here "amenity/pharmacy" and "shop/chemist" both map to
    # "Pharmacy".
    duplicated_pharmacy = pl.DataFrame(
        {
            "id": ["n42", "n42"],
            "name": ["Boots", "Boots"],
            "category": ["amenity/pharmacy", "shop/chemist"],
            "lat": [51.50, 51.50],
            "lng": [-0.10, -0.10],
        }
    )
    kwargs = _write_transform_inputs(tmp_path, duplicated_pharmacy)

    result = transform(**kwargs).collect()

    # Every (id, category) pair occurs at most once.
    pair_counts = result.group_by("id", "category").len()
    assert pair_counts["len"].max() == 1

    # The one physical pharmacy shows up exactly once.
    is_n42 = pl.col("id") == "n42"
    is_pharmacy = pl.col("category") == "Pharmacy"
    assert result.filter(is_n42 & is_pharmacy).height == 1
|
||||
|
||||
|
||||
def test_osm_supermarkets_dropped(tmp_path):
    # GEOLYTIX is the authoritative supermarket source, so an OSM
    # "shop/supermarket" row must not come through as a second
    # Groceries/Supermarket pin. A complementary grocery category
    # (Convenience Store) must still survive.
    osm_rows = pl.DataFrame(
        {
            "id": ["n1", "n2"],
            "name": ["Some Supermarket", "Corner Shop"],
            "category": ["shop/supermarket", "shop/convenience"],
            "lat": [51.50, 51.51],
            "lng": [-0.10, -0.11],
        }
    )
    kwargs = _write_transform_inputs(tmp_path, osm_rows)

    result = transform(**kwargs).collect()

    in_groceries_group = pl.col("group") == "Groceries"
    is_supermarket = pl.col("category") == "Supermarket"
    assert result.filter(in_groceries_group & is_supermarket).height == 0

    # The complementary OSM grocery category is kept.
    is_convenience = pl.col("category") == "Convenience Store"
    assert result.filter(is_convenience).height == 1
|
||||
|
||||
|
||||
def test_transform_output_unique_per_id_category(tmp_path):
    # Soundness check: across every source, the complete transform() output
    # contains at most one row for any (id, category) pair.
    osm_rows = pl.DataFrame(
        {
            "id": ["n42", "n42", "n7", "n8"],
            "name": ["Boots", "Boots", "St Mary's", "St Mary's"],
            "category": [
                "amenity/pharmacy",
                "shop/chemist",
                "amenity/place_of_worship",
                "building/church",
            ],
            "lat": [51.50, 51.50, 51.55, 51.55],
            "lng": [-0.10, -0.10, -0.15, -0.15],
        }
    )
    kwargs = _write_transform_inputs(tmp_path, osm_rows)

    result = transform(**kwargs).collect()

    pair_counts = result.group_by("id", "category").len()
    assert pair_counts["len"].max() == 1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue