try
Some checks failed
CI / Check (push) Failing after 3m22s
Build and publish Docker image / build-and-push (push) Successful in 7m25s

This commit is contained in:
Andras Schmelczer 2026-06-04 22:34:26 +01:00
parent 843d14b7ba
commit c938b71904
13 changed files with 698 additions and 109 deletions

View file

@ -21,7 +21,15 @@ from ..utils import (
pl.Config.set_tbl_cols(-1)
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
MIN_PRICE = 50_000
# Value-quality floor for price aggregations. A flat nominal floor is a blunt
# tool against a deflating threshold — £50k was completely normal for a 1990s
# house, so a 50k floor wrongly discarded ~a third of legitimate 1990s
# open-market sales (and deleted properties whose only sales were old/cheap),
# biasing early-year price history upward. 10k recovers the large [10k,50k)
# band of genuine cheaper sales while still excluding the nominal/junk transfers
# (£1 etc.). A small tail of real sub-10k sales is still dropped — a deliberate
# conservative tradeoff to keep clearly-implausible transfers out.
MIN_PRICE = 10_000
# Plausible construction-year range; band-derived years outside it (e.g. OCR
# noise like 1012 or 2202) are nulled rather than published.

View file

@ -30,6 +30,7 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10
CONSERVATION_AREA_FEATURE = "Within conservation area"
TREE_DENSITY_FEATURE = "Street tree density percentile"
LISTED_BUILDING_FEATURE = "Listed building"
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
@ -92,6 +93,10 @@ _AREA_COLUMNS = [
"Noise (dB)",
"Max available download speed (Mbps)",
CONSERVATION_AREA_FEATURE,
# Tree canopy is a 50m-radius percentile around the postcode centroid, so it
# is postcode-grain: it belongs in the area output (one value per postcode,
# covering property-less postcodes too) rather than duplicated per property.
TREE_DENSITY_FEATURE,
# Schools
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
@ -116,7 +121,6 @@ _AREA_COLUMNS = [
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
TREE_DENSITY_FEATURE = "Street tree density percentile"
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
r"^Tree canopy density percentile within \d+m$"
)
@ -818,9 +822,9 @@ def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
untouched. pp_address is non-null here (join_epc_pp filters it), so the key
never merges unrelated rows.
"""
return wide.sort(
"date_of_transfer", descending=True, nulls_last=True
).unique(subset=["postcode", "pp_address"], keep="first", maintain_order=True)
return wide.sort("date_of_transfer", descending=True, nulls_last=True).unique(
subset=["postcode", "pp_address"], keep="first", maintain_order=True
)
def _filter_to_active_english_postcodes(
@ -1108,13 +1112,26 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
return epc_band_to_year(pl.col(column))
def _address_score(query: str, candidate: str | None) -> int:
def _address_score(query: str, candidate: str | None, *, allow_token_set: bool) -> int:
if not candidate:
return 0
return max(
fuzz.token_set_ratio(query, candidate),
fuzz.token_sort_ratio(query, candidate),
)
# token_set_ratio returns 100 whenever the shorter token set is a subset of
# the longer. For a NUMBER-LESS query that is unsafe — a single locality
# token (e.g. "KINGSWOOD") subsets to 100 against any long address that
# merely contains it — so number-less queries score with token_sort_ratio
# only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
# query the unconditional _numbers_compatible gate has already guaranteed the
# candidate carries compatible house numbers, so token_set cannot inflate
# across different addresses; allowing it recovers genuine matches where the
# scraped listing appends trailing town/county tokens the bare register
# address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
# DRIVE").
if allow_token_set:
return max(
fuzz.token_set_ratio(query, candidate),
fuzz.token_sort_ratio(query, candidate),
)
return fuzz.token_sort_ratio(query, candidate)
def _has_number(address: str | None) -> bool:
@ -1153,9 +1170,12 @@ def _best_listing_match(
``uprn_index`` (postcode-independent, so it is robust even when the
listing's postcode is slightly off); (2) failing that, the highest
fuzzy street-address similarity within the listing's own postcode bucket.
No property-attribute heuristics are used a house number in the listing
address gates the fuzzy match (`_numbers_compatible`) and lowers the score
threshold; a number-less address must match the street almost exactly.
No property-attribute heuristics are used `_numbers_compatible` gates
every fuzzy match unconditionally (so a number-less listing can never match
a numbered property, and vice versa), as in the canonical
`fuzzy_join._score_bucket`. A house number additionally lowers the score
threshold and (via `_address_score`) permits token_set scoring; a number-less
address scores on token_sort only and must match the street almost exactly.
``addressed_fields`` names the candidate columns to fuzzy-match against (a
candidate may carry both a register and an EPC address). Returns
@ -1180,9 +1200,11 @@ def _best_listing_match(
address = candidate.get(field)
if not address:
continue
if listing_has_numbers and not _numbers_compatible(query, address):
# Unconditional number gate (matches fuzzy_join): a number-less
# listing cannot match a numbered candidate and vice versa.
if not _numbers_compatible(query, address):
continue
score = _address_score(query, address)
score = _address_score(query, address, allow_token_set=listing_has_numbers)
if score > best_score:
best_score = score
best = candidate
@ -1675,7 +1697,9 @@ def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
# "Yes". "Former council house" should fire if EITHER side says so.
if raw_column == "was_council_house":
return (
pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
pl.when(
(pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes")
)
.then(pl.lit("Yes"))
.otherwise(coalesce)
.alias(raw_column)
@ -1716,9 +1740,13 @@ def _build_unmatched_listing_seed_rows(
"total_floor_area": pl.coalesce(
pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area")
),
# Prefer the direct-EPC habitable-room count over the listing's value:
# the scraped room count is bedrooms + bathrooms (upstream storage.py
# defect), so it over-counts. Fall back to the listing value only when
# the direct-EPC match has no count.
"number_habitable_rooms": pl.coalesce(
pl.col("_actual_number_habitable_rooms"),
pl.col("_direct_number_habitable_rooms"),
pl.col("_actual_number_habitable_rooms"),
),
"latest_price": pl.col("_actual_asking_price"),
}
@ -1836,14 +1864,19 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
# Listing coordinates win over the postcode centroid.
pl.coalesce(pl.col("_actual_lat").cast(pl.Float64), pl.col("lat")).alias("lat"),
pl.coalesce(pl.col("_actual_lon").cast(pl.Float64), pl.col("lon")).alias("lon"),
# Listing's floor area / rooms override any EPC/PP value when present.
# Listing's floor area overrides any EPC/PP value when present.
pl.coalesce(
pl.col("_actual_total_floor_area").cast(pl.Float64),
pl.col("Total floor area (sqm)"),
).alias("Total floor area (sqm)"),
# Rooms: prefer the EPC habitable-room count and fall back to the listing
# value only when no EPC count exists. The scraped "Number of bedrooms &
# living rooms" is actually bedrooms + bathrooms (an upstream storage.py
# defect), so preferring it would inflate the room count and overwrite a
# correct EPC value.
pl.coalesce(
pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
pl.col("Number of bedrooms & living rooms"),
pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
).alias("Number of bedrooms & living rooms"),
pl.when(pl.col("_actual_property_type").is_in(_PROPERTY_TYPE_VALUES))
.then(pl.col("_actual_property_type"))
@ -2130,12 +2163,15 @@ def _build(
pl.when(
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
& (
(pl.col("latest_price") / pl.col("total_floor_area"))
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
(pl.col("latest_price") / pl.col("total_floor_area")).is_between(
MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM
)
)
)
.then(
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
)
.otherwise(None)
.alias("Price per sqm"),

View file

@ -378,7 +378,10 @@ def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: P
price_paid_path = tmp_path / "price-paid.parquet"
pl.DataFrame(
{
"price": [30_000, 300_000],
# 5_000 is below MIN_PRICE (10_000) — a nominal/junk transfer that
# must still anchor the construction year but stay out of the price
# aggregations.
"price": [5_000, 300_000],
"date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
"property_type": ["T", "T"],
"postcode": ["AA1 1AA", "AA1 1AA"],
@ -408,6 +411,48 @@ def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: P
assert df.get_column("historical_prices").list.len().to_list() == [1]
def test_run_keeps_sale_above_lowered_min_price(tmp_path: Path):
# A genuine cheap sale of 30_000 sits between the OLD floor (50k) and the
# NEW floor (10k): it must now be RETAINED in the price aggregations. This
# pins the 50k->10k change — it fails on the pre-fix 50k floor (where 30k was
# excluded, giving historical_prices length 1 / latest_price 250_000).
zip_path = tmp_path / "domestic-csv.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
csv_buffer = io.StringIO()
writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
writer.writeheader()
writer.writerow(_row())
archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
price_paid_path = tmp_path / "price-paid.parquet"
pl.DataFrame(
{
"price": [250_000, 30_000],
"date_of_transfer": [date(2018, 2, 3), date(2022, 2, 3)],
"property_type": ["T", "T"],
"postcode": ["AA1 1AA", "AA1 1AA"],
"paon": ["1", "1"],
"saon": [None, None],
"street": ["Example Street", "Example Street"],
"locality": [None, None],
"town_city": ["Exampletown", "Exampletown"],
"duration": ["F", "F"],
"old_new": ["N", "N"],
"ppd_category": ["A", "A"],
}
).write_parquet(price_paid_path)
output_path = tmp_path / "epc-pp.parquet"
_run(zip_path, price_paid_path, output_path, tmp_path)
df = pl.read_parquet(output_path)
assert df.height == 1
# Both sales now survive the 10k floor; the 30_000 (2022) is the most recent.
assert df.get_column("historical_prices").list.len().to_list() == [2]
assert df.get_column("latest_price").to_list() == [30_000]
def test_epc_band_to_year_uses_midpoint_and_clamps():
import polars as pl

View file

@ -13,6 +13,7 @@ from pipeline.transform.merge import (
_active_english_postcode_area,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_best_listing_match,
_coalesce_direct_epc_columns,
_dedupe_collapsed_properties,
_filter_to_active_english_postcodes,
@ -78,6 +79,40 @@ def test_conservation_area_feature_is_area_level() -> None:
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
def test_tree_density_is_area_level_and_survives_the_split() -> None:
# Street tree density is a postcode-centroid percentile (constant per
# postcode), so it must route to the postcode/area output -- not be stripped
# by _area_columns_from -- and must NOT be duplicated into the property
# output. Regression for the drift where it landed only in properties.parquet
# and was lost for the ~308k property-less postcodes.
assert TREE_DENSITY_FEATURE in _AREA_COLUMNS
df = pl.DataFrame(
{
"Postcode": ["AA1 1AA"],
"Last known price": [250_000],
TREE_DENSITY_FEATURE: [42.0],
}
)
postcode_features = pl.DataFrame(
{
"Postcode": ["AA1 1AA", "BB1 1BB"],
"lat": [51.0, 52.0],
"lon": [-0.1, -0.2],
"ctry25cd": ["E92000001", "E92000001"],
TREE_DENSITY_FEATURE: [42.0, 7.0],
}
)
postcode_df, properties_df = _split_normal_outputs(
df, postcode_features, expected_postcode_count=2
)
assert TREE_DENSITY_FEATURE in postcode_df.columns
assert postcode_df[TREE_DENSITY_FEATURE].to_list() == [42.0, 7.0]
assert TREE_DENSITY_FEATURE not in properties_df.columns
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
# Crime is now a raw spatial count per postcode; the per-1k-residents
# variants were dropped along with the LSOA population denominator.
@ -767,6 +802,41 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
assert seed["was_council_house"].to_list() == ["No"]
def test_build_unmatched_listing_seed_rows_prefers_direct_epc_rooms_over_listing(
tmp_path,
) -> None:
# When BOTH the listing room count and a direct-EPC count exist, the EPC
# value must win: the scraped "Number of bedrooms & living rooms" is actually
# bedrooms + bathrooms (upstream defect), so preferring it would inflate the
# count. This pins the coalesce direction (direct-EPC before listing).
listings_path = tmp_path / "listings.parquet"
arcgis_path = tmp_path / "arcgis.parquet"
_sample_listings_frame().with_columns(
# The corrupt listing room count (beds + baths).
pl.lit(5, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
).write_parquet(listings_path)
_stub_arcgis(arcgis_path)
listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
# The genuine EPC habitable-room count.
pl.lit(3, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
)
template_schema = pl.Schema(
{
"postcode": pl.Utf8,
"pp_address": pl.Utf8,
"number_habitable_rooms": pl.Int16,
**{dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES},
}
)
seed = _build_unmatched_listing_seed_rows(
listings.select("_listing_idx"), listings, template_schema
)
assert seed["number_habitable_rooms"].to_list() == [3]
_DIRECT_EPC_CANDIDATE_SCHEMA = {
"_direct_epc_row": pl.UInt32,
"_direct_epc_match_address": pl.Utf8,
@ -1249,6 +1319,98 @@ def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> Non
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
def test_best_listing_match_rejects_numberless_listing_against_numbered_property() -> (
None
):
# Regression: a number-less listing (street/locality only) must NOT match a
# numbered property. The number gate is unconditional (like fuzzy_join), and
# the score is token_sort_ratio only, so a single locality token can no
# longer subset-inflate to 100 against a long numbered address.
candidates = [{"pp_address": "FLAT A3 CHESHAM HEIGHTS ST MONICAS ROAD"}]
result = _best_listing_match(
listing_uprn=None,
query="KINGSWOOD",
uprn_index={},
bucket_candidates=candidates,
addressed_fields=["pp_address"],
)
assert result is None
def test_best_listing_match_allows_numberless_to_numberless_named_house() -> None:
# A number-less listing CAN still match a number-less (named-house) property
# when the street/name matches almost exactly.
candidates = [{"pp_address": "WOODLANDS HOUSE OAK LANE"}]
result = _best_listing_match(
listing_uprn=None,
query="WOODLANDS HOUSE OAK LANE",
uprn_index={},
bucket_candidates=candidates,
addressed_fields=["pp_address"],
)
assert result is not None
candidate, score, method, field = result
assert method == "address"
assert score >= 90.0
def test_best_listing_match_still_matches_numbered_listing_to_numbered_property() -> (
None
):
# No regression for numbered listings: the number gate still permits a
# compatible house number and the lower with-numbers threshold applies.
candidates = [{"pp_address": "10 OAK LANE"}]
result = _best_listing_match(
listing_uprn=None,
query="10 OAK LANE",
uprn_index={},
bucket_candidates=candidates,
addressed_fields=["pp_address"],
)
assert result is not None
_candidate, score, method, _field = result
assert method == "address"
assert score >= 82.0
def test_best_listing_match_numbered_listing_with_trailing_locality_still_matches() -> (
None
):
# A scraped numbered listing often appends town/county tokens that the bare
# Price-Paid register address omits. token_sort alone would score this ~73
# (below 82) and drop a correct match; token_set (allowed for numbered
# queries, where the number gate makes it safe) recovers it.
candidates = [{"pp_address": "105 RIDGEWAY DRIVE"}]
result = _best_listing_match(
listing_uprn=None,
query="105 RIDGEWAY DRIVE BROMLEY KENT",
uprn_index={},
bucket_candidates=candidates,
addressed_fields=["pp_address"],
)
assert result is not None
candidate, score, _method, _field = result
assert candidate["pp_address"] == "105 RIDGEWAY DRIVE"
assert score >= 82.0
def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers() -> (
None
):
# token_set for numbered queries is safe only because the number gate runs
# first: a query and candidate with incompatible house numbers never reach
# scoring, so token_set cannot inflate "10 OAK LANE" onto "12 OAK LANE".
candidates = [{"pp_address": "12 OAK LANE KINGSTON"}]
result = _best_listing_match(
listing_uprn=None,
query="10 OAK LANE",
uprn_index={},
bucket_candidates=candidates,
addressed_fields=["pp_address"],
)
assert result is None
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
None
):
@ -1325,9 +1487,12 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
assert finalized["Est. price per sqm"].to_list() == [5_000, 7_368]
assert finalized["Estimated current price"].to_list() == [600_000, 700_000]
assert finalized["Last known price"].to_list() == [500_000, 700_000]
# Listing's preferred floor area / rooms / property type / tenure.
# Listing's preferred floor area / property type / tenure.
assert finalized["Total floor area (sqm)"].to_list() == [110.0, 95.0]
assert finalized["Number of bedrooms & living rooms"].to_list() == [4, 3]
# Rooms prefer the EPC habitable-room count over the listing's beds+baths
# value: row 0 keeps the EPC 3 (not the listing's _actual 4); row 1 has no
# EPC count so it falls back to the listing's 3.
assert finalized["Number of bedrooms & living rooms"].to_list() == [3, 3]
assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
# Postcode-level feature carried through to both matched and unmatched rows.

View file

@ -5,11 +5,72 @@ import polars as pl
from pipeline.transform.transform_poi import (
_load_ofsted_ratings,
_school_icon_category_expr,
osm_groceries_colocated_with_geolytix,
transform,
transform_grocery_retail_points,
)
def test_osm_groceries_colocated_with_geolytix_drops_only_brand_matched_duplicates():
# GEOLYTIX is authoritative for its chains. An OSM grocery that sits on top
# of a GEOLYTIX store AND carries its brand is the same physical store and
# must be dropped; an independent shop at the same spot, and a same-brand
# store far from any GEOLYTIX point, must be kept.
geolytix = pl.DataFrame(
{
"category": ["Tesco"],
"lat": [51.5000],
"lng": [-0.1000],
}
)
osm = pl.DataFrame(
{
"id": ["dup-brand", "independent", "far-brand"],
"name": ["Tesco Express", "Bob's Corner Shop", "Tesco Express"],
# ~1 m, ~2 m, and ~55 km from the GEOLYTIX Tesco.
"lat": [51.50001, 51.50002, 52.0],
"lng": [-0.10001, -0.1000, -1.0],
}
)
drop_ids = osm_groceries_colocated_with_geolytix(osm, geolytix, radius_m=50.0)
assert drop_ids == ["dup-brand"]
def test_osm_groceries_colocated_with_geolytix_dedupes_cooperative_spelling():
# GEOLYTIX brand "Co-op" tokenises to "coop"; OSM commonly spells it
# "The Co-operative Food" -> "cooperative". The alias folds them so the
# genuine duplicate is still dropped.
geolytix = pl.DataFrame({"category": ["Co-op"], "lat": [53.0], "lng": [-1.5]})
osm = pl.DataFrame(
{
"id": ["coop-dup"],
"name": ["The Co-operative Food"],
"lat": [53.00001],
"lng": [-1.5],
}
)
assert osm_groceries_colocated_with_geolytix(osm, geolytix, radius_m=50.0) == [
"coop-dup"
]
def test_osm_groceries_colocated_with_geolytix_handles_empty_inputs():
geolytix = pl.DataFrame({"category": ["Tesco"], "lat": [51.5], "lng": [-0.1]})
empty = pl.DataFrame(
schema={"id": pl.Utf8, "name": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
)
assert osm_groceries_colocated_with_geolytix(empty, geolytix) == []
osm = pl.DataFrame(
{"id": ["x"], "name": ["Tesco Express"], "lat": [51.5], "lng": [-0.1]}
)
empty_glx = pl.DataFrame(
schema={"category": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
)
assert osm_groceries_colocated_with_geolytix(osm, empty_glx) == []
def _write_boundary(tmp_path):
"""A FeatureCollection whose single feature covers the London-area test
coords used by the transform() fixtures, so in_england_mask keeps them."""
@ -345,12 +406,7 @@ def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
}
).write_parquet(ofsted_path)
ratings = (
_load_ofsted_ratings(ofsted_path)
.collect()
.sort("urn")
.to_dicts()
)
ratings = _load_ofsted_ratings(ofsted_path).collect().sort("urn").to_dicts()
assert ratings == [
{"urn": 1, "ofsted_rating": "Outstanding"},
@ -384,9 +440,9 @@ def test_school_icon_category_handles_one_sided_age_ranges():
},
)
categories = df.select(
_school_icon_category_expr().alias("category")
)["category"].to_list()
categories = df.select(_school_icon_category_expr().alias("category"))[
"category"
].to_list()
assert categories == [
"Nursery school",
@ -449,6 +505,45 @@ def test_osm_supermarkets_dropped(tmp_path):
assert convenience.height == 1
def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
# The _write_transform_inputs fixture seeds 5 GEOLYTIX "Tesco" points at
# (51.52, -0.14). An OSM object colocated there carrying "Tesco" in its name
# is the same physical store, so its Convenience Store (Groceries) row is a
# duplicate and must be dropped — but its NON-grocery aspect (a Post Office
# sharing the same OSM id) must survive. An independent shop away from the
# GEOLYTIX point keeps its grocery row.
raw = pl.DataFrame(
{
"id": ["n1", "n1", "n2"],
"name": ["Tesco Express", "Tesco Express", "Corner Shop"],
"category": [
"shop/convenience",
"amenity/post_office",
"shop/convenience",
],
"lat": [51.52, 51.52, 51.40],
"lng": [-0.14, -0.14, -0.05],
}
)
inputs = _write_transform_inputs(tmp_path, raw)
out = transform(**inputs).collect()
# The colocated, brand-matched grocery row is dropped.
n1_grocery = out.filter((pl.col("id") == "n1") & (pl.col("group") == "Groceries"))
assert n1_grocery.height == 0
# Its non-grocery aspect (Post Office) survives.
n1_post_office = out.filter(
(pl.col("id") == "n1") & (pl.col("category") == "Post Office")
)
assert n1_post_office.height == 1
# The independent corner shop (no brand, far away) keeps its grocery row.
n2_grocery = out.filter(
(pl.col("id") == "n2") & (pl.col("category") == "Convenience Store")
)
assert n2_grocery.height == 1
def test_transform_output_unique_per_id_category(tmp_path):
# Soundness: the full transform() output has at most one row per
# (id, category) overall, across every source.

View file

@ -1,6 +1,7 @@
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
@ -955,7 +956,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
# Note: schools come from the GIAS register (see transform_gias_schools).
# Niche/tertiary education amenities that GIAS does not cover are dropped
# rather than mixed in with state-funded schools.
(
"Local Businesses",
"Hotel",
@ -1441,38 +1441,128 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
# category mirrors icon_category so the dashboard renders one toggle per
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
# instead of bundling every GIAS row under a single "School" pill.
return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
pl.col("name"),
icon_category_expr.alias("category"),
icon_category_expr.alias("icon_category"),
pl.lit("Education").alias("group"),
pl.col("lat").cast(pl.Float64),
pl.col("lng").cast(pl.Float64),
emoji_expr.alias("emoji"),
pl.col("phase").alias("school_phase"),
pl.col("type").alias("school_type"),
pl.col("type_group").alias("school_type_group"),
pl.col("age_range").alias("school_age_range"),
pl.col("gender").alias("school_gender"),
pl.col("religious_character").alias("school_religious_character"),
pl.col("admissions_policy").alias("school_admissions_policy"),
pl.col("nursery_provision").alias("school_nursery_provision"),
pl.col("sixth_form").alias("school_sixth_form"),
pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
pl.col("trust").alias("school_trust"),
pl.col("address").alias("school_address"),
pl.col("postcode").alias("school_postcode"),
pl.col("local_authority").alias("school_local_authority"),
pl.col("website").alias("school_website"),
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
pl.col("head_name").alias("school_head_name"),
pl.col("ofsted_rating").alias("school_ofsted_rating"),
return (
pl.scan_parquet(gias_path)
.join(ofsted, on="urn", how="left")
.select(
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
pl.col("name"),
icon_category_expr.alias("category"),
icon_category_expr.alias("icon_category"),
pl.lit("Education").alias("group"),
pl.col("lat").cast(pl.Float64),
pl.col("lng").cast(pl.Float64),
emoji_expr.alias("emoji"),
pl.col("phase").alias("school_phase"),
pl.col("type").alias("school_type"),
pl.col("type_group").alias("school_type_group"),
pl.col("age_range").alias("school_age_range"),
pl.col("gender").alias("school_gender"),
pl.col("religious_character").alias("school_religious_character"),
pl.col("admissions_policy").alias("school_admissions_policy"),
pl.col("nursery_provision").alias("school_nursery_provision"),
pl.col("sixth_form").alias("school_sixth_form"),
pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
pl.col("fsm_percent")
.cast(pl.Float32, strict=False)
.alias("school_fsm_percent"),
pl.col("trust").alias("school_trust"),
pl.col("address").alias("school_address"),
pl.col("postcode").alias("school_postcode"),
pl.col("local_authority").alias("school_local_authority"),
pl.col("website").alias("school_website"),
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
pl.col("head_name").alias("school_head_name"),
pl.col("ofsted_rating").alias("school_ofsted_rating"),
)
)
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
# Sainsbury's Local, Co-op Food, Morrisons Daily, Spar, ...) would otherwise be
# counted twice: once as a GEOLYTIX brand row and once as an OSM "Convenience
# Store". GEOLYTIX is authoritative for its chains, so an OSM grocery row that
# sits on top of a GEOLYTIX point AND carries that point's brand name is the
# same physical store and is dropped. Independent corner shops never carry a
# chain brand, so they are kept.
GROCERY_DEDUP_RADIUS_M = 50.0
# Brand-token aliases so an OSM name spelt differently from the GEOLYTIX brand
# still matches. GEOLYTIX's "Co-op" tokenises to "coop", but OSM frequently
# spells it "The Co-operative Food" -> "cooperative"; without this, ~300+ genuine
# Co-op duplicates would survive. Keys/values are post-strip (alnum-only) tokens.
_GROCERY_TOKEN_ALIASES = {
"cooperative": "coop",
"cooperatives": "coop",
}
def _significant_tokens(name: str | None) -> set[str]:
"""Lower-case alphanumeric tokens of length >= 3 from a POI name (aliased)."""
if not name:
return set()
tokens: set[str] = set()
for raw in str(name).lower().split():
token = "".join(ch for ch in raw if ch.isalnum())
if len(token) >= 3:
tokens.add(_GROCERY_TOKEN_ALIASES.get(token, token))
return tokens
def osm_groceries_colocated_with_geolytix(
osm_groceries: pl.DataFrame,
geolytix: pl.DataFrame,
radius_m: float = GROCERY_DEDUP_RADIUS_M,
) -> list[str]:
"""Return OSM grocery ids that duplicate a GEOLYTIX store.
An OSM Groceries row is a duplicate when a GEOLYTIX point lies within
``radius_m`` metres AND that point's brand tokens (its ``category``, e.g.
"Tesco", "Co-op") are all present in the OSM row's name — i.e. the same
physical branded store. Brands with no token >= 3 chars (e.g. "M&S") never
match, so they are conservatively kept rather than risk a false drop.
``osm_groceries`` needs columns ``id``, ``name``, ``lat``, ``lng``;
``geolytix`` needs ``category`` (the brand), ``lat``, ``lng``.
"""
if osm_groceries.is_empty() or geolytix.is_empty():
return []
from scipy.spatial import cKDTree
glx_lat = geolytix["lat"].to_numpy().astype(float)
glx_lng = geolytix["lng"].to_numpy().astype(float)
glx_brand_tokens = [_significant_tokens(b) for b in geolytix["category"].to_list()]
osm_lat = osm_groceries["lat"].to_numpy().astype(float)
osm_lng = osm_groceries["lng"].to_numpy().astype(float)
osm_ids = osm_groceries["id"].to_list()
osm_name_tokens = [_significant_tokens(n) for n in osm_groceries["name"].to_list()]
# Equirectangular projection to metres around the shared mean latitude — at
# England's scale this is accurate to well under the dedup radius.
mean_lat = float(np.mean(np.concatenate([glx_lat, osm_lat])))
cos_lat = float(np.cos(np.radians(mean_lat)))
glx_xy = np.column_stack([glx_lng * cos_lat * 111_320.0, glx_lat * 110_540.0])
osm_xy = np.column_stack([osm_lng * cos_lat * 111_320.0, osm_lat * 110_540.0])
tree = cKDTree(glx_xy)
neighbours = tree.query_ball_point(osm_xy, r=radius_m)
drop_ids: list[str] = []
for osm_idx, glx_indices in enumerate(neighbours):
tokens = osm_name_tokens[osm_idx]
if not tokens:
continue
for glx_idx in glx_indices:
brand = glx_brand_tokens[glx_idx]
if brand and brand.issubset(tokens):
drop_ids.append(osm_ids[osm_idx])
break
return drop_ids
def transform(
input_path: Path,
naptan_path: Path,
@ -1553,6 +1643,27 @@ def transform(
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
# Drop OSM grocery rows that duplicate a GEOLYTIX store (same brand,
# colocated) so a Tesco Express / Co-op / Spar isn't counted twice.
osm_groceries = (
lf.filter(pl.col("group") == "Groceries")
.select("id", "name", "lat", "lng")
.collect(engine="streaming")
)
duplicate_ids = osm_groceries_colocated_with_geolytix(osm_groceries, grocery_pois)
if duplicate_ids:
print(
f"Dropping {len(duplicate_ids):,} OSM grocery POIs that duplicate a "
"GEOLYTIX store"
)
# Scope the drop to the Groceries group: a single OSM object can also
# carry a non-grocery aspect (e.g. a convenience store that is also a
# Post Office), which must survive — only its duplicate grocery row goes.
lf = lf.filter(
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
)
frames = [
lf,
naptan,