try
This commit is contained in:
parent
843d14b7ba
commit
c938b71904
13 changed files with 698 additions and 109 deletions
|
|
@ -1,6 +1,7 @@
|
|||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.england_geometry import in_england_mask
|
||||
|
|
@ -955,7 +956,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
# Note: schools come from the GIAS register (see transform_gias_schools).
|
||||
# Niche/tertiary education amenities that GIAS does not cover are dropped
|
||||
# rather than mixed in with state-funded schools.
|
||||
|
||||
(
|
||||
"Local Businesses",
|
||||
"Hotel",
|
||||
|
|
@ -1441,38 +1441,128 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
|||
# category mirrors icon_category so the dashboard renders one toggle per
|
||||
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
|
||||
# instead of bundling every GIAS row under a single "School" pill.
|
||||
return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
|
||||
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
|
||||
pl.col("name"),
|
||||
icon_category_expr.alias("category"),
|
||||
icon_category_expr.alias("icon_category"),
|
||||
pl.lit("Education").alias("group"),
|
||||
pl.col("lat").cast(pl.Float64),
|
||||
pl.col("lng").cast(pl.Float64),
|
||||
emoji_expr.alias("emoji"),
|
||||
pl.col("phase").alias("school_phase"),
|
||||
pl.col("type").alias("school_type"),
|
||||
pl.col("type_group").alias("school_type_group"),
|
||||
pl.col("age_range").alias("school_age_range"),
|
||||
pl.col("gender").alias("school_gender"),
|
||||
pl.col("religious_character").alias("school_religious_character"),
|
||||
pl.col("admissions_policy").alias("school_admissions_policy"),
|
||||
pl.col("nursery_provision").alias("school_nursery_provision"),
|
||||
pl.col("sixth_form").alias("school_sixth_form"),
|
||||
pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
|
||||
pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
|
||||
pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
|
||||
pl.col("trust").alias("school_trust"),
|
||||
pl.col("address").alias("school_address"),
|
||||
pl.col("postcode").alias("school_postcode"),
|
||||
pl.col("local_authority").alias("school_local_authority"),
|
||||
pl.col("website").alias("school_website"),
|
||||
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
|
||||
pl.col("head_name").alias("school_head_name"),
|
||||
pl.col("ofsted_rating").alias("school_ofsted_rating"),
|
||||
return (
|
||||
pl.scan_parquet(gias_path)
|
||||
.join(ofsted, on="urn", how="left")
|
||||
.select(
|
||||
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
|
||||
pl.col("name"),
|
||||
icon_category_expr.alias("category"),
|
||||
icon_category_expr.alias("icon_category"),
|
||||
pl.lit("Education").alias("group"),
|
||||
pl.col("lat").cast(pl.Float64),
|
||||
pl.col("lng").cast(pl.Float64),
|
||||
emoji_expr.alias("emoji"),
|
||||
pl.col("phase").alias("school_phase"),
|
||||
pl.col("type").alias("school_type"),
|
||||
pl.col("type_group").alias("school_type_group"),
|
||||
pl.col("age_range").alias("school_age_range"),
|
||||
pl.col("gender").alias("school_gender"),
|
||||
pl.col("religious_character").alias("school_religious_character"),
|
||||
pl.col("admissions_policy").alias("school_admissions_policy"),
|
||||
pl.col("nursery_provision").alias("school_nursery_provision"),
|
||||
pl.col("sixth_form").alias("school_sixth_form"),
|
||||
pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
|
||||
pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
|
||||
pl.col("fsm_percent")
|
||||
.cast(pl.Float32, strict=False)
|
||||
.alias("school_fsm_percent"),
|
||||
pl.col("trust").alias("school_trust"),
|
||||
pl.col("address").alias("school_address"),
|
||||
pl.col("postcode").alias("school_postcode"),
|
||||
pl.col("local_authority").alias("school_local_authority"),
|
||||
pl.col("website").alias("school_website"),
|
||||
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
|
||||
pl.col("head_name").alias("school_head_name"),
|
||||
pl.col("ofsted_rating").alias("school_ofsted_rating"),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# GEOLYTIX and OSM overlap on convenience-format chain stores (Tesco Express,
# Sainsbury's Local, Co-op Food, Morrisons Daily, Spar, ...). Left alone, such
# a store is counted twice: as a GEOLYTIX brand row and as an OSM "Convenience
# Store". GEOLYTIX is treated as authoritative for the chains it lists, so an
# OSM grocery row is dropped when it lies within this many metres of a
# GEOLYTIX point AND its name carries that point's brand. Independent corner
# shops carry no chain brand and are therefore kept.
GROCERY_DEDUP_RADIUS_M = 50.0

# Maps alternative spellings of a brand token onto the token GEOLYTIX produces.
# GEOLYTIX's "Co-op" tokenises to "coop", while OSM frequently writes
# "The Co-operative Food", which tokenises to "cooperative"; without the alias
# ~300+ genuine Co-op duplicates would survive. Keys and values are tokens
# *after* stripping to alphanumerics.
_GROCERY_TOKEN_ALIASES = {
    spelling: "coop" for spelling in ("cooperative", "cooperatives")
}
|
||||
|
||||
|
||||
def _significant_tokens(name: str | None) -> set[str]:
|
||||
"""Lower-case alphanumeric tokens of length >= 3 from a POI name (aliased)."""
|
||||
if not name:
|
||||
return set()
|
||||
tokens: set[str] = set()
|
||||
for raw in str(name).lower().split():
|
||||
token = "".join(ch for ch in raw if ch.isalnum())
|
||||
if len(token) >= 3:
|
||||
tokens.add(_GROCERY_TOKEN_ALIASES.get(token, token))
|
||||
return tokens
|
||||
|
||||
|
||||
def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
    radius_m: float = GROCERY_DEDUP_RADIUS_M,
) -> list[str]:
    """Return OSM grocery ids that duplicate a GEOLYTIX store.

    An OSM Groceries row is a duplicate when a GEOLYTIX point lies within
    ``radius_m`` metres AND that point's brand tokens (its ``category``, e.g.
    "Tesco", "Co-op") are all present in the OSM row's name, i.e. the same
    physical branded store. Brands with no token >= 3 chars (e.g. "M&S") never
    match, so they are conservatively kept rather than risk a false drop.

    Rows whose ``lat``/``lng`` is null, NaN or infinite are ignored on both
    sides: an OSM row without a usable position is never reported as a
    duplicate, and a GEOLYTIX row without one can never cause a drop.

    Args:
        osm_groceries: Frame with columns ``id``, ``name``, ``lat``, ``lng``.
        geolytix: Frame with columns ``category`` (the brand), ``lat``, ``lng``.
        radius_m: Maximum distance in metres for two points to count as
            colocated.

    Returns:
        The ``id`` of every OSM row judged to be a duplicate, in input order.
        Each input row contributes its id at most once.
    """
    if osm_groceries.is_empty() or geolytix.is_empty():
        return []

    # Function-scope import: scipy is only loaded when dedup actually runs.
    from scipy.spatial import cKDTree

    glx_lat = geolytix["lat"].to_numpy().astype(float)
    glx_lng = geolytix["lng"].to_numpy().astype(float)
    osm_lat = osm_groceries["lat"].to_numpy().astype(float)
    osm_lng = osm_groceries["lng"].to_numpy().astype(float)

    # A single non-finite coordinate would turn the shared mean latitude below
    # into NaN and with it every projected x coordinate, so restrict the
    # spatial work to rows with a usable position and remember their original
    # row numbers for the token/id lookups.
    glx_rows = np.flatnonzero(np.isfinite(glx_lat) & np.isfinite(glx_lng))
    osm_rows = np.flatnonzero(np.isfinite(osm_lat) & np.isfinite(osm_lng))
    if glx_rows.size == 0 or osm_rows.size == 0:
        return []
    glx_lat, glx_lng = glx_lat[glx_rows], glx_lng[glx_rows]
    osm_lat, osm_lng = osm_lat[osm_rows], osm_lng[osm_rows]

    # Token sets and ids stay indexed by ORIGINAL row number.
    glx_brand_tokens = [_significant_tokens(b) for b in geolytix["category"].to_list()]
    osm_ids = osm_groceries["id"].to_list()
    osm_name_tokens = [_significant_tokens(n) for n in osm_groceries["name"].to_list()]

    # Equirectangular projection to metres around the shared mean latitude — at
    # England's scale this is accurate to well under the dedup radius.
    mean_lat = float(np.mean(np.concatenate([glx_lat, osm_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))
    glx_xy = np.column_stack([glx_lng * cos_lat * 111_320.0, glx_lat * 110_540.0])
    osm_xy = np.column_stack([osm_lng * cos_lat * 111_320.0, osm_lat * 110_540.0])

    # For every OSM point, the positions (within the filtered GEOLYTIX arrays)
    # of all GEOLYTIX points inside the radius.
    tree = cKDTree(glx_xy)
    neighbours = tree.query_ball_point(osm_xy, r=radius_m)

    drop_ids: list[str] = []
    for osm_row, glx_hits in zip(osm_rows, neighbours):
        tokens = osm_name_tokens[osm_row]
        if not tokens:
            continue
        for glx_hit in glx_hits:
            brand = glx_brand_tokens[glx_rows[glx_hit]]
            # An empty brand set is a subset of everything; require a real
            # brand token so unbranded/short-named GEOLYTIX rows never match.
            if brand and brand.issubset(tokens):
                drop_ids.append(osm_ids[osm_row])
                break  # one matching neighbour is enough for this OSM row
    return drop_ids
|
||||
|
||||
|
||||
def transform(
|
||||
input_path: Path,
|
||||
naptan_path: Path,
|
||||
|
|
@ -1553,6 +1643,27 @@ def transform(
|
|||
|
||||
grocery_df = pl.read_parquet(grocery_retail_points_path)
|
||||
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
|
||||
|
||||
# Drop OSM grocery rows that duplicate a GEOLYTIX store (same brand,
|
||||
# colocated) so a Tesco Express / Co-op / Spar isn't counted twice.
|
||||
osm_groceries = (
|
||||
lf.filter(pl.col("group") == "Groceries")
|
||||
.select("id", "name", "lat", "lng")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
duplicate_ids = osm_groceries_colocated_with_geolytix(osm_groceries, grocery_pois)
|
||||
if duplicate_ids:
|
||||
print(
|
||||
f"Dropping {len(duplicate_ids):,} OSM grocery POIs that duplicate a "
|
||||
"GEOLYTIX store"
|
||||
)
|
||||
# Scope the drop to the Groceries group: a single OSM object can also
|
||||
# carry a non-grocery aspect (e.g. a convenience store that is also a
|
||||
# Post Office), which must survive — only its duplicate grocery row goes.
|
||||
lf = lf.filter(
|
||||
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
|
||||
)
|
||||
|
||||
frames = [
|
||||
lf,
|
||||
naptan,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue