Test changes
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 8m20s
CI / Check (push) Failing after 10m40s

This commit is contained in:
Andras Schmelczer 2026-05-09 11:35:38 +01:00
parent 4c95815dc8
commit be02fc16bb
41 changed files with 4224 additions and 759 deletions

View file

@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
# Minimum floor area threshold. Presumably square metres (going by the name);
# the code that reads it is not visible in this excerpt — TODO confirm.
MIN_FLOOR_AREA_M2 = 10
# IoD deprivation score columns that `_build` rewrites in place as 0-100
# "less deprived" percentiles (via `_less_deprived_percentile_expr`) when the
# IoD parquet is scanned, before it is joined onto the wide frame.
_IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
    "Income Score (rate)",
    "Employment Score (rate)",
    "Health Deprivation and Disability Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
]
_AREA_COLUMNS = [
"Postcode",
@ -51,6 +60,14 @@ _AREA_COLUMNS = [
"Number of parks within 1km",
"Distance to nearest train or tube station (km)",
"Distance to nearest park (km)",
"Distance to nearest grocery store (km)",
"Distance to nearest tube station (km)",
"Distance to nearest rail station (km)",
"Distance to nearest Waitrose (km)",
"Distance to nearest Tesco (km)",
"Distance to nearest cafe (km)",
"Distance to nearest pub (km)",
"Distance to nearest restaurant (km)",
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
@ -76,6 +93,34 @@ _AREA_COLUMNS = [
]
def _is_dynamic_poi_metric_column(column: str) -> bool:
return (
column.startswith("Distance to nearest ")
and column.endswith(" POI (km)")
) or (
column.startswith("Number of ")
and (column.endswith(" POIs within 2km") or column.endswith(" POIs within 5km"))
)
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Build an expression turning an IoD deprivation score into a percentile.

    The output is on a 0-100 scale where a higher number means less deprived:
    the smallest score in the column maps to exactly 100.0, the largest to
    exactly 0.0, and null scores stay null. Values in between are placed by
    their descending average rank and rounded to one decimal place. The
    resulting expression keeps the name ``column``, so it overwrites the
    original score when used in ``with_columns``.
    """
    score = pl.col(column)
    valid_count = score.count()
    # Descending rank: the highest (most deprived) score gets rank 1.
    rank_from_most_deprived = score.rank("average", descending=True)
    interior_percentile = (
        (rank_from_most_deprived - 1) / (valid_count - 1) * 100
    ).round(1)

    # Branch order matters: nulls first, then the pinned endpoints (so tied
    # extremes land on exactly 100 / 0 rather than an averaged rank), then the
    # rank-based value; the final fallback applies when there is at most one
    # counted value.
    percentile = (
        pl.when(score.is_null())
        .then(None)
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        .when(valid_count > 1)
        .then(interior_percentile)
        .otherwise(100.0)
    )
    return percentile.alias(column)
def _build(
epc_pp_path: Path,
arcgis_path: Path,
@ -134,20 +179,11 @@ def _build(
)
wide = wide.join(arcgis, on="postcode", how="left")
iod = pl.scan_parquet(iod_path)
iod = pl.scan_parquet(iod_path).with_columns(
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
# Invert deprivation scores so that higher values = less deprived (better)
iod_score_cols = [
"Education, Skills and Training Score",
"Income Score (rate)",
"Employment Score (rate)",
"Health Deprivation and Disability Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
]
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,
@ -351,6 +387,14 @@ def _build(
"parks_1km": "Number of parks within 1km",
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
"parks_nearest_km": "Distance to nearest park (km)",
"grocery_store_nearest_km": "Distance to nearest grocery store (km)",
"tube_station_nearest_km": "Distance to nearest tube station (km)",
"rail_station_nearest_km": "Distance to nearest rail station (km)",
"waitrose_nearest_km": "Distance to nearest Waitrose (km)",
"tesco_nearest_km": "Distance to nearest Tesco (km)",
"cafe_nearest_km": "Distance to nearest cafe (km)",
"pub_nearest_km": "Distance to nearest pub (km)",
"restaurant_nearest_km": "Distance to nearest restaurant (km)",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
@ -381,10 +425,14 @@ def _build(
# Split into postcode-level and property-level dataframes
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
area_cols.extend(
c for c in df.columns if _is_dynamic_poi_metric_column(c) and c not in area_cols
)
area_col_set = set(area_cols)
postcode_df = df.select(area_cols).group_by("Postcode").first()
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
property_cols = [c for c in df.columns if c not in area_col_set or c == "Postcode"]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")

View file

@ -1,6 +1,8 @@
"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""
import argparse
import re
import unicodedata
from pathlib import Path
import polars as pl
@ -15,9 +17,25 @@ POI_GROUPS_2KM = {
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
}
# Groups for which to compute distance to nearest POI (from filtered POIs).
# Keep `train_tube` for the existing backend feature; the individual POI
# distance filters below power the frontend dropdown.
# Each key names one distance metric; its value lists the POI categories that
# count towards that metric (a category may appear under several keys).
DISTANCE_GROUPS = {
    "train_tube": ["Tube station", "Rail station"],
    "grocery_store": [
        "Greengrocer",
        "Supermarket",
        "Convenience Store",
        "Waitrose",
        "Tesco",
    ],
    "tube_station": ["Tube station"],
    "rail_station": ["Rail station"],
    "waitrose": ["Waitrose"],
    "tesco": ["Tesco"],
    "cafe": ["Café"],
    "pub": ["Pub"],
    "restaurant": ["Restaurant"],
}
# OS Open Greenspace function types used for park counts and distance calculation.
@ -27,6 +45,69 @@ GREENSPACE_PARK_FUNCTIONS = {
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
}
# POI-count threshold for categories in DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS.
# NOTE: `_build_poi_category_groups` compares with `>`, so a category needs
# strictly more than this many POIs (exactly 100 does not qualify).
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
# POI groups whose categories always get a dynamic filter, whatever their count.
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
# POI groups whose categories get a dynamic filter only above the threshold.
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
def _poi_category_slug(category: str) -> str:
ascii_text = (
unicodedata.normalize("NFKD", category)
.encode("ascii", "ignore")
.decode("ascii")
.lower()
)
slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_")
return slug or "poi"
def _build_poi_category_groups(
    pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Build one proximity group for each POI category selected for filters.

    A category is selected when it belongs to a group in
    ``DYNAMIC_FILTER_ALL_GROUPS``, or to a group in
    ``DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS`` and has strictly more than
    ``GROCERY_DYNAMIC_FILTER_MIN_POIS`` rows for that (group, category) pair.

    Args:
        pois: POI dataframe with at least ``group`` and ``category`` columns.

    Returns:
        A pair ``(groups, display_names)`` keyed by a unique ``poi_<slug>``
        identifier. ``groups[key]`` is the single-element list ``[category]``
        and ``display_names[key]`` is the original category string.

    Raises:
        ValueError: If ``pois`` has no ``group`` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    always_selected = pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS))
    selected_by_count = pl.col("group").is_in(
        list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS)
    ) & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)

    categories = (
        pois.group_by("group", "category")
        .len()
        .filter(always_selected | selected_by_count)
        .select("category")
        .sort("category")
        .to_series()
        .to_list()
    )

    groups: dict[str, list[str]] = {}
    display_names: dict[str, str] = {}
    seen_categories: set[str] = set()
    for category in categories:
        if not isinstance(category, str) or not category:
            continue
        # The same category can be selected through more than one group; emit
        # it once so the display-name based column renames stay unique.
        if category in seen_categories:
            continue
        seen_categories.add(category)

        base_slug = f"poi_{_poi_category_slug(category)}"
        group_key = base_slug
        suffix = 2
        # Probe until the key is actually free: a generated "<slug>_2" may
        # already be taken by a category whose own slug ends in "_2", and
        # must not overwrite it.
        while group_key in groups:
            group_key = f"{base_slug}_{suffix}"
            suffix += 1

        groups[group_key] = [category]
        display_names[group_key] = category
    return groups, display_names
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
renames: dict[str, str] = {}
for group_key, category in display_names.items():
renames[f"{group_key}_nearest_km"] = f"Distance to nearest {category} POI (km)"
renames[f"{group_key}_2km"] = f"Number of {category} POIs within 2km"
renames[f"{group_key}_5km"] = f"Number of {category} POIs within 5km"
return renames
def main():
parser = argparse.ArgumentParser(
@ -56,12 +137,35 @@ def main():
)
pois = pl.read_parquet(args.pois)
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
# Count amenity POIs within 2km
counts_2km = count_pois_per_postcode(
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
)
# Dynamic POI filters: nearest distance plus counts within 2km and 5km for
# the selected public transport, grocery, and leisure categories.
dynamic_counts_2km = count_pois_per_postcode(
postcodes, pois, groups=poi_category_groups, radius_km=2
)
dynamic_counts_5km = count_pois_per_postcode(
postcodes, pois, groups=poi_category_groups, radius_km=5
)
dynamic_distances = min_distance_per_postcode(
postcodes, pois, groups=poi_category_groups
)
dynamic_renames = _dynamic_poi_metric_renames(poi_display_names)
dynamic_counts_2km = dynamic_counts_2km.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns}
)
dynamic_counts_5km = dynamic_counts_5km.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns}
)
dynamic_distances = dynamic_distances.rename(
{k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
)
# Distance to nearest train/tube station (from filtered POIs)
distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)
@ -77,6 +181,9 @@ def main():
# Join all results on postcode
result = (
counts_2km.join(distances, on="postcode")
.join(dynamic_counts_2km, on="postcode")
.join(dynamic_counts_5km, on="postcode")
.join(dynamic_distances, on="postcode")
.join(park_counts_1km, on="postcode")
.join(park_distances, on="postcode")
)

View file

@ -0,0 +1,33 @@
import polars as pl
from pipeline.transform.merge import (
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
)
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score maps to 100, the highest to 0, and nulls stay null."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentiles = (
        scores.lazy()
        .with_columns(_less_deprived_percentile_expr(column))
        .collect()
    )

    expected = [100.0, 50.0, 0.0, None]
    assert percentiles[column].to_list() == expected
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum and maximum scores land on exactly 100 and 0."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentiles = (
        scores.lazy()
        .with_columns(_less_deprived_percentile_expr(column))
        .collect()
    )

    expected = [100.0, 100.0, 50.0, 0.0, 0.0]
    assert percentiles[column].to_list() == expected
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Per-category POI columns are recognised; fixed count columns are not."""
    dynamic_columns = (
        "Distance to nearest Cafe POI (km)",
        "Number of Cafe POIs within 2km",
        "Number of Cafe POIs within 5km",
    )
    for name in dynamic_columns:
        assert _is_dynamic_poi_metric_column(name)

    fixed_column = "Number of restaurants within 2km"
    assert not _is_dynamic_poi_metric_column(fixed_column)

View file

@ -0,0 +1,41 @@
import polars as pl
from pipeline.transform.poi_proximity import _build_poi_category_groups
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
    """Only transport, leisure and sufficiently common grocery categories pass.

    Tesco (101 POIs) clears the grocery threshold, Waitrose (exactly 100) does
    not, and categories from other groups are ignored regardless of size.
    """
    # (group, category, number of POI rows), in row order.
    layout = [
        ("Public Transport", "Rail station", 1),
        ("Public Transport", "Bus stop", 1),
        ("Leisure", "Café", 1),
        ("Leisure", "Restaurant", 1),
        ("Groceries", "Tesco", 101),
        ("Groceries", "Waitrose", 100),
        ("Education", "School", 200),
        ("Health", "Pharmacy", 200),
    ]
    group_column: list[str] = []
    category_column: list[str] = []
    for group_name, category_name, row_count in layout:
        group_column += [group_name] * row_count
        category_column += [category_name] * row_count
    total_rows = len(group_column)

    pois = pl.DataFrame(
        {
            "group": group_column,
            "category": category_column,
            "lat": [51.5] * total_rows,
            "lng": [-0.1] * total_rows,
        }
    )

    groups, display_names = _build_poi_category_groups(pois)

    expected_categories = {
        "Bus stop",
        "Café",
        "Rail station",
        "Restaurant",
        "Tesco",
    }
    assert set(display_names.values()) == expected_categories
    for excluded_key in ("poi_waitrose", "poi_school", "poi_pharmacy"):
        assert excluded_key not in groups

View file

@ -1128,12 +1128,18 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
if display_name is None:
raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
return display_name
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category name for a grocery POI.

    The fascia takes precedence when it is a non-empty string; otherwise the
    retailer's display name is used.

    Args:
        fascia: Store fascia name, or ``None``/empty when absent.
        retailer: Raw retailer name, or ``None`` when unknown.

    Returns:
        The entry for ``fascia`` in ``GROCERY_FASCIA_ICON_NAMES`` when a
        fascia is given, else ``normalize_grocery_retailer(retailer)``.

    Raises:
        ValueError: If a non-empty ``fascia`` has no entry in
            ``GROCERY_FASCIA_ICON_NAMES``, or if the retailer fallback raises
            for an unmapped retailer.
    """
    if fascia:
        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia)
        if icon_name is None:
            raise ValueError(f"Missing grocery fascia icon name for {fascia!r}")
        return icon_name
    return normalize_grocery_retailer(retailer)