Test changes

2026-05-09 11:35:38 +01:00 · 2026-05-09 11:35:38 +01:00 · be02fc16bb
commit be02fc16bb
parent 4c95815dc8
41 changed files with 4224 additions and 759 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10

+# IoD score columns that _build overwrites in place with a 0-100
+# "less deprived" percentile (via _less_deprived_percentile_expr) while
+# scanning the IoD parquet, before it is joined onto the wide frame.
+_IOD_PERCENTILE_COLUMNS = [
+    "Education, Skills and Training Score",
+    "Income Score (rate)",
+    "Employment Score (rate)",
+    "Health Deprivation and Disability Score",
+    "Indoors Sub-domain Score",
+    "Outdoors Sub-domain Score",
+]
+

 _AREA_COLUMNS = [
    "Postcode",
@ -51,6 +60,14 @@ _AREA_COLUMNS = [
    "Number of parks within 1km",
    "Distance to nearest train or tube station (km)",
    "Distance to nearest park (km)",
+    "Distance to nearest grocery store (km)",
+    "Distance to nearest tube station (km)",
+    "Distance to nearest rail station (km)",
+    "Distance to nearest Waitrose (km)",
+    "Distance to nearest Tesco (km)",
+    "Distance to nearest cafe (km)",
+    "Distance to nearest pub (km)",
+    "Distance to nearest restaurant (km)",
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
@ -76,6 +93,34 @@ _AREA_COLUMNS = [
 ]


+def _is_dynamic_poi_metric_column(column: str) -> bool:
+    """Return True when `column` names a per-category POI metric.
+
+    Two column shapes are recognised:
+
+    * "Distance to nearest <category> POI (km)"
+    * "Number of <category> POIs within 2km" (or "... within 5km")
+    """
+    if column.startswith("Distance to nearest "):
+        if column.endswith(" POI (km)"):
+            return True
+    # The two prefixes are mutually exclusive, so falling through is safe.
+    if column.startswith("Number of "):
+        return column.endswith((" POIs within 2km", " POIs within 5km"))
+    return False
+
+
+def _less_deprived_percentile_expr(column: str) -> pl.Expr:
+    """Build an expression rescaling an IoD score to a 0-100 percentile.
+
+    The result is aliased back onto `column`. Scores are ranked in
+    descending order, so the largest score maps to 0 and the smallest to
+    100 ("less deprived" is higher). Null scores stay null, and tied scores
+    share their average rank.
+    """
+    score = pl.col(column)
+    populated = score.count()
+    rank_from_top = score.rank("average", descending=True)
+    # Position of the rank within [1, populated], stretched onto 0-100.
+    scaled = ((rank_from_top - 1) / (populated - 1) * 100).round(1)
+    return (
+        pl.when(score.is_null())
+        .then(None)
+        # Pin both extremes: averaged ranks for ties at either end would
+        # otherwise miss the exact scale endpoints. The minimum is checked
+        # first, so a column with one distinct value resolves to 100.
+        .when(score == score.min())
+        .then(100.0)
+        .when(score == score.max())
+        .then(0.0)
+        # Guard the division above against a single populated row.
+        .when(populated > 1)
+        .then(scaled)
+        .otherwise(100.0)
+        .alias(column)
+    )
+
+
 def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
@ -134,20 +179,11 @@ def _build(
    )
    wide = wide.join(arcgis, on="postcode", how="left")

-    iod = pl.scan_parquet(iod_path)
+    iod = pl.scan_parquet(iod_path).with_columns(
+        *(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
+    )
    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

-    # Invert deprivation scores so that higher values = less deprived (better)
-    iod_score_cols = [
-        "Education, Skills and Training Score",
-        "Income Score (rate)",
-        "Employment Score (rate)",
-        "Health Deprivation and Disability Score",
-        "Indoors Sub-domain Score",
-        "Outdoors Sub-domain Score",
-    ]
-    wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
-
    ethnicity = pl.scan_parquet(ethnicity_path)
    wide = wide.join(
        ethnicity,
@ -351,6 +387,14 @@ def _build(
                "parks_1km": "Number of parks within 1km",
                "train_tube_nearest_km": "Distance to nearest train or tube station (km)",
                "parks_nearest_km": "Distance to nearest park (km)",
+                "grocery_store_nearest_km": "Distance to nearest grocery store (km)",
+                "tube_station_nearest_km": "Distance to nearest tube station (km)",
+                "rail_station_nearest_km": "Distance to nearest rail station (km)",
+                "waitrose_nearest_km": "Distance to nearest Waitrose (km)",
+                "tesco_nearest_km": "Distance to nearest Tesco (km)",
+                "cafe_nearest_km": "Distance to nearest cafe (km)",
+                "pub_nearest_km": "Distance to nearest pub (km)",
+                "restaurant_nearest_km": "Distance to nearest restaurant (km)",
                "latest_price": "Last known price",
                "number_habitable_rooms": "Number of bedrooms & living rooms",
                "noise_lden_db": "Noise (dB)",
@ -381,10 +425,14 @@ def _build(

    # Split into postcode-level and property-level dataframes
    area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
+    area_cols.extend(
+        c for c in df.columns if _is_dynamic_poi_metric_column(c) and c not in area_cols
+    )
+    area_col_set = set(area_cols)
    postcode_df = df.select(area_cols).group_by("Postcode").first()
    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")

-    property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
+    property_cols = [c for c in df.columns if c not in area_col_set or c == "Postcode"]
    properties_df = df.select(property_cols)
    print(f"Property rows: {properties_df.height}")

--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@ -1,6 +1,8 @@
 """Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""

 import argparse
+import re
+import unicodedata
 from pathlib import Path

 import polars as pl
@ -15,9 +17,25 @@ POI_GROUPS_2KM = {
    "groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
 }

-# Groups for which to compute distance to nearest POI (from filtered POIs)
+# Groups for which to compute distance to nearest POI (from filtered POIs).
+# Keep `train_tube` for the existing backend feature; the individual POI
+# distance filters below power the frontend dropdown.
 DISTANCE_GROUPS = {
     "train_tube": ["Tube station", "Rail station"],
+    # Aggregate group: the generic grocery categories plus the two retailer
+    # categories that also get their own single-category entries below.
+    "grocery_store": [
+        "Greengrocer",
+        "Supermarket",
+        "Convenience Store",
+        "Waitrose",
+        "Tesco",
+    ],
+    "tube_station": ["Tube station"],
+    "rail_station": ["Rail station"],
+    "waitrose": ["Waitrose"],
+    "tesco": ["Tesco"],
+    # Group key is plain ASCII; the category label keeps its accent.
+    "cafe": ["Café"],
+    "pub": ["Pub"],
+    "restaurant": ["Restaurant"],
 }

 # OS Open Greenspace function types used for park counts and distance calculation.
@ -27,6 +45,69 @@ GREENSPACE_PARK_FUNCTIONS = {
    "parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
 }

+# Exclusive threshold: a category in a count-threshold group is selected only
+# when it has strictly more POIs than this (the filter compares with `>`).
+GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
+# POI groups whose categories are all selected, regardless of POI count.
+DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
+# POI groups whose categories are selected only above the count threshold.
+DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
+
+
+def _poi_category_slug(category: str) -> str:
+    """Turn a POI category label into a lowercase ASCII identifier.
+
+    The label is NFKD-decomposed and non-ASCII code points are dropped, so
+    accented letters keep their base letter. Each run of characters outside
+    ``a-z0-9`` becomes one underscore, and leading/trailing underscores are
+    trimmed. Returns "poi" when nothing usable remains.
+    """
+    decomposed = unicodedata.normalize("NFKD", category)
+    ascii_lower = decomposed.encode("ascii", "ignore").decode("ascii").lower()
+    collapsed = re.sub(r"[^a-z0-9]+", "_", ascii_lower)
+    trimmed = collapsed.strip("_")
+    if not trimmed:
+        return "poi"
+    return trimmed
+
+
+def _build_poi_category_groups(
+    pois: pl.DataFrame,
+) -> tuple[dict[str, list[str]], dict[str, str]]:
+    """Build one proximity group for each POI category selected for filters.
+
+    A category is selected when its POI group is in
+    ``DYNAMIC_FILTER_ALL_GROUPS``, or when its POI group is in
+    ``DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS`` and it has strictly more than
+    ``GROCERY_DYNAMIC_FILTER_MIN_POIS`` rows in that group.
+
+    Args:
+        pois: POI rows with at least "group" and "category" columns.
+
+    Returns:
+        ``(groups, display_names)``. ``groups`` maps a unique
+        ``poi_<slug>`` key to a single-element list holding the category;
+        ``display_names`` maps the same key to the category label.
+
+    Raises:
+        ValueError: If `pois` has no "group" column.
+    """
+    if "group" not in pois.columns:
+        raise ValueError("POI dataframe must include a 'group' column")
+
+    categories = (
+        pois.group_by("group", "category")
+        .len()
+        .filter(
+            pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS))
+            | (
+                pl.col("group").is_in(list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS))
+                & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)
+            )
+        )
+        .select("category")
+        .sort("category")
+        .to_series()
+        .to_list()
+    )
+    groups: dict[str, list[str]] = {}
+    display_names: dict[str, str] = {}
+    seen_categories: set[str] = set()
+
+    for category in categories:
+        if not isinstance(category, str) or not category:
+            continue
+        # The aggregation yields one row per (group, category) pair, so a
+        # category listed under several selected groups shows up repeatedly.
+        # Emit it once: two keys sharing a display name would produce
+        # duplicate metric column names after renaming.
+        if category in seen_categories:
+            continue
+        seen_categories.add(category)
+
+        base_slug = f"poi_{_poi_category_slug(category)}"
+        group_key = base_slug
+        suffix = 2
+        # Distinct labels can share a slug (e.g. "Café" and "Cafe"), and a
+        # suffixed key can equal another label's own slug ("Cafe 2"), so probe
+        # until the key is unused instead of overwriting an earlier entry.
+        while group_key in groups:
+            group_key = f"{base_slug}_{suffix}"
+            suffix += 1
+        groups[group_key] = [category]
+        display_names[group_key] = category
+
+    return groups, display_names
+
+
+def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
+    """Map raw per-group metric columns to their display column names.
+
+    Each group key contributes three entries, in this order: nearest
+    distance, count within 2km, count within 5km.
+    """
+    labelled: dict[str, str] = {}
+    for key, label in display_names.items():
+        labelled.update(
+            {
+                f"{key}_nearest_km": f"Distance to nearest {label} POI (km)",
+                f"{key}_2km": f"Number of {label} POIs within 2km",
+                f"{key}_5km": f"Number of {label} POIs within 5km",
+            }
+        )
+    return labelled
+

 def main():
    parser = argparse.ArgumentParser(
@ -56,12 +137,35 @@ def main():
    )

    pois = pl.read_parquet(args.pois)
+    poi_category_groups, poi_display_names = _build_poi_category_groups(pois)

    # Count amenity POIs within 2km
    counts_2km = count_pois_per_postcode(
        postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
    )

+    # Dynamic POI filters: nearest distance plus counts within 2km and 5km for
+    # the selected public transport, grocery, and leisure categories.
+    dynamic_counts_2km = count_pois_per_postcode(
+        postcodes, pois, groups=poi_category_groups, radius_km=2
+    )
+    dynamic_counts_5km = count_pois_per_postcode(
+        postcodes, pois, groups=poi_category_groups, radius_km=5
+    )
+    dynamic_distances = min_distance_per_postcode(
+        postcodes, pois, groups=poi_category_groups
+    )
+    dynamic_renames = _dynamic_poi_metric_renames(poi_display_names)
+    dynamic_counts_2km = dynamic_counts_2km.rename(
+        {k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns}
+    )
+    dynamic_counts_5km = dynamic_counts_5km.rename(
+        {k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns}
+    )
+    dynamic_distances = dynamic_distances.rename(
+        {k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
+    )
+
    # Distance to nearest train/tube station (from filtered POIs)
    distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)

@ -77,6 +181,9 @@ def main():
    # Join all results on postcode
    result = (
        counts_2km.join(distances, on="postcode")
+        .join(dynamic_counts_2km, on="postcode")
+        .join(dynamic_counts_5km, on="postcode")
+        .join(dynamic_distances, on="postcode")
        .join(park_counts_1km, on="postcode")
        .join(park_distances, on="postcode")
    )
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -0,0 +1,33 @@
+import polars as pl
+
+from pipeline.transform.merge import (
+    _is_dynamic_poi_metric_column,
+    _less_deprived_percentile_expr,
+)
+
+
+def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
+    """The lowest score maps to 100, the highest to 0, and nulls stay null."""
+    column = "Income Score (rate)"
+    frame = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})
+
+    converted = (
+        frame.lazy().with_columns(_less_deprived_percentile_expr(column)).collect()
+    )
+
+    percentiles = converted[column].to_list()
+    assert percentiles == [100.0, 50.0, 0.0, None]
+
+
+def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
+    """Ties at the minimum and maximum still land exactly on 100 and 0."""
+    column = "Income Score (rate)"
+    frame = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})
+
+    converted = (
+        frame.lazy().with_columns(_less_deprived_percentile_expr(column)).collect()
+    )
+
+    percentiles = converted[column].to_list()
+    assert percentiles == [100.0, 100.0, 50.0, 0.0, 0.0]
+
+
+def test_dynamic_poi_metric_columns_are_area_level() -> None:
+    """Per-category POI metric names are recognised; fixed metrics are not."""
+    dynamic_columns = (
+        "Distance to nearest Cafe POI (km)",
+        "Number of Cafe POIs within 2km",
+        "Number of Cafe POIs within 5km",
+    )
+    for name in dynamic_columns:
+        assert _is_dynamic_poi_metric_column(name)
+    # A fixed amenity count lacks the "POIs" marker and must not match.
+    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
--- a/pipeline/transform/test_poi_proximity.py
+++ b/pipeline/transform/test_poi_proximity.py
@ -0,0 +1,41 @@
+import polars as pl
+
+from pipeline.transform.poi_proximity import _build_poi_category_groups
+
+
+def test_dynamic_poi_groups_include_requested_categories_only() -> None:
+    """Only all-in groups and above-threshold grocery categories are kept."""
+    # (group, category, number of POI rows). Waitrose sits exactly on the
+    # grocery threshold of 100 and must therefore be excluded.
+    fixture_rows = [
+        ("Public Transport", "Rail station", 1),
+        ("Public Transport", "Bus stop", 1),
+        ("Leisure", "Café", 1),
+        ("Leisure", "Restaurant", 1),
+        ("Groceries", "Tesco", 101),
+        ("Groceries", "Waitrose", 100),
+        ("Education", "School", 200),
+        ("Health", "Pharmacy", 200),
+    ]
+    group_values: list[str] = []
+    category_values: list[str] = []
+    for group, category, repeats in fixture_rows:
+        group_values.extend([group] * repeats)
+        category_values.extend([category] * repeats)
+    row_count = len(group_values)
+    pois = pl.DataFrame(
+        {
+            "group": group_values,
+            "category": category_values,
+            "lat": [51.5] * row_count,
+            "lng": [-0.1] * row_count,
+        }
+    )
+
+    groups, display_names = _build_poi_category_groups(pois)
+
+    expected_categories = {"Bus stop", "Café", "Rail station", "Restaurant", "Tesco"}
+    assert set(display_names.values()) == expected_categories
+    for excluded_key in ("poi_waitrose", "poi_school", "poi_pharmacy"):
+        assert excluded_key not in groups
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -1128,12 +1128,18 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
 def normalize_grocery_retailer(retailer: str | None) -> str:
+    """Return the display name for a grocery retailer.
+
+    ``None`` maps to an empty string. Any other value is looked up in
+    ``GROCERY_RETAILER_DISPLAY_NAMES``; a retailer without an entry raises
+    ``ValueError`` rather than being passed through unchanged.
+    """
     if retailer is None:
         return ""
-    return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
+    display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
+    if display_name is None:
+        raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
+    return display_name


 def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
+    """Return the icon category name for a grocery POI.
+
+    A truthy `fascia` is looked up in ``GROCERY_FASCIA_ICON_NAMES`` and
+    raises ``ValueError`` when it has no entry. When `fascia` is ``None`` or
+    empty, the result is ``normalize_grocery_retailer(retailer)``.
+    """
     if fascia:
-        return GROCERY_FASCIA_ICON_NAMES.get(fascia, normalize_grocery_retailer(fascia))
+        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia)
+        if icon_name is None:
+            raise ValueError(f"Missing grocery fascia icon name for {fascia!r}")
+        return icon_name
     return normalize_grocery_retailer(retailer)