Test changes
This commit is contained in:
parent
4c95815dc8
commit
be02fc16bb
41 changed files with 4224 additions and 759 deletions
|
|
@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
|
||||
_IOD_PERCENTILE_COLUMNS = [
|
||||
"Education, Skills and Training Score",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
]
|
||||
|
||||
|
||||
_AREA_COLUMNS = [
|
||||
"Postcode",
|
||||
|
|
@ -51,6 +60,14 @@ _AREA_COLUMNS = [
|
|||
"Number of parks within 1km",
|
||||
"Distance to nearest train or tube station (km)",
|
||||
"Distance to nearest park (km)",
|
||||
"Distance to nearest grocery store (km)",
|
||||
"Distance to nearest tube station (km)",
|
||||
"Distance to nearest rail station (km)",
|
||||
"Distance to nearest Waitrose (km)",
|
||||
"Distance to nearest Tesco (km)",
|
||||
"Distance to nearest cafe (km)",
|
||||
"Distance to nearest pub (km)",
|
||||
"Distance to nearest restaurant (km)",
|
||||
# Environment
|
||||
"Noise (dB)",
|
||||
"Max available download speed (Mbps)",
|
||||
|
|
@ -76,6 +93,34 @@ _AREA_COLUMNS = [
|
|||
]
|
||||
|
||||
|
||||
def _is_dynamic_poi_metric_column(column: str) -> bool:
|
||||
return (
|
||||
column.startswith("Distance to nearest ")
|
||||
and column.endswith(" POI (km)")
|
||||
) or (
|
||||
column.startswith("Number of ")
|
||||
and (column.endswith(" POIs within 2km") or column.endswith(" POIs within 5km"))
|
||||
)
|
||||
|
||||
|
||||
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Build an expression that rescales an IoD score to a less-deprived percentile.

    The returned expression overwrites *column* (same alias) with a value on a
    0-100 scale where a higher number means less deprived:

    * null scores stay null;
    * every row equal to the column minimum becomes exactly ``100.0``;
    * every row equal to the column maximum becomes exactly ``0.0``;
    * everything in between is placed by its average descending rank, scaled
      over the counted values and rounded to one decimal place.
    """
    score = pl.col(column)
    populated = score.count()
    rank_from_top = score.rank("average", descending=True)
    # Rank 1 is the highest score; the endpoint branches below pin the
    # minimum and maximum to exactly 100 and 0, so tied extremes do not
    # land on fractional values.
    scaled = ((rank_from_top - 1) / (populated - 1) * 100).round(1)

    return (
        pl.when(score.is_null())
        .then(None)
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        .when(populated > 1)  # guards the division by (populated - 1)
        .then(scaled)
        .otherwise(100.0)
        .alias(column)
    )
|
||||
|
||||
|
||||
def _build(
|
||||
epc_pp_path: Path,
|
||||
arcgis_path: Path,
|
||||
|
|
@ -134,20 +179,11 @@ def _build(
|
|||
)
|
||||
wide = wide.join(arcgis, on="postcode", how="left")
|
||||
|
||||
iod = pl.scan_parquet(iod_path)
|
||||
iod = pl.scan_parquet(iod_path).with_columns(
|
||||
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
|
||||
)
|
||||
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
|
||||
# Invert deprivation scores so that higher values = less deprived (better)
|
||||
iod_score_cols = [
|
||||
"Education, Skills and Training Score",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
]
|
||||
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
|
||||
|
||||
ethnicity = pl.scan_parquet(ethnicity_path)
|
||||
wide = wide.join(
|
||||
ethnicity,
|
||||
|
|
@ -351,6 +387,14 @@ def _build(
|
|||
"parks_1km": "Number of parks within 1km",
|
||||
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
|
||||
"parks_nearest_km": "Distance to nearest park (km)",
|
||||
"grocery_store_nearest_km": "Distance to nearest grocery store (km)",
|
||||
"tube_station_nearest_km": "Distance to nearest tube station (km)",
|
||||
"rail_station_nearest_km": "Distance to nearest rail station (km)",
|
||||
"waitrose_nearest_km": "Distance to nearest Waitrose (km)",
|
||||
"tesco_nearest_km": "Distance to nearest Tesco (km)",
|
||||
"cafe_nearest_km": "Distance to nearest cafe (km)",
|
||||
"pub_nearest_km": "Distance to nearest pub (km)",
|
||||
"restaurant_nearest_km": "Distance to nearest restaurant (km)",
|
||||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
|
|
@ -381,10 +425,14 @@ def _build(
|
|||
|
||||
# Split into postcode-level and property-level dataframes
|
||||
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
|
||||
area_cols.extend(
|
||||
c for c in df.columns if _is_dynamic_poi_metric_column(c) and c not in area_cols
|
||||
)
|
||||
area_col_set = set(area_cols)
|
||||
postcode_df = df.select(area_cols).group_by("Postcode").first()
|
||||
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
|
||||
|
||||
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
|
||||
property_cols = [c for c in df.columns if c not in area_col_set or c == "Postcode"]
|
||||
properties_df = df.select(property_cols)
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
"""Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
|
@ -15,9 +17,25 @@ POI_GROUPS_2KM = {
|
|||
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
|
||||
}
|
||||
|
||||
# Groups for which to compute distance to nearest POI (from filtered POIs)
|
||||
# Groups for which to compute distance to nearest POI (from filtered POIs).
|
||||
# Keep `train_tube` for the existing backend feature; the individual POI
|
||||
# distance filters below power the frontend dropdown.
|
||||
DISTANCE_GROUPS = {
|
||||
"train_tube": ["Tube station", "Rail station"],
|
||||
"grocery_store": [
|
||||
"Greengrocer",
|
||||
"Supermarket",
|
||||
"Convenience Store",
|
||||
"Waitrose",
|
||||
"Tesco",
|
||||
],
|
||||
"tube_station": ["Tube station"],
|
||||
"rail_station": ["Rail station"],
|
||||
"waitrose": ["Waitrose"],
|
||||
"tesco": ["Tesco"],
|
||||
"cafe": ["Café"],
|
||||
"pub": ["Pub"],
|
||||
"restaurant": ["Restaurant"],
|
||||
}
|
||||
|
||||
# OS Open Greenspace function types used for park counts and distance calculation.
|
||||
|
|
@ -27,6 +45,69 @@ GREENSPACE_PARK_FUNCTIONS = {
|
|||
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
|
||||
}
|
||||
|
||||
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
|
||||
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
|
||||
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
|
||||
|
||||
|
||||
def _poi_category_slug(category: str) -> str:
|
||||
ascii_text = (
|
||||
unicodedata.normalize("NFKD", category)
|
||||
.encode("ascii", "ignore")
|
||||
.decode("ascii")
|
||||
.lower()
|
||||
)
|
||||
slug = re.sub(r"[^a-z0-9]+", "_", ascii_text).strip("_")
|
||||
return slug or "poi"
|
||||
|
||||
|
||||
def _build_poi_category_groups(
    pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Build one proximity group for each POI category selected for filters.

    A category is selected when it appears under a group listed in
    ``DYNAMIC_FILTER_ALL_GROUPS``, or under a group listed in
    ``DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS`` with strictly more than
    ``GROCERY_DYNAMIC_FILTER_MIN_POIS`` rows for that (group, category) pair.

    Args:
        pois: POI rows with at least ``group`` and ``category`` columns.

    Returns:
        A pair ``(groups, display_names)`` keyed by the same generated group
        keys (``poi_<slug>``, with a numeric suffix when slugs clash).
        ``groups`` maps each key to a single-element list holding the
        category; ``display_names`` maps each key to the category label.

    Raises:
        ValueError: if ``pois`` has no ``group`` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    is_selected = pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS)) | (
        pl.col("group").is_in(list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS))
        & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)
    )
    categories = (
        pois.group_by("group", "category")
        .len()
        .filter(is_selected)
        .select("category")
        # A category listed under more than one selected group would
        # otherwise show up twice here and yield two keys sharing one display
        # name, i.e. duplicate metric column names once renamed.
        .unique()
        .sort("category")
        .to_series()
        .to_list()
    )

    groups: dict[str, list[str]] = {}
    display_names: dict[str, str] = {}

    for category in categories:
        # Skip null or empty category labels.
        if not isinstance(category, str) or not category:
            continue
        base_slug = f"poi_{_poi_category_slug(category)}"
        # Probe for a key that is genuinely unused. Counting clashes per base
        # slug is not enough: "poi_cafe_2" generated for a second "cafe" slug
        # could overwrite a category whose own slug is "cafe_2".
        group_key = base_slug
        suffix = 2
        while group_key in groups:
            group_key = f"{base_slug}_{suffix}"
            suffix += 1
        groups[group_key] = [category]
        display_names[group_key] = category

    return groups, display_names
|
||||
|
||||
|
||||
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
|
||||
renames: dict[str, str] = {}
|
||||
for group_key, category in display_names.items():
|
||||
renames[f"{group_key}_nearest_km"] = f"Distance to nearest {category} POI (km)"
|
||||
renames[f"{group_key}_2km"] = f"Number of {category} POIs within 2km"
|
||||
renames[f"{group_key}_5km"] = f"Number of {category} POIs within 5km"
|
||||
return renames
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
|
|
@ -56,12 +137,35 @@ def main():
|
|||
)
|
||||
|
||||
pois = pl.read_parquet(args.pois)
|
||||
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
|
||||
|
||||
# Count amenity POIs within 2km
|
||||
counts_2km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
|
||||
)
|
||||
|
||||
# Dynamic POI filters: nearest distance plus counts within 2km and 5km for
|
||||
# the selected public transport, grocery, and leisure categories.
|
||||
dynamic_counts_2km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=poi_category_groups, radius_km=2
|
||||
)
|
||||
dynamic_counts_5km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=poi_category_groups, radius_km=5
|
||||
)
|
||||
dynamic_distances = min_distance_per_postcode(
|
||||
postcodes, pois, groups=poi_category_groups
|
||||
)
|
||||
dynamic_renames = _dynamic_poi_metric_renames(poi_display_names)
|
||||
dynamic_counts_2km = dynamic_counts_2km.rename(
|
||||
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns}
|
||||
)
|
||||
dynamic_counts_5km = dynamic_counts_5km.rename(
|
||||
{k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns}
|
||||
)
|
||||
dynamic_distances = dynamic_distances.rename(
|
||||
{k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
|
||||
)
|
||||
|
||||
# Distance to nearest train/tube station (from filtered POIs)
|
||||
distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)
|
||||
|
||||
|
|
@ -77,6 +181,9 @@ def main():
|
|||
# Join all results on postcode
|
||||
result = (
|
||||
counts_2km.join(distances, on="postcode")
|
||||
.join(dynamic_counts_2km, on="postcode")
|
||||
.join(dynamic_counts_5km, on="postcode")
|
||||
.join(dynamic_distances, on="postcode")
|
||||
.join(park_counts_1km, on="postcode")
|
||||
.join(park_distances, on="postcode")
|
||||
)
|
||||
|
|
|
|||
33
pipeline/transform/test_merge.py
Normal file
33
pipeline/transform/test_merge.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.merge import (
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
)
|
||||
|
||||
|
||||
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score maps to 100, the highest to 0, and nulls stay null."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentiles = (
        frame.lazy().with_columns(_less_deprived_percentile_expr(column)).collect()
    )

    assert percentiles[column].to_list() == [100.0, 50.0, 0.0, None]
|
||||
|
||||
|
||||
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum and maximum scores land exactly on 100 and 0."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentiles = (
        frame.lazy().with_columns(_less_deprived_percentile_expr(column)).collect()
    )

    assert percentiles[column].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]
|
||||
|
||||
|
||||
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Generated POI metric names are recognised; fixed amenity names are not."""
    generated_names = (
        "Distance to nearest Cafe POI (km)",
        "Number of Cafe POIs within 2km",
        "Number of Cafe POIs within 5km",
    )
    for name in generated_names:
        assert _is_dynamic_poi_metric_column(name)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
|
||||
41
pipeline/transform/test_poi_proximity.py
Normal file
41
pipeline/transform/test_poi_proximity.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.poi_proximity import _build_poi_category_groups
|
||||
|
||||
|
||||
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
    """Only selected groups, and groceries above the threshold, become filters."""
    # (group, category, number of POI rows)
    fixture = [
        ("Public Transport", "Rail station", 1),
        ("Public Transport", "Bus stop", 1),
        ("Leisure", "Café", 1),
        ("Leisure", "Restaurant", 1),
        ("Groceries", "Tesco", 101),  # strictly above the threshold: kept
        ("Groceries", "Waitrose", 100),  # exactly at the threshold: dropped
        ("Education", "School", 200),
        ("Health", "Pharmacy", 200),
    ]
    group_column: list[str] = []
    category_column: list[str] = []
    for group, category, count in fixture:
        group_column += [group] * count
        category_column += [category] * count
    row_count = len(group_column)

    pois = pl.DataFrame(
        {
            "group": group_column,
            "category": category_column,
            "lat": [51.5] * row_count,
            "lng": [-0.1] * row_count,
        }
    )

    groups, display_names = _build_poi_category_groups(pois)

    assert set(display_names.values()) == {
        "Bus stop",
        "Café",
        "Rail station",
        "Restaurant",
        "Tesco",
    }
    for excluded_key in ("poi_waitrose", "poi_school", "poi_pharmacy"):
        assert excluded_key not in groups
|
||||
|
|
@ -1128,12 +1128,18 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
|
|||
def normalize_grocery_retailer(retailer: str | None) -> str:
|
||||
if retailer is None:
|
||||
return ""
|
||||
return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
|
||||
display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
|
||||
if display_name is None:
|
||||
raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
|
||||
return display_name
|
||||
|
||||
|
||||
def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
    """Return the icon category name for a grocery POI.

    Args:
        fascia: Store fascia; when truthy it takes precedence over *retailer*.
        retailer: Raw retailer name, used only when *fascia* is empty or
            ``None``.

    Returns:
        The ``GROCERY_FASCIA_ICON_NAMES`` entry for *fascia* when one is
        given, otherwise the result of ``normalize_grocery_retailer(retailer)``.

    Raises:
        ValueError: if *fascia* is given but has no entry in
            ``GROCERY_FASCIA_ICON_NAMES``. There is no fallback to the
            retailer display names for an unmapped fascia.
    """
    if fascia:
        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia)
        if icon_name is None:
            raise ValueError(f"Missing grocery fascia icon name for {fascia!r}")
        return icon_name
    return normalize_grocery_retailer(retailer)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue