This commit is contained in:
Andras Schmelczer 2026-05-13 08:00:12 +01:00
parent 63713c3a2b
commit bd6b511f16
17 changed files with 544 additions and 377 deletions

View file

@ -1,4 +1,5 @@
import argparse
import re
import polars as pl
from pathlib import Path
@ -57,9 +58,6 @@ _AREA_COLUMNS = [
# Amenities
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
"Number of parks within 1km",
"Distance to nearest train or tube station (km)",
"Distance to nearest park (km)",
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
@ -85,6 +83,17 @@ _AREA_COLUMNS = [
]
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
TREE_DENSITY_FEATURE = "Street tree density percentile"
def _is_dynamic_poi_metric_column(column: str) -> bool:
return bool(
_DYNAMIC_POI_DISTANCE_RE.match(column) or _DYNAMIC_POI_COUNT_RE.match(column)
)
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
"""Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
non_null_count = pl.col(column).count()
@ -117,6 +126,7 @@ def _build(
lsoa_population_path: Path,
median_age_path: Path,
election_results_path: Path,
tree_density_addresses_path: Path | None = None,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
@ -250,6 +260,18 @@ def _build(
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
if tree_density_addresses_path is not None:
tree_density = (
pl.scan_parquet(tree_density_addresses_path)
.select(
pl.col("postcode"),
pl.col("pp_address"),
pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
)
.unique(["postcode", "pp_address"])
)
wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
@ -366,9 +388,6 @@ def _build(
"property_type": "Property type",
"restaurants_2km": "Number of restaurants within 2km",
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
"parks_1km": "Number of parks within 1km",
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
"parks_nearest_km": "Distance to nearest park (km)",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
@ -398,11 +417,18 @@ def _build(
df = wide.collect(engine="streaming")
# Split into postcode-level and property-level dataframes
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
area_cols = [
c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
]
postcode_df = df.select(area_cols).group_by("Postcode").first()
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
property_cols = [
c
for c in df.columns
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
or c == "Postcode"
]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")
@ -481,6 +507,12 @@ def main():
required=True,
help="2024 General Election results by constituency parquet file",
)
parser.add_argument(
"--tree-density-addresses",
type=Path,
required=False,
help="Address-level tree density parquet from pipeline.transform.tree_density",
)
parser.add_argument(
"--output-postcodes",
type=Path,
@ -509,6 +541,7 @@ def main():
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age,
election_results_path=args.election_results,
tree_density_addresses_path=args.tree_density_addresses,
)
print(f"\nPostcode columns: {postcode_df.columns}")

View file

@ -17,27 +17,6 @@ POI_GROUPS_2KM = {
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
}
# Groups for which to compute distance to nearest POI (from filtered POIs).
# Keep `train_tube` for the existing backend feature; the individual POI
# distance filters below power the frontend dropdown.
DISTANCE_GROUPS = {
"train_tube": ["Tube station", "Rail station"],
"grocery_store": [
"Greengrocer",
"Supermarket",
"Convenience Store",
"Waitrose",
"Tesco",
],
"tube_station": ["Tube station"],
"rail_station": ["Rail station"],
"waitrose": ["Waitrose"],
"tesco": ["Tesco"],
"cafe": ["Café"],
"pub": ["Pub"],
"restaurant": ["Restaurant"],
}
# OS Open Greenspace function types used for park counts and distance calculation.
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
# of green spaces that are only mapped as polygons in OSM.
@ -48,6 +27,7 @@ GREENSPACE_PARK_FUNCTIONS = {
# Threshold compared against the per-category "len" count when selecting
# dynamic filter categories; a category must have strictly more POIs than this.
# NOTE(review): presumably applied only to grocery categories — confirm
# against the full filter expression in _build_poi_category_groups.
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
# NOTE(review): presumably the POI groups whose categories all become dynamic
# filters regardless of count — confirm against _build_poi_category_groups.
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
# NOTE(review): presumably the POI groups whose categories are subject to the
# count threshold above — confirm against _build_poi_category_groups.
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
# Categories removed from the dynamic category list even if they otherwise
# qualify.
DYNAMIC_FILTER_EXCLUDED_CATEGORIES = {"Park"}
def _poi_category_slug(category: str) -> str:
@ -78,6 +58,7 @@ def _build_poi_category_groups(
& (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)
)
)
.filter(~pl.col("category").is_in(list(DYNAMIC_FILTER_EXCLUDED_CATEGORIES)))
.select("category")
.sort("category")
.to_series()
@ -103,9 +84,11 @@ def _build_poi_category_groups(
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
renames: dict[str, str] = {}
for group_key, category in display_names.items():
renames[f"{group_key}_nearest_km"] = f"Distance to nearest {category} POI (km)"
renames[f"{group_key}_2km"] = f"Number of {category} POIs within 2km"
renames[f"{group_key}_5km"] = f"Number of {category} POIs within 5km"
renames[f"{group_key}_nearest_km"] = (
f"Distance to nearest amenity ({category}) (km)"
)
renames[f"{group_key}_2km"] = f"Number of amenities ({category}) within 2km"
renames[f"{group_key}_5km"] = f"Number of amenities ({category}) within 5km"
return renames
@ -139,12 +122,12 @@ def main():
pois = pl.read_parquet(args.pois)
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
# Count amenity POIs within 2km
# Count static amenity groups within 2km.
counts_2km = count_pois_per_postcode(
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
)
# Dynamic POI filters: nearest distance plus counts within 2km and 5km for
# Dynamic amenity filters: nearest distance plus counts within 2km and 5km for
# the selected public transport, grocery, and leisure categories.
dynamic_counts_2km = count_pois_per_postcode(
postcodes, pois, groups=poi_category_groups, radius_km=2
@ -166,25 +149,37 @@ def main():
{k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
)
# Distance to nearest train/tube station (from filtered POIs)
distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)
# Park counts and distances from OS Open Greenspace
# Park counts and distances from OS Open Greenspace. They use the dynamic
# amenity metric names so filters read through the same side-table path as
# OSM-derived amenity metrics.
greenspace = pl.read_parquet(args.greenspace)
park_counts_1km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=1
park_counts_2km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
)
park_counts_5km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
)
park_distances = min_distance_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
)
park_renames = _dynamic_poi_metric_renames({"parks": "Park"})
park_counts_2km = park_counts_2km.rename(
{k: v for k, v in park_renames.items() if k in park_counts_2km.columns}
)
park_counts_5km = park_counts_5km.rename(
{k: v for k, v in park_renames.items() if k in park_counts_5km.columns}
)
park_distances = park_distances.rename(
{k: v for k, v in park_renames.items() if k in park_distances.columns}
)
# Join all results on postcode
result = (
counts_2km.join(distances, on="postcode")
.join(dynamic_counts_2km, on="postcode")
counts_2km.join(dynamic_counts_2km, on="postcode")
.join(dynamic_counts_5km, on="postcode")
.join(dynamic_distances, on="postcode")
.join(park_counts_1km, on="postcode")
.join(park_counts_2km, on="postcode")
.join(park_counts_5km, on="postcode")
.join(park_distances, on="postcode")
)